cuda: check for libcuda instead of /dev/nvidiactl

The check for `/dev/nvidiactl` is an unreliable way to determine whether the CUDA plugin can be used. This patch changes the logic to check whether the `libcuda.so.1` shared object can be loaded and whether a GPU device is available, which is a more accurate indicator. The subsequent check for the `--action` option then confirms whether the NVIDIA driver is installed and supports checkpoint/restore. Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
checkpoint-restore · Nov 4, 2024 · ac75276 · ac75276
1 parent dcc3b49
commit ac75276
Showing 1 changed file with 40 additions and 4 deletions.
diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c
@@ -16,6 +16,7 @@
 #include <unistd.h>
 #include <sys/ptrace.h>
 #include <sys/wait.h>
+#include <dlfcn.h>
 
 /* cuda-checkpoint binary should live in your PATH */
 #define CUDA_CHECKPOINT "cuda-checkpoint"
@@ -470,6 +471,34 @@ int cuda_plugin_resume_devices_late(int pid)
 }
 CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)
 
+/**
+ * Probe for the libcuda library that the cuda-checkpoint tool requires.
+ *
+ * Tries to dlopen() "libcuda.so.1" and releases the handle again right
+ * away; no symbol from the library is looked up here.
+ *
+ * Returns true if the library could be loaded, false otherwise.
+ */
+static bool is_libcuda_installed(void)
+{
+	void *lib = dlopen("libcuda.so.1", RTLD_LAZY);
+
+	if (lib) {
+		dlclose(lib);
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * Check whether at least one GPU is listed under the NVIDIA driver's
+ * procfs directory.
+ *
+ * Scans /proc/driver/nvidia/gpus/ for a directory entry whose name does
+ * not start with '.' (which skips "." and ".." as well as hidden entries).
+ *
+ * Returns true as soon as such an entry is found; false if the directory
+ * cannot be opened or holds no matching entry.
+ */
+static bool has_gpu(void)
+{
+	struct dirent *entry;
+	bool found = false;
+	DIR *dir;
+
+	dir = opendir("/proc/driver/nvidia/gpus/");
+	if (dir == NULL)
+		return false;
+
+	while ((entry = readdir(dir)) != NULL) {
+		/*
+		 * NOTE(review): this relies on d_type being filled in; an
+		 * entry reported as DT_UNKNOWN is not counted as a GPU.
+		 */
+		if (entry->d_type == DT_DIR && entry->d_name[0] != '.') {
+			found = true;
+			break;
+		}
+	}
+
+	/* Single exit path so the directory stream is always released. */
+	closedir(dir);
+	return found;
+}
+
 int cuda_plugin_init(int stage)
 {
 	int ret;
@@ -481,10 +510,17 @@ int cuda_plugin_init(int stage)
 		}
 	}
 
-	if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
-		pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
-		plugin_disabled = true;
-		return 0;
+	if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE)) {
+		if (!is_libcuda_installed()) {
+			pr_info("libcuda.so.1 is not installed; CUDA plugin is disabled\n");
+			plugin_disabled = true;
+			return 0;
+		}
+		if (!has_gpu()) {
+			pr_info("No GPU device found; CUDA plugin is disabled\n");
+			plugin_disabled = true;
+			return 0;
+		}
 	}
 
 	ret = cuda_checkpoint_supports_flag("--action");