Skip to content

Commit

Permalink
cuda: check for libcuda instead of /dev/nvidiactl
Browse files Browse the repository at this point in the history
The check for `/dev/nvidiactl` to determine if the CUDA plugin can be used
is unreliable. This patch changes the logic to check if the `libcuda.so.1`
shared object can be loaded and a GPU device is available, which is a more
accurate indicator.

The subsequent check for the `--action` option then confirms whether the
NVIDIA driver is installed and supports checkpoint/restore.

Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
  • Loading branch information
rst0git committed Nov 4, 2024
1 parent dcc3b49 commit ac75276
Showing 1 changed file with 40 additions and 4 deletions.
44 changes: 40 additions & 4 deletions plugins/cuda/cuda_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <dlfcn.h>

/* cuda-checkpoint binary should live in your PATH */
#define CUDA_CHECKPOINT "cuda-checkpoint"
Expand Down Expand Up @@ -470,6 +471,34 @@ int cuda_plugin_resume_devices_late(int pid)
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)

/**
 * Probe whether the libcuda shared library can be loaded.
 *
 * The cuda-checkpoint tool requires the libcuda library, so the CUDA
 * plugin is disabled when this library is not installed.
 *
 * Returns true if dlopen() succeeds for "libcuda.so.1", false otherwise.
 * The handle is closed again immediately: this is only a presence check,
 * no symbols are resolved from the library.
 */
static bool is_libcuda_installed(void)
{
	void *handle = dlopen("libcuda.so.1", RTLD_LAZY);

	if (!handle)
		return false;

	dlclose(handle);
	return true;
}

/**
 * Check whether /proc/driver/nvidia/gpus/ contains at least one
 * sub-directory.
 *
 * Returns true as soon as an entry is found that is a directory and
 * whose name does not start with '.'; returns false if the directory
 * cannot be opened or holds no such entry.
 */
static bool has_gpu(void)
{
	struct dirent *entry;
	bool found = false;
	DIR *dir;

	dir = opendir("/proc/driver/nvidia/gpus/");
	if (!dir)
		return false;

	while ((entry = readdir(dir)) != NULL) {
		/*
		 * Skip ".", ".." and any other dot-prefixed name; only
		 * entries reported as directories count.
		 * NOTE(review): relies on the filesystem filling in d_type.
		 */
		if (entry->d_type == DT_DIR && entry->d_name[0] != '.') {
			found = true;
			break;
		}
	}

	closedir(dir);
	return found;
}

int cuda_plugin_init(int stage)
{
int ret;
Expand All @@ -481,10 +510,17 @@ int cuda_plugin_init(int stage)
}
}

if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
plugin_disabled = true;
return 0;
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE)) {
if (!is_libcuda_installed()) {
pr_info("libcuda.so.1 is not installed; CUDA plugin is disabled\n");
plugin_disabled = true;
return 0;
}
if (!has_gpu()) {
pr_info("No GPU device found; CUDA plugin is disabled\n");
plugin_disabled = true;
return 0;
}
}

ret = cuda_checkpoint_supports_flag("--action");
Expand Down

0 comments on commit ac75276

Please sign in to comment.