Skip to content

Commit

Permalink
cuda: check for gpu instead of /dev/nvidiactl
Browse files Browse the repository at this point in the history
The check for `/dev/nvidiactl` to determine if the CUDA plugin can be
used is unreliable because in some cases the default path for driver
installation is different [1]. This patch changes the logic to check
if a GPU device is available in `/proc/driver/nvidia/gpus/`. This is
a more accurate indicator, and the subsequent check for the `--action`
option confirms whether the NVIDIA driver supports checkpoint/restore.

[1] https://github.com/NVIDIA/gpu-operator

Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
  • Loading branch information
rst0git committed Nov 4, 2024
1 parent dcc3b49 commit 9a50892
Showing 1 changed file with 22 additions and 4 deletions.
26 changes: 22 additions & 4 deletions plugins/cuda/cuda_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,22 @@ int cuda_plugin_resume_devices_late(int pid)
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)

/*
 * Report whether a GPU device entry exists under /proc/driver/nvidia/gpus/.
 *
 * Returns true as soon as a directory entry whose name does not start with
 * '.' is found (this skips "." and ".." as well as hidden entries).
 * Returns false when the directory cannot be opened or holds no such entry.
 */
static bool has_gpu(void)
{
	DIR *dir = opendir("/proc/driver/nvidia/gpus/");
	struct dirent *entry;
	bool found = false;

	/* A missing or unreadable directory is treated as "no GPU". */
	if (dir == NULL)
		return false;

	while ((entry = readdir(dir)) != NULL) {
		if (entry->d_type == DT_DIR && entry->d_name[0] != '.') {
			found = true;
			break;
		}
	}

	/* Single exit path so the directory stream is always released. */
	closedir(dir);
	return found;
}

int cuda_plugin_init(int stage)
{
int ret;
Expand All @@ -481,10 +497,12 @@ int cuda_plugin_init(int stage)
}
}

if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
plugin_disabled = true;
return 0;
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE)) {
if (!has_gpu()) {
pr_info("No GPU device found; CUDA plugin is disabled\n");
plugin_disabled = true;
return 0;
}
}

ret = cuda_checkpoint_supports_flag("--action");
Expand Down

0 comments on commit 9a50892

Please sign in to comment.