diff --git a/ansible/roles/ai_setup_nvidia_cuda/defaults/main.yml b/ansible/roles/ai_setup_nvidia_cuda/defaults/main.yml index 3f8b4c234da..1362fa4da0e 100644 --- a/ansible/roles/ai_setup_nvidia_cuda/defaults/main.yml +++ b/ansible/roles/ai_setup_nvidia_cuda/defaults/main.yml @@ -3,44 +3,28 @@ # Common vars -ai_setup_nvidia_cuda_python_version: '3.11' -ai_setup_nvidia_cuda_cuda_version: '12-4' -ai_setup_nvidia_cuda_debug: false +setup_nvidia_cuda_python_version: '3.11' +setup_nvidia_cuda_cuda_version: '12.4' +setup_nvidia_cuda_debug: false -ai_setup_nvidia_cuda_rhel_repos: +setup_nvidia_cuda_common_dnf_packages: - - name: CUDA - description: CUDA Repository - baseurl: https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo - enabled: true - gpgcheck: false - -ai_setup_nvidia_cuda_common_dnf_packages: - - - "python{{ ai_setup_nvidia_cuda_python_version }}" - - "python{{ ai_setup_nvidia_cuda_python_version }}-devel" - - "python{{ ai_setup_nvidia_cuda_python_version }}-pip" + - "python{{ setup_nvidia_cuda_python_version }}" + - "python{{ setup_nvidia_cuda_python_version }}-devel" + - "python{{ setup_nvidia_cuda_python_version }}-pip" - pciutils - nvtop - screen - tmux - hyperfine -# RHEL Vars - -ai_setup_nvidia_cuda_nvidia_rhel_dnf_packages: +setup_nvidia_cuda_nvidia_rhel_dnf_packages: - "@nvidia-driver:latest-dkms" - cuda-toolkit - nvidia-gds -ai_setup_nvidia_cuda_rhel_repos: - - # - name: epel-release-latest-9.noarch - # description: EPEL Repository - # baseurl: https://dl.fedoraproject.org/pub/epel - # enabled: true - # gpgcheck: false +setup_nvidia_cuda_rhel_repos: - name: cuda-rhel-x86_64 description: NVIDIA CUDA Repository @@ -48,11 +32,3 @@ ai_setup_nvidia_cuda_rhel_repos: enabled: true gpgcheck: false -# Nvidia Vars - -ai_setup_nvidia_cuda_fedora_version: 39 - -ai_setup_nvidia_cuda_nvidia_fedora_dnf_packages: - - - "@nvidia-driver:open-dkms" - - cuda-toolkit-12-4 diff --git a/ansible/roles/ai_setup_nvidia_cuda/tasks/main.yml 
b/ansible/roles/ai_setup_nvidia_cuda/tasks/main.yml index f0f8828a651..8fdda276404 100644 --- a/ansible/roles/ai_setup_nvidia_cuda/tasks/main.yml +++ b/ansible/roles/ai_setup_nvidia_cuda/tasks/main.yml @@ -17,7 +17,7 @@ baseurl: "{{ repo.baseurl }}" enabled: "{{ repo.enabled | default(true) }}" gpgcheck: "{{ repo.gpgcheck | default(false) }}" - loop: "{{ ai_setup_nvidia_cuda_rhel_repos }}" + loop: "{{ setup_nvidia_cuda_rhel_repos }}" loop_control: loop_var: repo @@ -25,56 +25,41 @@ ansible.builtin.dnf: name: "{{ package }}" state: present - loop: "{{ ai_setup_nvidia_cuda_nvidia_rhel_dnf_packages }}" + loop: "{{ setup_nvidia_cuda_nvidia_rhel_dnf_packages }}" loop_control: loop_var: package -- name: Setup Nvidia Drivers and CUDA for Fedora - when: ansible_distribution == 'Fedora' - block: + - name: Setup alternatives properly + block: - - name: Add a DNF repository - ansible.builtin.yum_repository: - name: cuda-fedora39 - description: NVIDIA CUDA Repository - baseurl: https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64 - enabled: true - gpgcheck: false + - name: Set CUDA alternatives/cuda + community.general.alternatives: + name: cuda + path: "/usr/local/cuda-{{ setup_nvidia_cuda_cuda_version }}" + link: /usr/local/cuda  # generic path managed by alternatives, not the internal /etc/alternatives/cuda symlink + + # - name: Remove the cuda-12 alternative that gets misconfigured + # ansible.builtin.file: + # path: /etc/alternatives/cuda-12 + # state: absent + # + # - name: Set CUDA alternatives/cuda-12 + # community.general.alternatives: + # name: cuda-12 + # path: "/usr/local/cuda-{{ setup_nvidia_cuda_cuda_version }}" + # link: /etc/alternatives/cuda-12 - - name: Setup nvdia repo, drivers, and cuda - ansible.builtin.dnf: - name: "{{ package }}" - state: present - loop: "{{ ai_setup_nvidia_cuda_nvidia_fedora_dnf_packages }}" - loop_control: - loop_var: package - -- name: Debug - Setup Nvidia Drivers and CUDA - when: ai_setup_nvidia_cuda_debug | default(false) | bool - block: - - - name: Check video driver - ansible.builtin.shell: 
"lspci -nn -k | grep -A 2 -e VGA -e 3D" - register: r_video_driver_check - changed_when: false - ignore_errors: true - - - name: Output video driver check - ansible.builtin.debug: - var: r_video_driver_check.stdout_lines - -# Common tasks to RHEL and Fedora + tags: + - nvidia-alternatives - name: Install common AI centric toolchain packages ansible.builtin.dnf: name: "{{ package }}" state: present - loop: "{{ ai_setup_nvidia_cuda_common_dnf_packages }}" + loop: "{{ setup_nvidia_cuda_common_dnf_packages }}" loop_control: loop_var: package -# TODO: Need to add a check here to see if the video driver is in use, if not, reboot the machine - # TODO: Is reboot really necessary here? Investigate # - name: Reboot the machine