Commit 0cf034e3 authored by Emmanuel Jaep

complete revamp of the nvidia driver installation

parent 3a095604
@@ -249,13 +249,16 @@
       when: custom_packages_installed.stat.exists == false
   # CUDA
-  - name: generate the script that will install CUDA upon reboot
-    ansible.builtin.template:
-      src: "templates/rc.local"
-      dest: /etc/rc.local
-      owner: root
-      group: root
-      mode: 0755
+  # - name: generate the script that will install CUDA upon reboot
+  #   ansible.builtin.template:
+  #     src: "templates/rc.local"
+  #     dest: /etc/rc.local
+  #     owner: root
+  #     group: root
+  #     mode: 0755
   # - name: schedule the reboot of the machine to install CUDA
   #   shell: "sleep 15 && reboot &"
+  - name: include the tasks from the config-gpu
+    include_tasks: "tmp/config-gpu/tasks/main.yml"
+  - name: schedule the reboot of the machine to install CUDA
+    shell: "sleep 10 && reboot &"
nvidia_driver_package_state: present
nvidia_driver_package_version: ''
nvidia_driver_persistence_mode_on: yes
nvidia_driver_skip_reboot: yes
nvidia_driver_module_file: /etc/modprobe.d/nvidia.conf
nvidia_driver_module_params: ''
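# The defaults above can be overridden per host or group. A minimal sketch,
# assuming a group_vars file (the path and the pinned version are assumptions,
# not part of this commit):
#
#   # group_vars/g10.yml
#   nvidia_driver_package_version: '470.57.02-1'   # pin a specific driver build
#   nvidia_driver_skip_reboot: no                  # let the role reboot the host itself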
##############################################################################
# RedHat family #
##############################################################################
epel_package: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm"
nvidia_driver_rhel_cuda_repo_baseurl: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/"
nvidia_driver_rhel_cuda_repo_gpgkey: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/7fa2af80.pub"
##############################################################################
# Ubuntu #
##############################################################################
# Determine if we should install from CUDA repo instead of Canonical repos
nvidia_driver_ubuntu_install_from_cuda_repo: yes
# Installing with Canonical repositories
nvidia_driver_ubuntu_branch: "495"
nvidia_driver_ubuntu_packages:
  - "nvidia-headless-{{ nvidia_driver_ubuntu_branch }}-server"
  - "nvidia-utils-{{ nvidia_driver_ubuntu_branch }}-server"
  - "nvidia-headless-no-dkms-{{ nvidia_driver_ubuntu_branch }}-server"
  - "nvidia-kernel-source-{{ nvidia_driver_ubuntu_branch }}-server"
# Installing with CUDA repositories
nvidia_driver_ubuntu_cuda_repo_gpgkey_url: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}/7fa2af80.pub"
nvidia_driver_ubuntu_cuda_repo_gpgkey_id: "7fa2af80"
nvidia_driver_ubuntu_cuda_repo_baseurl: "http://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}"
nvidia_driver_ubuntu_cuda_package: "cuda-drivers-{{ nvidia_driver_ubuntu_branch }}"
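# For example, on Ubuntu 20.04 / x86_64 the _ubuntu_repo_dir var (defined in
# the vars file further below) expands the baseurl to
# http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64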
# 470.57.02
# python3 -c "import torch; print(torch.cuda.is_available())"
Package: nsight-compute
Pin: origin *ubuntu.com*
Pin-Priority: -1

Package: nsight-systems
Pin: origin *ubuntu.com*
Pin-Priority: -1

Package: *
Pin: release l=NVIDIA CUDA
Pin-Priority: 600
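The pins above block the Ubuntu-archive builds of nsight-compute and
nsight-systems (priority -1 means "never install") and prefer anything from
the NVIDIA CUDA repo (priority 600 beats the default 500). A quick way to
confirm apt agrees; a sketch, not part of this commit:

- name: check which repo wins for the driver metapackage
  command: apt-cache policy cuda-drivers
  register: cuda_pin_check
  changed_when: false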
[Service]
# The empty ExecStart= first clears the command inherited from the packaged
# unit; the drop-in then replaces it with persistence mode enabled.
ExecStart=
ExecStart=/usr/bin/nvidia-persistenced --user root --persistence-mode --verbose
---
# We have to do this because the CentOS mirrors don't keep kernel-headers, etc.
# for older kernels.
- name: ensure we have kernel-headers installed for the current kernel
  block:
    - name: attempt to install kernel support packages for current version
      yum:
        name:
          - "kernel-headers-{{ ansible_kernel }}"
          - "kernel-tools-{{ ansible_kernel }}"
          - "kernel-tools-libs-{{ ansible_kernel }}"
          - "kernel-devel-{{ ansible_kernel }}"
          - "kernel-debug-devel-{{ ansible_kernel }}"
        state: present
      environment: "{{ proxy_env if proxy_env is defined else {} }}"
  rescue:
    - name: update the kernel to latest version so we have a supported version
      yum:
        name:
          - "kernel"
          - "kernel-headers"
          - "kernel-tools"
          - "kernel-tools-libs"
          - "kernel-devel"
          - "kernel-debug-devel"
        state: latest
      environment: "{{ proxy_env if proxy_env is defined else {} }}"
    - name: reboot to pick up the new kernel
      reboot:

- name: add epel repo
  become: true
  yum:
    name:
      - "{{ epel_package }}"
    state: latest
  environment: "{{ proxy_env if proxy_env is defined else {} }}"

- name: install dependencies
  yum:
    name: dkms

- name: add repo
  yum_repository:
    name: cuda
    description: NVIDIA CUDA YUM Repo
    baseurl: "{{ nvidia_driver_rhel_cuda_repo_baseurl }}"
    gpgkey: "{{ nvidia_driver_rhel_cuda_repo_gpgkey }}"
  environment: "{{ proxy_env if proxy_env is defined else {} }}"

- name: install driver packages
  yum:
    name: "{{ nvidia_driver_package_version | ternary('nvidia-driver-latest-dkms-' + nvidia_driver_package_version, 'nvidia-driver-latest-dkms') }}"
    state: "{{ nvidia_driver_package_state }}"
    autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
  register: install_driver
  environment: "{{ proxy_env if proxy_env is defined else {} }}"
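# With the ternary above, setting nvidia_driver_package_version to e.g.
# '470.57.02' (an assumed value) makes yum install
# nvidia-driver-latest-dkms-470.57.02; left empty, the latest build is installed.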
---
- name: remove ppa
  apt_repository:
    repo: ppa:graphics-drivers/ppa
    state: absent

- name: add pin file
  copy:
    src: "cuda-ubuntu.pin"
    dest: "/etc/apt/preferences.d/cuda-repository-pin-600"
    owner: "root"
    group: "root"
    mode: "0644"

- name: add key
  apt_key:
    url: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_url }}"
    id: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}"
  environment: "{{ proxy_env if proxy_env is defined else {} }}"

- name: add repo
  apt_repository:
    repo: "deb {{ nvidia_driver_ubuntu_cuda_repo_baseurl }} /"
    update_cache: yes
  environment: "{{ proxy_env if proxy_env is defined else {} }}"

- name: install driver packages
  apt:
    name: "{{ nvidia_driver_package_version | ternary(nvidia_driver_ubuntu_cuda_package + '=' + nvidia_driver_package_version, nvidia_driver_ubuntu_cuda_package) }}"
    state: "{{ nvidia_driver_package_state }}"
    autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
    purge: "{{ nvidia_driver_package_state == 'absent' }}"
  register: install_driver
  environment: "{{ proxy_env if proxy_env is defined else {} }}"
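# Same pattern here: with nvidia_driver_ubuntu_branch "495" and no explicit
# version, apt installs the metapackage cuda-drivers-495; a non-empty
# nvidia_driver_package_version turns that into cuda-drivers-495=<version>.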
---
- name: remove ppa
  apt_repository:
    repo: ppa:graphics-drivers/ppa
    state: absent

- name: install driver packages
  apt:
    name: "{{ nvidia_driver_package_version | ternary(item + '=' + nvidia_driver_package_version, item) }}"
    state: "{{ nvidia_driver_package_state }}"
    autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
    purge: "{{ nvidia_driver_package_state == 'absent' }}"
  with_items: "{{ nvidia_driver_ubuntu_packages }}"
  register: install_driver
  environment: "{{ proxy_env if proxy_env is defined else {} }}"
---
- name: Disable services
  systemd:
    name: "{{ item }}"
    enabled: no
    masked: yes
  with_items:
    - sleep.target
    - suspend.target
    - hibernate.target
    - hybrid-sleep.target

- name: Stop services
  service:
    name: "{{ item }}"
    state: stopped
  with_items:
    - sleep.target
    - suspend.target
    - hibernate.target
    - hybrid-sleep.target

- name: unload nouveau
  modprobe:
    name: nouveau
    state: absent
  ignore_errors: true

- name: ubuntu install tasks (canonical repos)
  include_tasks: install-ubuntu.yml
  when: ansible_distribution == 'Ubuntu' and (not nvidia_driver_ubuntu_install_from_cuda_repo)

- name: ubuntu install tasks (CUDA repo)
  include_tasks: install-ubuntu-cuda-repo.yml
  when: ansible_distribution == 'Ubuntu' and nvidia_driver_ubuntu_install_from_cuda_repo

- name: redhat family install tasks
  include_tasks: install-redhat.yml
  when: ansible_os_family == 'RedHat'

- name: create persistenced override dir
  file:
    path: /etc/systemd/system/nvidia-persistenced.service.d/
    state: directory
    recurse: yes

- name: configure persistenced service to turn on persistence mode
  copy:
    src: nvidia-persistenced-override.conf
    dest: /etc/systemd/system/nvidia-persistenced.service.d/override.conf
  when: nvidia_driver_persistence_mode_on

- name: remove persistenced service override
  file:
    path: /etc/systemd/system/nvidia-persistenced.service.d/override.conf
    state: absent
  when: not nvidia_driver_persistence_mode_on

- name: enable persistenced
  systemd:
    name: nvidia-persistenced
    enabled: yes
  when: nvidia_driver_package_state != 'absent'

- name: set module parameters
  template:
    src: nvidia.conf.j2
    dest: "{{ nvidia_driver_module_file }}"
    mode: '0644'

- name: reboot after driver install
  reboot:
  when: install_driver.changed and not nvidia_driver_skip_reboot

- name: Install packages
  become: true
  apt:
    name: "{{ item }}"
    state: present
    update_cache: yes
  with_items:
    - "nvidia-fabricmanager-{{ nvidia_driver_ubuntu_branch }}"
    - datacenter-gpu-manager
    - cuda-toolkit-11-4
    - cuda-toolkit-11-3
- name: Enable services
  service:
    name: "{{ item }}"
    enabled: yes
  with_items:
    - nvidia-dcgm
  when: accelerator == "nvidia-a100"

- name: Start services
  service:
    name: "{{ item }}"
    state: started
  with_items:
    - nvidia-dcgm
  when: accelerator == "nvidia-a100"
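After the driver, fabric manager, and DCGM are in place, a post-install smoke
test could look like this (a sketch; these checks are assumptions, not part of
this commit):

- name: confirm the kernel module is loaded and devices are visible
  command: nvidia-smi
  register: smi_out
  changed_when: false

- name: confirm DCGM answers
  command: dcgmi discovery -l
  register: dcgm_out
  changed_when: false
  when: accelerator == "nvidia-a100"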
{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "/usr/bin/nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}
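Since "default-runtime" only takes effect once the Docker daemon reloads its
configuration, a restart is needed somewhere; a minimal sketch (this task is
an assumption, not shown in this commit):

- name: restart docker to pick up the NVIDIA default runtime
  service:
    name: docker
    state: restarted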
{{ nvidia_driver_module_params }}
_ubuntu_repo_dir: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | replace('.', '') }}/{{ ansible_architecture }}"
_rhel_repo_dir: "rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}"
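# e.g. CentOS/RHEL 7 on x86_64 resolves _rhel_repo_dir to "rhel7/x86_64"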
k8s_cluster:
  children:
    g9:
      hosts:
        iccluster[170:202].iccluster.epfl.ch:
        iccluster[204:208].iccluster.epfl.ch:
      vars:
        gpu: True
        accelerator: nvidia-v100
        runaitype: G9
        bond: True
        nvme: True
        dockerVolume: False
        interface_bond_1: eno1
        interface_bond_2: eno2
    g10:
      hosts:
        iccluster[209:225].iccluster.epfl.ch:
      vars:
        gpu: True
        runaitype: G10
        accelerator: nvidia-a100
        bond: True
        nvme: True
        dockerVolume: False
        interface_bond_1: ens21f0
        interface_bond_2: ens21f1
    s8:
      hosts:
        iccluster[150:169].iccluster.epfl.ch:
      vars:
        gpu: False
        accelerator:
        runaitype: S8
        bond: True
        nvme: False
        dockerVolume: False
        interface_bond_1: ens4f0
        interface_bond_2: ens4f1
    admin:
      hosts:
        icadmin00[6:8].iccluster.epfl.ch:
      vars:
        gpu: False
        runaitype: ICAdmin
        accelerator:
        bond: False
        nvme: False
        dockerVolume: False
        interface_bond_1: ens785f0
        interface_bond_2: ens785f1
        ansible_python_interpreter: /usr/bin/python3

kube_control_plane:
  hosts:
    icadmin00[6:8].iccluster.epfl.ch:
  vars:
    kubecost_namespace: "monitoring"
    kubecost_node_port: 31787
    kubecost_prometheus_server:
      node_port: 31784
      pvc:
        name: "pvc-kubecost-prometheus-server"
      pv:
        name: "pv-kubecost-prometheus-server"
        nfs_server: "icadmin009.iccluster.epfl.ch"
        nfs_server_share: "/data/tsdb"
      storage_size: 32.0Gi
    kubecost_cost_analyzer:
      pv:
        nfs_server: "icadmin009.iccluster.epfl.ch"
        nfs_server_share: "/data/kubecost"

etcd:
  hosts:
    icadmin00[6:8].iccluster.epfl.ch:

kube_node:
  hosts:
    iccluster[150:169].iccluster.epfl.ch:
    iccluster[170:202].iccluster.epfl.ch:
    iccluster[204:208].iccluster.epfl.ch:
    iccluster[209:225].iccluster.epfl.ch:

calico_rr:
  hosts:

all:
  hosts:
    icadmin00[6:8].iccluster.epfl.ch:
    iccluster[150:169].iccluster.epfl.ch:
    iccluster[170:202].iccluster.epfl.ch:
    iccluster[204:208].iccluster.epfl.ch: