Commit 11578f01 authored by Emmanuel Jaep's avatar Emmanuel Jaep
Browse files

debug

parent ab2190f7
......@@ -115,6 +115,11 @@
- name: run the scratch creation script
shell: /tmp/scratchVolume.sh
- name: put in place the script to be run at next boot
template:
src: rc.local
dest: /etc/rc.local
- name: set the semaphore file to indicate that the scratch is already configured
file:
path: /root/.scratch_already_configured
......@@ -249,23 +254,33 @@
when: custom_packages_installed.stat.exists == false
# CUDA
# - name: generate the script that will install CUDA upon reboot
# ansible.builtin.template:
# src: "templates/rc.local"
# dest: /etc/rc.local
# owner: root
# group: root
# mode: 0755
# - name: schedule the reboot of the machine to install CUDA
# shell: "sleep 15 && reboot &"
# NOTE(review): this unconditional role run duplicates the guarded
# "Deploy NVidia drivers" task in the block below — confirm both are intended.
- name: Deploy CUDA 11.2
  ansible.builtin.include_role:
    # name: cuda_11_2
    name: role-nvidia-driver
  vars:
    nvidia_driver_skip_reboot: true
    nvidia_driver_ubuntu_install_from_cuda_repo: true
    # nvidia_driver_ubuntu_cuda_repo_gpgkey_url: http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64
    # nvidia_driver_ubuntu_cuda_repo_gpgkey_url: https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}/a4b469963bf863cc.pub
    # nvidia_driver_ubuntu_cuda_repo_gpgkey_id: A4B469963BF863CC

# Semaphore file makes the CUDA installation below a one-time operation.
- name: check that the semaphore file exists (i.e. CUDA is already installed)
  ansible.builtin.stat:
    path: /root/.cuda_11-2_already_installed
  register: cuda_installed

- block:
    - name: Deploy NVidia drivers
      ansible.builtin.include_role:
        name: role-nvidia-driver
      vars:
        nvidia_driver_skip_reboot: true
        nvidia_driver_ubuntu_install_from_cuda_repo: true
    - name: Deploy Cuda 11.2
      apt:
        name: "cuda-11-2"
        state: latest
        update_cache: true
        cache_valid_time: 3600
    - name: set the semaphore file to indicate that CUDA is installed
      file:
        path: /root/.cuda_11-2_already_installed
        state: touch
  # `not x` preferred over `x == false` (ansible-lint literal-compare)
  when: not cuda_installed.stat.exists

# Reboot
- name: final Reboot
  shell: sleep 10 && reboot &
---
# Legacy Travis CI configuration for Ansible Galaxy role syntax testing.
language: python
python: "2.7"

# Use the new container infrastructure
sudo: false

# Install ansible
addons:
  apt:
    packages:
      - python-pip

install:
  # Install ansible
  - pip install ansible
  # Check ansible version
  - ansible --version
  # Create ansible.cfg with correct roles_path
  - printf '[defaults]\nroles_path=../' >ansible.cfg

script:
  # Basic role syntax check
  - ansible-playbook tests/test.yml -i tests/inventory --syntax-check

notifications:
  webhooks: https://galaxy.ansible.com/api/v1/notifications/
\ No newline at end of file
Role Name
=========
An Ansible role that installs NVIDIA GPU drivers (from either the Canonical repositories or the NVIDIA CUDA repository) on Ubuntu and RedHat-family hosts, and enables the persistence daemon.
Requirements
------------
Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required.
Role Variables
--------------
A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well.
Dependencies
------------
A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles.
Example Playbook
----------------
Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too:
- hosts: servers
roles:
- { role: username.rolename, x: 42 }
License
-------
BSD
Author Information
------------------
An optional section for the role authors to include contact information, or a website (HTML is not allowed).
# Package state/version for the NVIDIA driver; empty version means "unpinned".
nvidia_driver_package_state: present
nvidia_driver_package_version: ''
# Booleans written canonically (true/false) instead of yes/no (yamllint truthy).
nvidia_driver_persistence_mode_on: true
nvidia_driver_skip_reboot: true
nvidia_driver_module_file: /etc/modprobe.d/nvidia.conf
nvidia_driver_module_params: ''

##############################################################################
# RedHat family                                                              #
##############################################################################
epel_package: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm"
nvidia_driver_rhel_cuda_repo_baseurl: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/"
nvidia_driver_rhel_cuda_repo_gpgkey: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/7fa2af80.pub"

##############################################################################
# Ubuntu                                                                     #
##############################################################################
# Determine if we should install from CUDA repo instead of Canonical repos
nvidia_driver_ubuntu_install_from_cuda_repo: true

# Installing with Canonical repositories
nvidia_driver_ubuntu_branch: "495"
nvidia_driver_ubuntu_packages:
  - "nvidia-headless-{{ nvidia_driver_ubuntu_branch }}-server"
  - "nvidia-utils-{{ nvidia_driver_ubuntu_branch }}-server"
  - "nvidia-headless-no-dkms-{{ nvidia_driver_ubuntu_branch }}-server"
  - "nvidia-kernel-source-{{ nvidia_driver_ubuntu_branch }}-server"

# Installing with CUDA repositories
nvidia_driver_ubuntu_cuda_repo_gpgkey_url: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}/7fa2af80.pub"
nvidia_driver_ubuntu_cuda_repo_gpgkey_id: "7fa2af80"
nvidia_driver_ubuntu_cuda_repo_baseurl: "http://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}"
nvidia_driver_ubuntu_cuda_package: "cuda-drivers-{{ nvidia_driver_ubuntu_branch }}"
# 470.57.02
#python3 -c "import torch; torch.cuda.is_available()"
Package: nsight-compute
Pin: origin *ubuntu.com*
Pin-Priority: -1
Package: nsight-systems
Pin: origin *ubuntu.com*
Pin-Priority: -1
Package: *
Pin: release l=NVIDIA CUDA
Pin-Priority: 600
# systemd drop-in for nvidia-persistenced.service.
[Service]
# The empty ExecStart= resets the inherited command list so the line below
# REPLACES (rather than appends to) the packaged ExecStart.
ExecStart=
ExecStart=/usr/bin/nvidia-persistenced --user root --persistence-mode --verbose
galaxy_info:
  # NOTE(review): author/description/company/license are still Galaxy template
  # placeholders — fill in real values before publishing this role.
  author: your name
  description: your role description
  company: your company (optional)

  # If the issue tracker for your role is not on github, uncomment the
  # next line and provide a value
  # issue_tracker_url: http://example.com/issue/tracker

  # Choose a valid license ID from https://spdx.org - some suggested licenses:
  # - BSD-3-Clause (default)
  # - MIT
  # - GPL-2.0-or-later
  # - GPL-3.0-only
  # - Apache-2.0
  # - CC-BY-4.0
  license: license (GPL-2.0-or-later, MIT, etc)

  min_ansible_version: 2.1

  # If this a Container Enabled role, provide the minimum Ansible Container version.
  # min_ansible_container_version:

  # Provide a list of supported platforms, and for each platform a list of versions.
  # If you don't wish to enumerate all versions for a particular platform, use 'all'.
  # To view available platforms and versions (or releases), visit:
  # https://galaxy.ansible.com/api/v1/platforms/
  #
  # platforms:
  # - name: Fedora
  #   versions:
  #   - all
  #   - 25
  # - name: SomePlatform
  #   versions:
  #   - all
  #   - 1.0
  #   - 7
  #   - 99.99

  galaxy_tags: []
  # List tags for your role here, one per line. A tag is a keyword that describes
  # and categorizes the role. Users find roles by searching for tags. Be sure to
  # remove the '[]' above, if you add tags to this list.
  #
  # NOTE: A tag is limited to a single word comprised of alphanumeric characters.
  # Maximum 20 tags per role.

dependencies: []
# List your role dependencies here, one per line. Be sure to remove the '[]' above,
# if you add dependencies to this list.
---
# We have to do this because the CentOS mirrors don't keep kernel-headers, etc
# for older kernels.
- name: ensure we have kernel-headers installed for the current kernel
  block:
    - name: attempt to install kernel support packages for current version
      yum:
        name:
          - "kernel-headers-{{ ansible_kernel }}"
          - "kernel-tools-{{ ansible_kernel }}"
          - "kernel-tools-libs-{{ ansible_kernel }}"
          - "kernel-devel-{{ ansible_kernel }}"
          - "kernel-debug-devel-{{ ansible_kernel }}"
        state: present
      environment: "{{ proxy_env if proxy_env is defined else {} }}"
  rescue:
    - name: update the kernel to latest version so we have a supported version
      yum:
        name:
          - "kernel"
          - "kernel-headers"
          - "kernel-tools"
          - "kernel-tools-libs"
          - "kernel-devel"
          - "kernel-debug-devel"
        state: latest
      environment: "{{ proxy_env if proxy_env is defined else {} }}"
    - name: reboot to pick up the new kernel
      reboot:

- name: add epel repo
  become: true
  yum:
    name:
      - "{{ epel_package }}"
    state: latest
  environment: "{{ proxy_env if proxy_env is defined else {} }}"

# Block YAML instead of the legacy `yum: name=dkms` key=value form
# (Ansible best practice); state made explicit (present is the default).
- name: install dependencies
  yum:
    name: dkms
    state: present

- name: add repo
  yum_repository:
    name: cuda
    description: NVIDIA CUDA YUM Repo
    baseurl: "{{ nvidia_driver_rhel_cuda_repo_baseurl }}"
    gpgkey: "{{ nvidia_driver_rhel_cuda_repo_gpgkey }}"
  environment: "{{ proxy_env if proxy_env is defined else {} }}"

- name: install driver packages
  yum:
    name: "{{ nvidia_driver_package_version | ternary('nvidia-driver-latest-dkms-'+nvidia_driver_package_version, 'nvidia-driver-latest-dkms') }}"
    state: "{{ nvidia_driver_package_state }}"
    autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
  register: install_driver
  environment: "{{ proxy_env if proxy_env is defined else {} }}"
---
- name: remove ppa
  apt_repository:
    repo: ppa:graphics-drivers/ppa
    state: absent

# Pin NVIDIA's repo above Ubuntu's for the CUDA packages (see cuda-ubuntu.pin).
- name: add pin file
  copy:
    src: "cuda-ubuntu.pin"
    dest: "/etc/apt/preferences.d/cuda-repository-pin-600"
    owner: "root"
    group: "root"
    mode: "0644"

# - name: add key
#   apt_key:
#     url: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_url }}"
#     id: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}"
#   environment: "{{ proxy_env if proxy_env is defined else {} }}"

- name: check that cuda list does not exist (conflicting version)
  stat:
    path: /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list
  register: cuda_list_exists

- block:
    - name: make a backup copy of the file
      copy:
        src: "/etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"
        dest: "/etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list.bak"
        # src is a path on the managed host, not the controller
        remote_src: true
    - name: remove the list
      file:
        path: "/etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"
        state: absent
  when: cuda_list_exists.stat.exists

- name: remove the old nvidia key
  apt_key:
    id: "7fa2af80"
    state: absent

- name: add key
  apt_key:
    keyserver: keyserver.ubuntu.com
    id: "A4B469963BF863CC"

# NOTE(review): removed the stub "- name: get n" that had no module/action —
# a task consisting only of a name is a hard error for Ansible.

# # As per https://github.com/NVIDIA/nvidia-docker/issues/1632
# - name: remove the old nvidia key
#   apt_key:
#     id: "7fa2af80"
#     state: absent

# cuda-keyring installs NVIDIA's signing key the currently supported way.
- name: install the package
  apt:
    deb: "https://developer.download.nvidia.com/compute/cuda/repos/{{ ansible_facts['distribution'] | lower }}{{ ansible_facts['distribution_version'] | replace('.', '') }}/{{ ansible_facts['architecture'] }}/cuda-keyring_1.0-1_all.deb"

- name: add repo
  apt_repository:
    repo: "deb {{ nvidia_driver_ubuntu_cuda_repo_baseurl }} /"
    update_cache: true
  environment: "{{ proxy_env if proxy_env is defined else {} }}"

- name: install driver packages
  apt:
    name: "{{ nvidia_driver_package_version | ternary(nvidia_driver_ubuntu_cuda_package+'='+nvidia_driver_package_version, nvidia_driver_ubuntu_cuda_package) }}"
    state: "{{ nvidia_driver_package_state }}"
    autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
    purge: "{{ nvidia_driver_package_state == 'absent' }}"
    allow_unauthenticated: true
  register: install_driver
  environment: "{{ proxy_env if proxy_env is defined else {} }}"
---
- name: remove ppa
  apt_repository:
    repo: ppa:graphics-drivers/ppa
    state: absent

# Install the branch-pinned driver packages from the Canonical repositories,
# optionally pinned to nvidia_driver_package_version.
- name: install driver packages
  apt:
    name: "{{ nvidia_driver_package_version | ternary(item+'='+nvidia_driver_package_version, item) }}"
    state: "{{ nvidia_driver_package_state }}"
    autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
    purge: "{{ nvidia_driver_package_state == 'absent' }}"
  with_items: "{{ nvidia_driver_ubuntu_packages }}"
  register: install_driver
  environment: "{{ proxy_env if proxy_env is defined else {} }}"
---
# Keep GPU nodes awake: mask all sleep-related systemd targets.
- name: Disable services
  systemd:
    name: "{{ item }}"
    enabled: false
    masked: true
  with_items:
    - sleep.target
    - suspend.target
    - hibernate.target
    - hybrid-sleep.target

- name: Stop services
  service:
    name: "{{ item }}"
    state: stopped
  with_items:
    - sleep.target
    - suspend.target
    - hibernate.target
    - hybrid-sleep.target

# The in-kernel nouveau driver conflicts with the proprietary NVIDIA driver;
# unloading may fail (e.g. already absent), hence ignore_errors.
- name: unload nouveau
  modprobe:
    name: nouveau
    state: absent
  ignore_errors: true

- name: ubuntu install tasks (canonical repos)
  include_tasks: install-ubuntu.yml
  when: ansible_distribution == 'Ubuntu' and (not nvidia_driver_ubuntu_install_from_cuda_repo)

- name: ubuntu install tasks (CUDA repo)
  include_tasks: install-ubuntu-cuda-repo.yml
  when: ansible_distribution == 'Ubuntu' and nvidia_driver_ubuntu_install_from_cuda_repo

- name: redhat family install tasks
  include_tasks: install-redhat.yml
  when: ansible_os_family == 'RedHat'

- name: create persistenced override dir
  file:
    path: /etc/systemd/system/nvidia-persistenced.service.d/
    state: directory
    recurse: true

- name: configure persistenced service to turn on persistence mode
  copy:
    src: nvidia-persistenced-override.conf
    dest: /etc/systemd/system/nvidia-persistenced.service.d/override.conf
  when: nvidia_driver_persistence_mode_on

- name: remove persistenced service override
  file:
    path: /etc/systemd/system/nvidia-persistenced.service.d/override.conf
    state: absent
  when: not nvidia_driver_persistence_mode_on

- name: enable persistenced
  systemd:
    name: nvidia-persistenced
    enabled: true
  when: nvidia_driver_package_state != 'absent'

- name: set module parameters
  template:
    src: nvidia.conf.j2
    dest: "{{ nvidia_driver_module_file }}"
    mode: '0644'

# install_driver is registered by whichever install-*.yml ran above.
- name: reboot after driver install
  reboot:
  when: install_driver.changed and not nvidia_driver_skip_reboot
# Fabric manager / DCGM / CUDA toolkits (installed unconditionally, as before).
- name: Install packages
  become: true
  apt:
    name: "{{ item }}"
    state: present
    update_cache: true
  with_items:
    - nvidia-fabricmanager-{{ nvidia_driver_ubuntu_branch }}
    - datacenter-gpu-manager
    - cuda-toolkit-11-4
    - cuda-toolkit-11-3

# NOTE(review): the source contained a second Enable/Start pair for nvidia-dcgm
# guarded by `when: accelerator == "nvidia-a100"`. Since the unconditional pair
# below already enables/starts the service on every host (re-enabling an
# enabled service is a no-op), the guarded duplicates were removed; net state
# on all hosts is unchanged.
- name: Enable services
  service:
    name: "{{ item }}"
    enabled: true
  with_items:
    - nvidia-dcgm

- name: Start services
  service:
    name: "{{ item }}"
    state: started
  with_items:
    - nvidia-dcgm
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"path": "/usr/bin/nvidia-container-runtime",
"runtimeArgs": []
}
}
}
---
# Minimal smoke-test playbook: apply the role to localhost (used by CI
# `--syntax-check`).
- hosts: localhost
  remote_user: root
  roles:
    - cuda_11_2
# Distro-specific path fragment of NVIDIA's CUDA repo URL, e.g. "ubuntu2004/x86_64".
_ubuntu_repo_dir: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | replace('.', '') }}/{{ ansible_architecture }}"
# RedHat-family equivalent, e.g. "rhel8/x86_64".
_rhel_repo_dir: "rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}"
#!/bin/sh -e
#!/usr/bin/env bash
# One-shot first-boot hook: download and run the CUDA installer exactly once,
# using a flag file to remember that it already ran.
FLAG="/var/log/firstboot.cuda.log"
if [ ! -f "$FLAG" ]; then
touch "$FLAG"
# > (truncate) instead of >> so a stale/partial /tmp/cuda.sh is never appended
# to; && instead of ; so chmod/execution only happen if the download succeeded.
# NOTE(review): download is plain http from an internal host — consider https.
curl -s http://install.iccluster.epfl.ch/scripts/soft/cuda/cuda_10.1.168_418.67.sh > /tmp/cuda.sh && chmod +x /tmp/cuda.sh && /tmp/cuda.sh
fi
# First-boot fixer: adjust /scratch ownership/permissions exactly once,
# remembering completion via a flag file; all steps logged to /var/log/nlp.sh.log.
set -e #Exit immediately if a pipeline returns a non-zero status
startTS=$(date --iso-8601='seconds')
echo "[$startTS] [info] starting the /scratch ownership fixer" | tee -a /var/log/nlp.sh.log
FLAGSCRATCH="/var/log/firstboot.scratch.log"
if [ ! -f $FLAGSCRATCH ]; then
echo "[$(date --iso-8601='seconds')] [info] first boot, running /scratch ownership fix" | tee -a /var/log/nlp.sh.log
# create the flag before fixing, so a crash mid-fix still marks the attempt
touch $FLAGSCRATCH
# group-writable so members of dlab_AppGrpU can write to /scratch
chmod 775 /scratch
chown root:dlab_AppGrpU /scratch
else
echo "[$(date --iso-8601='seconds')] [info] not first boot, skipping /scratch ownership fix" | tee -a /var/log/nlp.sh.log
fi
# end
endTS=$(date --iso-8601='seconds')
echo "[$endTS] [info] finished" | tee -a /var/log/nlp.sh.log
exit 0
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment