Skip to content

Commit

Permalink
Initial POC for installing nvidia driver on EDPM nodes
Browse files Browse the repository at this point in the history
TODO:
 - molecule tests
 - docs
 - more checks
  • Loading branch information
sbauza committed Oct 7, 2024
1 parent 1d7b650 commit 23da455
Show file tree
Hide file tree
Showing 18 changed files with 416 additions and 1 deletion.
2 changes: 1 addition & 1 deletion galaxy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace: cifmw
name: general

# The version of the collection. Must be compatible with semantic versioning
version: 1.0.0
version: 1.0.0+15f75128

# The path to the Markdown (.md) readme file. This path is relative to the root of the collection
readme: README.md
Expand Down
67 changes: 67 additions & 0 deletions playbooks/nvidia-mdev.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
---
# Copyright 2024 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


# Play 1: discover the EDPM nodes from the OpenStackBaremetalSet resources
# and register them in the in-memory inventory so that a later play can
# target them by group name.
- name: Gather the list of EDPM computes
  hosts: "{{ cifmw_target_hook_host | default('localhost') }}"
  gather_facts: false
  tasks:
    # Read-only query: dump every OpenStackBaremetalSet of the namespace as
    # YAML on stdout.
    # NOTE(review): `namespace` is also the name of a Jinja2 built-in global;
    # confirm that the default('openstack') fallback is really applied when
    # the variable is not set.
    - name: Fetch OSP BMO nodesets
      environment:
        KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
        PATH: "{{ cifmw_path }}"
      ansible.builtin.command:
        cmd: >-
          oc get OpenStackBaremetalSet -n "{{ namespace|default('openstack') }}" -o yaml
      register: _osp_bmo_nodesets_oc_out
      # `oc get` does not modify the cluster, so never report "changed".
      changed_when: false

    # Build one inventory entry per baremetal host found in the nodesets.
    # The group is the part of the host name before the first "-", with an
    # "s" appended (a host named "compute-0" lands in the "computes" group).
    # NOTE(review): ansible_user_dir is a gathered fact while this play sets
    # gather_facts: false -- confirm it is provided by the caller or a cache.
    - name: Add OSP BMO nodesets to Ansible
      ansible.builtin.add_host:
        name: "{{ item.name }}"
        groups: "{{ item.group }}"
        ansible_ssh_user: "{{ item.user }}"
        ansible_host: "{{ item.ip }}"
        ansible_ssh_private_key_file: "{{ ansible_user_dir }}/.ssh/id_cifw"
        ansible_ssh_extra_args: '-o StrictHostKeyChecking=no'
      loop: >-
        {% set hosts = [] -%}
        {% set nodesets = (_osp_bmo_nodesets_oc_out.stdout | from_yaml)['items'] | default([]) -%}
        {% for spec in nodesets | map(attribute='spec') -%}
        {% for host_key, host_val in spec.baremetalHosts.items() -%}
        {% set _ = hosts.append(
          {
            'name': host_key,
            'ip': host_val['ctlPlaneIP'] | ansible.utils.ipaddr('address'),
            'user': spec.cloudUserName,
            'group': (host_key | split('-') | first) + 's'
          }) -%}
        {% endfor -%}
        {% endfor -%}
        {{ hosts }}
# Play 2: apply the edpm_nvidia_mdev_prepare role, one phase after the
# other, on every host of the "computes" inventory group.
- name: Run the Nvidia role
  hosts: computes
  tasks:
    # As a reminder, at the end of phase1, the compute will reboot.
    - name: Run phase1
      ansible.builtin.import_role:
        name: edpm_nvidia_mdev_prepare
        tasks_from: phase1

    # Runs the role's phase2 task file.
    - name: Run phase 2
      ansible.builtin.import_role:
        name: edpm_nvidia_mdev_prepare
        tasks_from: phase2
10 changes: 10 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# edpm_nvidia_mdev_prepare
Please explain the role purpose.

## Privilege escalation
If applicable, please explain the privilege escalation done in this role.

## Parameters
* `param_1`: this is an example

## Examples
28 changes: 28 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Whether the nouveau kernel driver has to be blacklisted on the node.
cifmw_edpm_nvidia_mdev_prepare_disable_nouveau: true

# URL or local path of the nvidia driver RPM to install. Empty by default,
# so it has to be provided by the caller.
cifmw_edpm_nvidia_mdev_prepare_driver_url: ''

# Name of the package that the nvidia driver RPM installs.
cifmw_edpm_nvidia_mdev_prepare_package_name: 'NVIDIA-vGPU-rhel'

# SR-IOV GPU devices on which virtual functions (VFs) should be created.
cifmw_edpm_nvidia_mdev_prepare_sriov_devices:
  - 'ALL'
Empty file.
26 changes: 26 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/files/[email protected]
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Oneshot unit that runs sriov-manage for the unit instance name (%i) in
# order to enable Nvidia GPU virtual functions.
[Unit]
Description = Enable Nvidia GPU virtual functions
# Ordered after the Nvidia vGPU services.
After = nvidia-vgpu-mgr.service nvidia-vgpud.service

[Service]
Type = oneshot
# Keep the unit reported as active once the command has exited.
RemainAfterExit = True
User = root
Group = root
# Wait 30 seconds before enabling the virtual functions.
# NOTE(review): presumably leaves time for the vGPU services to settle;
# confirm the delay is still needed.
ExecStartPre = /usr/bin/sleep 30
ExecStart = /usr/lib/nvidia/sriov-manage -e %i
# Time limit, in seconds, applied to starting and stopping the unit.
TimeoutSec = 120
# Run the unit inside system.slice.
Slice = system.slice
# Resource accounting gives us the ability to see resource usage through
# the `systemd-cgtop` command.
CPUAccounting = True
BlockIOAccounting = True
MemoryAccounting = True
TasksAccounting = True

[Install]
WantedBy = multi-user.target
15 changes: 15 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/handlers/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
30 changes: 30 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/meta/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


# Ansible Galaxy metadata describing the role.
galaxy_info:
  namespace: cifmw
  author: CI Framework
  company: Red Hat
  description: CI Framework Role -- edpm_nvidia_mdev_prepare
  license: Apache-2.0
  min_ansible_version: "2.14"
  galaxy_tags:
    - cifmw

# List your role dependencies here, one per line. Be sure to remove the
# '[]' below if you add dependencies to this list.
dependencies: []
29 changes: 29 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/molecule/default/converge.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


# Molecule converge play: runs both phases of the role on every host of
# the scenario.
# NOTE(review): phase1 asserts that
# cifmw_edpm_nvidia_mdev_prepare_driver_url is non-empty and reboots the
# host, while the role default is empty -- this scenario presumably needs
# that variable to be provided; confirm.
- name: Converge
  hosts: all
  tasks:
    # First phase of the role.
    - name: Run phase1
      ansible.builtin.import_role:
        name: edpm_nvidia_mdev_prepare
        tasks_from: phase1

    # Second phase of the role.
    - name: Run phase 2
      ansible.builtin.import_role:
        name: edpm_nvidia_mdev_prepare
        tasks_from: phase2
11 changes: 11 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/molecule/default/molecule.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# Mainly used to override the defaults set in .config/molecule/
# By default, it uses the "config_podman.yml" - in CI, it will use
# "config_local.yml".
# Enable logging at the scenario level.
log: true

# Run the scenario with the Ansible provisioner, logging enabled, and the
# "yaml" stdout callback selected through the environment.
provisioner:
  name: ansible
  log: true
  env:
    ANSIBLE_STDOUT_CALLBACK: yaml
21 changes: 21 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/molecule/default/prepare.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.


# Molecule prepare play: applies the test_deps role to every host of the
# scenario.
- name: Prepare
  hosts: all
  roles:
    - test_deps
19 changes: 19 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/tasks/cleanup.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Placeholder: no cleanup is implemented yet, this only prints a message.
- name: Cleaning the World
  ansible.builtin.debug:
    msg: 'So here edpm_nvidia_mdev_prepare should clean things up!'
16 changes: 16 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
# Copyright Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

78 changes: 78 additions & 0 deletions roles/edpm_nvidia_mdev_prepare/tasks/phase1.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
---
# Copyright 2024 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Write a modprobe configuration that blacklists the nouveau driver.
# With `force: false` the file is only written when it does not exist
# yet. The result is registered so that a later task can tell whether
# the initramfs has to be regenerated.
- name: Blacklist nouveau
  when: cifmw_edpm_nvidia_mdev_prepare_disable_nouveau | bool
  become: true
  ansible.builtin.copy:
    dest: /etc/modprobe.d/blacklist-nouveau.conf
    mode: "0644"
    force: false
    content: |-
      blacklist nouveau
      options nouveau modeset=0
  register: _blacklist_nouveau

# Stop here unless the caller provided a non-empty driver URL or path.
- name: Make sure that we defined the driver URL
  ansible.builtin.assert:
    msg: 'You need to set cifmw_edpm_nvidia_mdev_prepare_driver_url'
    that:
      - cifmw_edpm_nvidia_mdev_prepare_driver_url is defined
      - cifmw_edpm_nvidia_mdev_prepare_driver_url | length > 0

# Collect the installed packages into ansible_facts.packages; the install
# task below uses them to skip hosts that already have the driver.
- name: Gather the package facts
  ansible.builtin.package_facts:
    manager: auto

# Install the driver RPM unless a package with the configured name is
# already present. The result is registered so that a later task can tell
# whether the initramfs has to be regenerated.
# NOTE(review): disable_gpg_check turns off the GPG signature check for
# this RPM -- confirm this is acceptable outside of a POC.
- name: Install nvidia driver RPM either from path or URL
  when: cifmw_edpm_nvidia_mdev_prepare_package_name not in ansible_facts.packages
  become: true
  ansible.builtin.dnf:
    name: "{{ cifmw_edpm_nvidia_mdev_prepare_driver_url }}"
    state: present
    disable_gpg_check: true
  register: _nvidia_driver_install

# Rebuild the initramfs and the GRUB configuration, but only when the
# nouveau blacklist was just written or the driver was just installed.
# The GRUB configuration is written to /boot/grub2/grub.cfg: that path
# exists on both BIOS and UEFI hosts, whereas
# /boot/efi/EFI/redhat/grub.cfg is missing on BIOS hosts and is the EFI
# wrapper that RHEL 9 documentation says not to overwrite.
- name: Regenerate initramfs
  become: true
  ansible.builtin.command:
    cmd: "{{ item }}"
  loop:
    - 'dracut --force'
    - 'grub2-mkconfig -o /boot/grub2/grub.cfg'
  when: _blacklist_nouveau is changed or _nvidia_driver_install is changed
  # Both commands rewrite boot artefacts every time they run.
  changed_when: true

# Flag the host as needing a reboot, then reboot it unconditionally.
# Privilege escalation is set once on the block so that it also covers
# the final reboot task, which previously ran without `become`.
- name: Enforce a reboot to ensure that we have the driver loaded
  become: true
  block:
    - name: Create directory required by edpm-reboot role
      ansible.builtin.file:
        path: /var/lib/openstack/reboot_required/
        state: directory
        mode: "0755"

    # `state: touch` creates the marker file, or refreshes its timestamps
    # when it already exists.
    - name: Create required file to enforce a reboot
      ansible.builtin.file:
        path: /var/lib/openstack/reboot_required/nvidia_mdev_reboot
        state: touch
        mode: "0600"

    - name: Call edpm_reboot role
      # Since the EDPM role isn't installed, we can't call it
      # Removing it for now
      # ansible.builtin.include_role:
      #   name: edpm_reboot
      # Instead, use a regular reboot
      ansible.builtin.reboot:
Loading

0 comments on commit 23da455

Please sign in to comment.