gpu: Support GPU passthrough to LXD containers using Container Device Interface (CDI) #13562

Merged
16 commits:
- 061d5a8 add a new dependency for github.com/NVIDIA/nvidia-container-toolkit (gabrielmougard, Jun 6, 2024)
- 1b1c20d lxd/device/cdi: Introduce base CDI types and `ToCDI` (gabrielmougard, Jun 6, 2024)
- 8548e7c lxd/device/cdi: Add unit tests for the CDI identifier parser (gabrielmougard, Jul 26, 2024)
- 59fdc8f lxd/device/cdi: Introduce a `CDILogger` type (gabrielmougard, Jun 20, 2024)
- ba948c5 lxd/device/cdi: Add the NVIDIA CDI spec generator (gabrielmougard, Jun 6, 2024)
- 58c7a88 lxd/device/cdi: Add base CDI Hook types (gabrielmougard, Jun 6, 2024)
- da314e6 lxd/device/gpu: Augment the capabilities of the `id` GPU option to su… (gabrielmougard, Jun 6, 2024)
- 41b8840 lxd/device/cdi: Add CDI translation logic (OCI spec -> unix device + … (gabrielmougard, Jul 29, 2024)
- 582127e lxd/device/gpu_physical: more explicit output values for `deviceNumSt… (gabrielmougard, Aug 6, 2024)
- e6944c4 lxd/device/gpu_physical: Configure a GPU (physical) if CDI detected (gabrielmougard, Jul 29, 2024)
- 37b4cd3 lxd/instance/drivers: Make `removeUnixDevices` a driver_common function (gabrielmougard, Aug 7, 2024)
- 650879b lxd/instance/drivers/driver_lxc: Configure LXC to use LXD's `startmou… (gabrielmougard, Jun 6, 2024)
- 1ce1973 lxd: Setup the `startmountns` callhook command to react to `lxc.hook… (gabrielmougard, Jul 28, 2024)
- db5ee3e lxd: Remove useless `break` statement in main_callhook command (gabrielmougard, Aug 7, 2024)
- 3730250 lxd-metadata: update metadata (gabrielmougard, Jun 6, 2024)
- 2421c34 doc: Add documentation on how to add a GPU with CDI mode (gabrielmougard, Jun 21, 2024)
12 changes: 12 additions & 0 deletions doc/.custom_wordlist.txt
@@ -25,6 +25,7 @@ BPF
 Btrfs
 bugfix
 bugfixes
+CDI
 CentOS
 Ceph
 CephFS
@@ -46,6 +47,7 @@ CSM
 CSV
 CUDA
 dataset
+dGPU
 DCO
 dereferenced
 DHCP
@@ -96,6 +98,8 @@ idmap
 idmapped
 idmaps
 IdP
+iGPU
+iGPUs
 incrementing
 InfiniBand
 init
@@ -139,6 +143,7 @@ MicroCloud
 MII
 MinIO
 MITM
+MNIST
 MTU
 Mullvad
 multicast
@@ -152,7 +157,10 @@ NIC
 NICs
 NUMA
 NVMe
+NVML
 NVRAM
+NVIDIA
+OCI
 OData
 OIDC
 OpenFGA
@@ -207,6 +215,7 @@ SATA
 scalable
 scriptlet
 SDC
+SDK
 SDN
 SDS
 SDT
@@ -223,6 +232,7 @@ SKBPRIO
 SLAAC
 SMTP
 Snapcraft
+SoC
 Solaris
 SPAs
 SPL
@@ -256,6 +266,8 @@ sysfs
 syslog
 Tbit
 TCP
+TensorRT
+Tegra
 TiB
 Tibit
 TinyPNG
152 changes: 152 additions & 0 deletions doc/howto/container_gpu_passthrough_with_docker.md
@@ -0,0 +1,152 @@
(container-gpu-passthrough-with-docker)=
# How to pass an NVIDIA GPU to a container

If you have an NVIDIA GPU (either discrete (dGPU) or integrated (iGPU)) and you want to pass the runtime libraries and configuration installed on your host to your container, you should add a {ref}`LXD GPU device <devices-gpu>`.
Consider the following scenario:

Your host is an NVIDIA single-board computer that has a Tegra SoC with an iGPU, and you have the Tegra SDK installed on the host. You want to create a LXD container and run an application inside it, using the iGPU as a compute backend, with the application itself running inside a Docker container (or another OCI-compliant runtime).
To achieve this, complete the following steps:

1. Running a Docker container inside a LXD container can consume a lot of disk space if the outer container is not well configured. Here are two options to limit the disk space consumed:

   - Either create a Btrfs storage pool to back the LXD container, so that Docker does not fall back to its very space-inefficient VFS storage driver, then initialize the LXD container with {config:option}`instance-security:security.nesting` enabled (needed for running a Docker container inside a LXD container) and with the Btrfs storage pool:

lxc storage create p1 btrfs size=15GiB
lxc init ubuntu:24.04 t1 --config security.nesting=true -s p1

   - Or use the `overlayFS` storage driver in Docker, which requires the following syscall interceptions in addition to {config:option}`instance-security:security.nesting` being enabled:

lxc init ubuntu:24.04 t1 --config security.nesting=true --config security.syscalls.intercept.mknod=true --config security.syscalls.intercept.setxattr=true
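   Whichever option you choose, once Docker is installed inside the container (via the `cloud-init` script shown below), you can confirm which storage driver Docker selected. For example, assuming the container is named `t1` as above:

       lxc exec t1 -- docker info --format '{{.Driver}}'

   The reported driver depends on the backing file system and the Docker version.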

1. Add the GPU device to your container:

- If you want to do an iGPU pass-through:

lxc config device add t1 igpu0 gpu gputype=physical id=nvidia.com/igpu=0

- If you want to do a dGPU pass-through:

lxc config device add t1 gpu0 gpu gputype=physical id=nvidia.com/gpu=0

With the device added, you can run a basic [MNIST](https://en.wikipedia.org/wiki/MNIST_database) inference job inside the LXD container.

1. Create a `cloud-init` script that installs the Docker runtime, the [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit), and a script to run a test [TensorRT](https://github.com/NVIDIA/TensorRT) workload:

#cloud-config
package_update: true
write_files:
  # `run_tensorrt.sh` compiles the TensorRT sample applications and runs the
  # `sample_onnx_mnist` program, which loads an ONNX model into the TensorRT
  # inference runtime and executes a digit recognition job.
  - path: /root/run_tensorrt.sh
    permissions: "0755"
    owner: root:root
    content: |
      #!/bin/bash
      echo "OS release,Kernel version"
      (. /etc/os-release; echo "${PRETTY_NAME}"; uname -r) | paste -s -d,
      echo
      nvidia-smi -q
      echo
      exec bash -o pipefail -c "
      cd /workspace/tensorrt/samples
      make -j4
      cd /workspace/tensorrt/bin
      ./sample_onnx_mnist
      retstatus=\${PIPESTATUS[0]}
      echo \"Test exited with status code: \${retstatus}\" >&2
      exit \${retstatus}
      "
runcmd:
  # Install Docker to run the AI workload.
  - curl -fsSL https://get.docker.com -o install-docker.sh
  - sh install-docker.sh --version 24.0
  # Install the NVIDIA Container Toolkit as explained in the official documentation:
  # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt
  - curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
  - curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed -e 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' -e '/experimental/ s/^#//g' | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
  # Now that a new APT source and key were added, update the package definitions.
  - apt-get update
  # Install the NVIDIA Container Toolkit packages.
  - DEBIAN_FRONTEND=noninteractive apt-get install -y nvidia-container-toolkit
  # Finally, configure Docker (our container runtime) to use the NVIDIA runtime.
  - nvidia-ctk runtime configure --runtime=docker --config=/etc/docker/daemon.json
  - systemctl restart docker

1. Save the script as `cloud-init.yml` and apply it to your instance:

lxc config set t1 cloud-init.user-data - < cloud-init.yml

1. Start the instance:

lxc start t1

1. Wait for the `cloud-init` process to finish:

lxc exec t1 -- cloud-init status --wait

1. Once `cloud-init` is finished, open a shell in the instance:

lxc exec t1 -- bash

1. Configure the NVIDIA container runtime to not use `cgroups`:

sudo nvidia-ctk config --in-place --set nvidia-container-cli.no-cgroups

1. If you use an iGPU and your NVIDIA container runtime does not automatically enable CSV mode (needed for NVIDIA Tegra boards), enable it manually:

sudo nvidia-ctk config --in-place --set nvidia-container-runtime.mode=csv
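   These settings are persisted in the NVIDIA container runtime configuration file (typically `/etc/nvidia-container-runtime/config.toml`). After running the commands above, the relevant entries should look roughly like the following (surrounding keys omitted):

       [nvidia-container-cli]
       no-cgroups = true

       [nvidia-container-runtime]
       mode = "csv"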

1. Now, run the inference workload with Docker:

- If you set up a dGPU pass-through:

docker run --gpus all --runtime nvidia --rm -v $(pwd):/sh_input nvcr.io/nvidia/tensorrt:24.02-py3 bash /sh_input/run_tensorrt.sh

- If you set up an iGPU pass-through:

docker run --gpus all --runtime nvidia --rm -v $(pwd):/sh_input nvcr.io/nvidia/tensorrt:24.02-py3-igpu bash /sh_input/run_tensorrt.sh

In the end, you should see output like the following:

@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@= ++++#++=*@@@@@
@@@@@@@@#. *@@@@@
@@@@@@@@= *@@@@@
@@@@@@@@. .. ...****%@@@@@
@@@@@@@@: .%@@#@@@@@@@@@@@@@
@@@@@@@% -@@@@@@@@@@@@@@@@@
@@@@@@@% -@@*@@@*@@@@@@@@@@
@@@@@@@# :#- ::. ::=@@@@@@@
@@@@@@@- -@@@@@@
@@@@@@%. *@@@@@
@@@@@@# :==*+== *@@@@@
@@@@@@%---%%@@@@@@@. *@@@@@
@@@@@@@@@@@@@@@@@@@+ *@@@@@
@@@@@@@@@@@@@@@@@@@= *@@@@@
@@@@@@@@@@@@@@@@@@* *@@@@@
@@@@@%+%@@@@@@@@%. .%@@@@@
@@@@@* .******= -@@@@@@@
@@@@@* .#@@@@@@@
@@@@@* =%@@@@@@@@
@@@@@@%#+++= =@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@

[07/31/2024-13:19:21] [I] Output:
[07/31/2024-13:19:21] [I] Prob 0 0.0000 Class 0:
[07/31/2024-13:19:21] [I] Prob 1 0.0000 Class 1:
[07/31/2024-13:19:21] [I] Prob 2 0.0000 Class 2:
[07/31/2024-13:19:21] [I] Prob 3 0.0000 Class 3:
[07/31/2024-13:19:21] [I] Prob 4 0.0000 Class 4:
[07/31/2024-13:19:21] [I] Prob 5 1.0000 Class 5: **********
[07/31/2024-13:19:21] [I] Prob 6 0.0000 Class 6:
[07/31/2024-13:19:21] [I] Prob 7 0.0000 Class 7:
[07/31/2024-13:19:21] [I] Prob 8 0.0000 Class 8:
[07/31/2024-13:19:21] [I] Prob 9 0.0000 Class 9:
[07/31/2024-13:19:21] [I]
&&&& PASSED TensorRT.sample_onnx_mnist [TensorRT v8603] # ./sample_onnx_mnist
10 changes: 10 additions & 0 deletions doc/instances.md
@@ -57,6 +57,16 @@ How to import instances:
 :diataxis:Migrate from LXC </howto/migrate_from_lxc>
 ```

+```{only} diataxis
+How to pass an NVIDIA GPU to a container with a Docker workload:
+```
+
+```{filtered-toctree}
+:titlesonly:
+
+:diataxis:Pass NVIDIA GPUs </howto/container_gpu_passthrough_with_docker>
+```
+
 ## Related topics

 ```{only} diataxis
8 changes: 7 additions & 1 deletion doc/metadata.txt
@@ -275,9 +275,15 @@ You can omit the `MIG-` prefix when specifying this option.
```

```{config:option} id device-gpu-physical-device-conf
-:shortdesc: "DRM card ID of the GPU device"
+:shortdesc: "ID of the GPU device"
:type: "string"
The ID can either be the DRM card ID of the GPU device (container or VM) or a fully-qualified Container Device Interface (CDI) name (container only).
Here are some examples of fully-qualified CDI names:

- `nvidia.com/gpu=0`: Instructs LXD to pass through the first discovered NVIDIA discrete GPU (dGPU) on your system. You can use the `nvidia-smi` tool on your host to find out which identifier to use.
- `nvidia.com/gpu=1833c8b5-9aa0-5382-b784-68b7e77eb185`: Instructs LXD to pass through the NVIDIA dGPU with the given unique identifier. This identifier should also appear in the output of `nvidia-smi -L`.
- `nvidia.com/igpu=all`: Instructs LXD to pass through all NVIDIA integrated GPUs (iGPUs) on the host. The concept of an index does not currently map to iGPUs. You can list them with `nvidia-smi -L`; a special `nvgpu` mention in the generated list indicates that a device is an iGPU.
- `nvidia.com/gpu=all`: Instructs LXD to pass all the host GPUs of brand NVIDIA through to the container.
```
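For illustration, `nvidia-smi -L` on a host with a single dGPU prints something like the following (the device name here is invented, and the UUID is reused from the example above):

    GPU 0: NVIDIA RTX A6000 (UUID: GPU-1833c8b5-9aa0-5382-b784-68b7e77eb185)

The index (`0`) corresponds to the `nvidia.com/gpu=0` form, and the UUID to the unique-identifier form.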

```{config:option} mode device-gpu-physical-device-conf
16 changes: 16 additions & 0 deletions doc/reference/devices_gpu.md
@@ -53,6 +53,22 @@ Add a specific GPU from the host system as a `physical` GPU device to an instanc

See {ref}`instances-configure-devices` for more information.

#### CDI mode

Add a specific GPU from the host system as a `physical` GPU device to an instance using the [Container Device Interface](https://github.com/cncf-tags/container-device-interface) (CDI) notation through a fully-qualified CDI name:

lxc config device add <instance_name> <device_name> gpu gputype=physical id=<fully_qualified_CDI_name>

For example, add the first available NVIDIA discrete GPU on your system:

lxc config device add <instance_name> <device_name> gpu gputype=physical id=nvidia.com/gpu=0

If your machine has an NVIDIA iGPU (integrated GPU) located at index 0, you can add it like this:

lxc config device add <instance_name> <device_name> gpu gputype=physical id=nvidia.com/igpu=0
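As documented for the `id` option in the configuration reference, you can also pass through all NVIDIA GPUs on the host at once:

    lxc config device add <instance_name> <device_name> gpu gputype=physical id=nvidia.com/gpu=all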

For a complete example on how to use a GPU CDI pass-through, see {ref}`container-gpu-passthrough-with-docker`.

(gpu-mdev)=
## `gputype`: `mdev`

8 changes: 8 additions & 0 deletions go.mod
@@ -3,6 +3,7 @@ module github.com/canonical/lxd
 go 1.22.5

 require (
+	github.com/NVIDIA/nvidia-container-toolkit v1.16.1
 	github.com/Rican7/retry v0.3.1
 	github.com/armon/go-proxyproto v0.1.0
 	github.com/canonical/go-dqlite v1.22.0
@@ -64,9 +65,13 @@ require (
 	gopkg.in/tomb.v2 v2.0.0-20161208151619-d5d1b5820637
 	gopkg.in/yaml.v2 v2.4.0
 	k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
+	tags.cncf.io/container-device-interface v0.8.0
+	tags.cncf.io/container-device-interface/specs-go v0.8.0
 )

 require (
+	github.com/NVIDIA/go-nvlib v0.6.0 // indirect
+	github.com/NVIDIA/go-nvml v0.12.4-0 // indirect
 	github.com/antlr4-go/antlr/v4 v4.13.1 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/bmatcuk/doublestar/v4 v4.6.1 // indirect
@@ -117,6 +122,8 @@ require (
 	github.com/muhlemmer/httpforwarded v0.1.0 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/natefinch/wrap v0.2.0 // indirect
+	github.com/opencontainers/runtime-spec v1.2.0 // indirect
+	github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
 	github.com/pelletier/go-toml/v2 v2.2.2 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
@@ -158,4 +165,5 @@ require (
 	gopkg.in/ini.v1 v1.67.0 // indirect
 	gopkg.in/mgo.v2 v2.0.0-20190816093944-a6b53ec6cb22 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
+	sigs.k8s.io/yaml v1.4.0 // indirect
 )
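The two `tags.cncf.io/container-device-interface` modules define and validate the CDI specification format that the NVIDIA CDI spec generator produces. For orientation, a minimal hand-written CDI spec looks roughly like the following YAML; all paths and values here are illustrative, and a generated NVIDIA spec is much larger:

cdiVersion: "0.6.0"
kind: nvidia.com/gpu
# Per-device edits: what gets injected when nvidia.com/gpu=0 is requested.
devices:
  - name: "0"
    containerEdits:
      deviceNodes:
        - path: /dev/nvidia0
          type: c
# Global edits applied for any device of this kind.
containerEdits:
  deviceNodes:
    - path: /dev/nvidiactl
      type: c
  mounts:
    - hostPath: /usr/lib/x86_64-linux-gnu/libcuda.so.1
      containerPath: /usr/lib/x86_64-linux-gnu/libcuda.so.1
      options: ["ro", "bind"]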