From 72f0564dfd4af1e5b9ed66422b87e61fa9270b89 Mon Sep 17 00:00:00 2001 From: Mark Ericksen Date: Mon, 15 Apr 2024 17:12:22 -0600 Subject: [PATCH 1/3] working on updated Dockerfil --- Dockerfile | 82 ++++++++++++++++++++++++------------------------------ README.md | 41 +++++++++++++++++++++++++++ mix.lock | 8 +++--- 3 files changed, 81 insertions(+), 50 deletions(-) diff --git a/Dockerfile b/Dockerfile index 73181ef..d1672c6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,42 +1,46 @@ -# Find eligible builder and runner images on Docker Hub. We use Ubuntu/Debian -# instead of Alpine to avoid DNS resolution issues in production. +# Based on: # -# https://hub.docker.com/r/hexpm/elixir/tags?page=1&name=ubuntu -# https://hub.docker.com/_/ubuntu?tab=tags -# -# This file is based on these images: -# -# - https://hub.docker.com/r/hexpm/elixir/tags - for the build image -# - https://hub.docker.com/_/debian?tab=tags&page=1&name=bullseye-20230227-slim - for the release image -# - https://pkgs.org/ - resource for finding needed packages -# - Ex: hexpm/elixir:1.14.3-erlang-25.3-debian-bullseye-20230227-slim -# -ARG ELIXIR_VERSION=1.15.7 -ARG OTP_VERSION=26.1.2 -ARG DEBIAN_VERSION=bullseye-20230612-slim +# - https://hub.docker.com/r/hexpm/elixir/tags +# - https://hub.docker.com/r/nvidia/cuda/tags +# - https://github.com/livebook-dev/livebook/blob/main/docker/base/elixir-cuda.dockerfile +# - https://wiki.ubuntu.com/Releases + +ARG UBUNTU_VERSION=22.04 +ARG UBUNTU_NAMED_VERSION=jammy-20240227 +ARG CUDA_VERSION=12.4.1 +ARG ELIXIR_VERSION=1.16.2 +ARG ERLANG_VERSION=26.2.4 -ARG BUILDER_IMAGE="hexpm/elixir:${ELIXIR_VERSION}-erlang-${OTP_VERSION}-debian-${DEBIAN_VERSION}" -ARG RUNNER_IMAGE="debian:${DEBIAN_VERSION}" +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# NOTE: TRYING TO GET IT WORKING. DON'T KEEP "devel" VERSION FOR RUNTIME? 
+ARG BASE_CUDA_RUNTIME_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -FROM ${BUILDER_IMAGE} as builder +FROM hexpm/elixir:${ELIXIR_VERSION}-erlang-${ERLANG_VERSION}-ubuntu-${UBUNTU_NAMED_VERSION} AS elixir -# install build dependencies -RUN apt-get update -y && apt-get install -y build-essential git curl wget \ - && apt-get clean && rm -f /var/lib/apt/lists/*_* +FROM ${BASE_CUDA_DEV_CONTAINER} as builder -# Add the repository for the Nvidia CUDA -# Import the Nvidia repository GPG key -RUN apt update -q && apt install -y ca-certificates wget && \ - wget -qO /cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \ - dpkg -i /cuda-keyring.deb && apt update -q +RUN apt-get update && \ + apt-get install -y software-properties-common && \ + apt-get install -y build-essential git git curl wget cmake openssl libncurses5 locales && \ + apt-get clean && rm -f /var/lib/apt/lists/*_* +# Elixir: We copy the top-level directory first to preserve symlinks in /usr/local/bin +COPY --from=elixir /usr/local /usr/ELIXIR_LOCAL -# Install nvidia GPU support -RUN apt-get install -y cuda-nvcc-12-2 libcublas-12-2 libcudnn8 +RUN cp -r /usr/ELIXIR_LOCAL/lib/* /usr/local/lib && \ + cp -r /usr/ELIXIR_LOCAL/bin/* /usr/local/bin && \ + rm -rf /usr/ELIXIR_LOCAL # prepare build dir WORKDIR /app +# Set the locale +RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US:en +ENV LC_ALL en_US.UTF-8 + # install hex + rebar RUN mix local.hex --force && \ mix local.rebar --force @@ -79,32 +83,19 @@ RUN mix release # start a new build stage so that the final image will only contain # the compiled release and other runtime necessities -FROM ${RUNNER_IMAGE} +FROM ${BASE_CUDA_RUNTIME_CONTAINER} RUN apt-get update -y && \ apt-get install -y libstdc++6 openssl libncurses5 locales ca-certificates \ && apt-get clean && rm -f /var/lib/apt/lists/*_* -# Nvidia stuff -RUN apt 
update -q && apt install -y ca-certificates wget && \ - wget -qO /cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \ - dpkg -i /cuda-keyring.deb && apt update -q - - -# Nvidia support in runtime layer -RUN apt-get install -y --no-install-recommends cuda-nvcc-12-2 libcublas-12-2 libcudnn8 -# Copy over needed nvidia support -# COPY --from=builder /usr/local/bin/deviceQuery /usr/local/bin/deviceQuery - # Set the locale RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen - ENV LANG en_US.UTF-8 ENV LANGUAGE en_US:en ENV LC_ALL en_US.UTF-8 WORKDIR "/app" -# RUN chown nobody /app # set runner ENV ENV MIX_ENV="prod" @@ -113,12 +104,11 @@ ENV MIX_ENV="prod" ENV XLA_TARGET="cuda120" ENV BUMBLEBEE_CACHE_DIR="/data/cache/bumblebee" ENV XLA_CACHE_DIR="/data/cache/xla" -# NOTE: This seems to be causing a crash loop on boot. -# ENV ELIXIR_ERL_OPTIONS = "-proto_dist inet6_tcp +sssdio 128" + +ENV ECTO_IPV6 true +ENV ERL_AFLAGS "-proto_dist inet6_tcp" # Only copy the final release from the build stage COPY --from=builder /app/_build/${MIX_ENV}/rel/harness ./ -# USER nobody - CMD ["/app/bin/server"] diff --git a/README.md b/README.md index f960034..0f2e565 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,47 @@ To test and verify that you've successfully deployed the application to a machine with GPU access and that your application has all the necessary support for taking advantage of the GPU, do the following: +``` +$ fly ssh console +nvidia-smi +``` + +If the required NVidia libraries and hardware are in place, then the `nvidia-smi` tool should output a table with the information like this: + +``` ++---------------------------------------------------------------------------------------+ +| NVIDIA-SMI 545.23.08 Driver Version: 545.23.08 CUDA Version: 12.3 | +|-----------------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | 
Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+======================+======================| +| 0 NVIDIA A100-PCIE-40GB Off | 00000000:00:06.0 Off | On | +| N/A 38C P0 39W / 250W | 0MiB / 40960MiB | N/A Default | +| | | Enabled | ++-----------------------------------------+----------------------+----------------------+ + ++---------------------------------------------------------------------------------------+ +| MIG devices: | ++------------------+--------------------------------+-----------+-----------------------+ +| GPU GI CI MIG | Memory-Usage | Vol| Shared | +| ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG | +| | | ECC| | +|==================+================================+===========+=======================| +| No MIG devices found | ++---------------------------------------------------------------------------------------+ + ++---------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=======================================================================================| +| No running processes found | ++---------------------------------------------------------------------------------------+ +``` + +The next layer to test is that Elixir has access to the GPU. 
For that, run the following: + ``` $ fly ssh console # bin/harness remote diff --git a/mix.lock b/mix.lock index 22a21f1..42c5923 100644 --- a/mix.lock +++ b/mix.lock @@ -1,15 +1,15 @@ %{ "axon": {:hex, :axon, "0.6.1", "1d042fdba1c1b4413a3d65800524feebd1bc8ed218f8cdefe7a97510c3f427f3", [:mix], [{:kino, "~> 0.7", [hex: :kino, repo: "hexpm", optional: true]}, {:kino_vega_lite, "~> 0.1.7", [hex: :kino_vega_lite, repo: "hexpm", optional: true]}, {:nx, "~> 0.6.0 or ~> 0.7.0", [hex: :nx, repo: "hexpm", optional: false]}, {:polaris, "~> 0.1", [hex: :polaris, repo: "hexpm", optional: false]}, {:table_rex, "~> 3.1.1", [hex: :table_rex, repo: "hexpm", optional: true]}], "hexpm", "d6b0ae2f0dd284f6bf702edcab71e790d6c01ca502dd06c4070836554f5a48e1"}, "bumblebee": {:hex, :bumblebee, "0.5.3", "151c215fd6014958dbfc322fe5f31b44d170293f69cfdca419936c81e39b1f64", [:mix], [{:axon, "~> 0.6.1", [hex: :axon, repo: "hexpm", optional: false]}, {:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:jason, "~> 1.4.0", [hex: :jason, repo: "hexpm", optional: false]}, {:nx, "~> 0.7.0", [hex: :nx, repo: "hexpm", optional: false]}, {:nx_image, "~> 0.1.0", [hex: :nx_image, repo: "hexpm", optional: false]}, {:nx_signal, "~> 0.2.0", [hex: :nx_signal, repo: "hexpm", optional: false]}, {:progress_bar, "~> 3.0", [hex: :progress_bar, repo: "hexpm", optional: false]}, {:safetensors, "~> 0.1.3", [hex: :safetensors, repo: "hexpm", optional: false]}, {:tokenizers, "~> 0.4", [hex: :tokenizers, repo: "hexpm", optional: false]}, {:unpickler, "~> 0.1.0", [hex: :unpickler, repo: "hexpm", optional: false]}, {:unzip, "~> 0.10.0", [hex: :unzip, repo: "hexpm", optional: false]}], "hexpm", "5518f11e424c431a9cbedc80e0d26525368f0b6e50572a674ff247ec3b26bdd7"}, - "castore": {:hex, :castore, "1.0.5", "9eeebb394cc9a0f3ae56b813459f990abb0a3dedee1be6b27fdb50301930502f", [:mix], [], "hexpm", "8d7c597c3e4a64c395980882d4bca3cebb8d74197c590dc272cfd3b6a6310578"}, + "castore": {:hex, :castore, "1.0.6", 
"ffc42f110ebfdafab0ea159cd43d31365fa0af0ce4a02ecebf1707ae619ee727", [:mix], [], "hexpm", "374c6e7ca752296be3d6780a6d5b922854ffcc74123da90f2f328996b962d33a"}, "complex": {:hex, :complex, "0.5.0", "af2d2331ff6170b61bb738695e481b27a66780e18763e066ee2cd863d0b1dd92", [:mix], [], "hexpm", "2683bd3c184466cfb94fad74cbfddfaa94b860e27ad4ca1bffe3bff169d91ef1"}, "cowboy": {:hex, :cowboy, "2.10.0", "ff9ffeff91dae4ae270dd975642997afe2a1179d94b1887863e43f681a203e26", [:make, :rebar3], [{:cowlib, "2.12.1", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "3afdccb7183cc6f143cb14d3cf51fa00e53db9ec80cdcd525482f5e99bc41d6b"}, "cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"}, "cowlib": {:hex, :cowlib, "2.12.1", "a9fa9a625f1d2025fe6b462cb865881329b5caff8f1854d1cbc9f9533f00e1e1", [:make, :rebar3], [], "hexpm", "163b73f6367a7341b33c794c4e88e7dbfe6498ac42dcd69ef44c5bc5507c8db0"}, "decimal": {:hex, :decimal, "2.1.1", "5611dca5d4b2c3dd497dec8f68751f1f1a54755e8ed2a966c2633cf885973ad6", [:mix], [], "hexpm", "53cfe5f497ed0e7771ae1a475575603d77425099ba5faef9394932b35020ffcc"}, - "elixir_make": {:hex, :elixir_make, "0.7.8", "505026f266552ee5aabca0b9f9c229cbb496c689537c9f922f3eb5431157efc7", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:certifi, "~> 2.0", [hex: :certifi, repo: "hexpm", optional: true]}], "hexpm", "7a71945b913d37ea89b06966e1342c85cfe549b15e6d6d081e8081c493062c07"}, + "elixir_make": {:hex, :elixir_make, "0.8.3", "d38d7ee1578d722d89b4d452a3e36bcfdc644c618f0d063b874661876e708683", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: 
true]}, {:certifi, "~> 2.0", [hex: :certifi, repo: "hexpm", optional: true]}], "hexpm", "5c99a18571a756d4af7a4d89ca75c28ac899e6103af6f223982f09ce44942cc9"}, "esbuild": {:hex, :esbuild, "0.8.1", "0cbf919f0eccb136d2eeef0df49c4acf55336de864e63594adcea3814f3edf41", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "25fc876a67c13cb0a776e7b5d7974851556baeda2085296c14ab48555ea7560f"}, - "exla": {:hex, :exla, "0.7.0", "27fac40a580f0d3816fe3bf35c50dfc2f99597d26ac7e2aca4a3c62b89bb427f", [:make, :mix], [{:elixir_make, "~> 0.6", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:nx, "~> 0.7.0", [hex: :nx, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:xla, "~> 0.6.0", [hex: :xla, repo: "hexpm", optional: false]}], "hexpm", "d3bfc622deb52cec95efc9d76063891afc7cd33e38eddbb01f3385c53e043c40"}, + "exla": {:hex, :exla, "0.7.1", "790493288cf4441abed98df0c4e98da15a2e3a7fa27cd2a1f74ec0693952c579", [:make, :mix], [{:elixir_make, "~> 0.6", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:nx, "~> 0.7.1", [hex: :nx, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:xla, "~> 0.6.0", [hex: :xla, repo: "hexpm", optional: false]}], "hexpm", "ec9c1698a9a17b859d79f9b3c1d75c370335580cdd0353db9c2017f86155e2ec"}, "expo": {:hex, :expo, "0.5.2", "beba786aab8e3c5431813d7a44b828e7b922bfa431d6bfbada0904535342efe2", [:mix], [], "hexpm", "8c9bfa06ca017c9cb4020fabe980bc7fdb1aaec059fd004c2ab3bff03b1c599c"}, "file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"}, "finch": {:hex, :finch, "0.18.0", 
"944ac7d34d0bd2ac8998f79f7a811b21d87d911e77a786bc5810adb75632ada4", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.3", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.6 or ~> 1.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69f5045b042e531e53edc2574f15e25e735b522c37e2ddb766e15b979e03aa65"}, @@ -21,7 +21,7 @@ "mint": {:hex, :mint, "1.5.2", "4805e059f96028948870d23d7783613b7e6b0e2fb4e98d720383852a760067fd", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "d77d9e9ce4eb35941907f1d3df38d8f750c357865353e21d335bdcdf6d892a02"}, "nimble_options": {:hex, :nimble_options, "1.1.0", "3b31a57ede9cb1502071fade751ab0c7b8dbe75a9a4c2b5bbb0943a690b63172", [:mix], [], "hexpm", "8bbbb3941af3ca9acc7835f5655ea062111c9c27bcac53e004460dfd19008a99"}, "nimble_pool": {:hex, :nimble_pool, "1.0.0", "5eb82705d138f4dd4423f69ceb19ac667b3b492ae570c9f5c900bb3d2f50a847", [:mix], [], "hexpm", "80be3b882d2d351882256087078e1b1952a28bf98d0a287be87e4a24a710b67a"}, - "nx": {:hex, :nx, "0.7.0", "cec684cada356e9d268af01daa758882f7372aa952716dbe0369c657abb9e762", [:mix], [{:complex, "~> 0.5", [hex: :complex, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "68edaa48a5841495ecab0dd4cf7b11b2fc0ad809754ae7f82d9c4090b91acf55"}, + "nx": {:hex, :nx, "0.7.1", "5f6376e3d18408116e8a84b8f4ac851fb07dfe61764a5410ebf0b5dcb69c1b7e", [:mix], [{:complex, "~> 0.5", [hex: :complex, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: 
"hexpm", optional: false]}], "hexpm", "e3ddd6a3f2a9bac79c67b3933368c25bb5ec814a883fc68aba8fd8a236751777"}, "nx_image": {:hex, :nx_image, "0.1.2", "0c6e3453c1dc30fc80c723a54861204304cebc8a89ed3b806b972c73ee5d119d", [:mix], [{:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "9161863c42405ddccb6dbbbeae078ad23e30201509cc804b3b3a7c9e98764b81"}, "nx_signal": {:hex, :nx_signal, "0.2.0", "e1ca0318877b17c81ce8906329f5125f1e2361e4c4235a5baac8a95ee88ea98e", [:mix], [{:nx, "~> 0.6", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "7247e5e18a177a59c4cb5355952900c62fdeadeb2bad02a9a34237b68744e2bb"}, "phoenix": {:hex, :phoenix, "1.7.11", "1d88fc6b05ab0c735b250932c4e6e33bfa1c186f76dcf623d8dd52f07d6379c7", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.1", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:phoenix_template, "~> 1.0", [hex: :phoenix_template, repo: "hexpm", optional: false]}, {:phoenix_view, "~> 2.0", [hex: :phoenix_view, repo: "hexpm", optional: true]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.7", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:websock_adapter, "~> 0.5.3", [hex: :websock_adapter, repo: "hexpm", optional: false]}], "hexpm", "b1ec57f2e40316b306708fe59b92a16b9f6f4bf50ccfa41aa8c7feb79e0ec02a"}, From 365668d6cb252bd206abb572b2fb7b1c875b444e Mon Sep 17 00:00:00 2001 From: Mark Ericksen Date: Tue, 16 Apr 2024 13:33:19 -0600 Subject: [PATCH 2/3] dockerfile updates --- Dockerfile | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index d1672c6..c4a7812 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,9 +13,9 @@ ARG 
ERLANG_VERSION=26.2.4 # Target the CUDA build image ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -# NOTE: TRYING TO GET IT WORKING. DON'T KEEP "devel" VERSION FOR RUNTIME? -ARG BASE_CUDA_RUNTIME_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +ARG BASE_CUDA_RUNTIME_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} +# Use the Elixir container for our build tools FROM hexpm/elixir:${ELIXIR_VERSION}-erlang-${ERLANG_VERSION}-ubuntu-${UBUNTU_NAMED_VERSION} AS elixir FROM ${BASE_CUDA_DEV_CONTAINER} as builder @@ -105,9 +105,6 @@ ENV XLA_TARGET="cuda120" ENV BUMBLEBEE_CACHE_DIR="/data/cache/bumblebee" ENV XLA_CACHE_DIR="/data/cache/xla" -ENV ECTO_IPV6 true -ENV ERL_AFLAGS "-proto_dist inet6_tcp" - # Only copy the final release from the build stage COPY --from=builder /app/_build/${MIX_ENV}/rel/harness ./ From 64924d18d1b0c62e725eb6862cf8fbc965711a4b Mon Sep 17 00:00:00 2001 From: Mark Ericksen Date: Tue, 16 Apr 2024 13:33:51 -0600 Subject: [PATCH 3/3] readme troubleshooting updated --- README.md | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 0f2e565..c7ecee2 100644 --- a/README.md +++ b/README.md @@ -92,30 +92,27 @@ If the required NVidia libraries and hardware are in place, then the `nvidia-smi | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+======================+======================| -| 0 NVIDIA A100-PCIE-40GB Off | 00000000:00:06.0 Off | On | -| N/A 38C P0 39W / 250W | 0MiB / 40960MiB | N/A Default | -| | | Enabled | +| 0 NVIDIA A100-PCIE-40GB Off | 00000000:00:06.0 Off | 0 | +| N/A 34C P0 38W / 250W | 36735MiB / 40960MiB | 0% Default | +| | | Disabled | +-----------------------------------------+----------------------+----------------------+ -+---------------------------------------------------------------------------------------+ -| MIG devices: | -+------------------+--------------------------------+-----------+-----------------------+ -| GPU GI CI MIG | Memory-Usage | Vol| Shared | -| ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG | -| | | ECC| | -|==================+================================+===========+=======================| -| No MIG devices found | -+---------------------------------------------------------------------------------------+ - +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| -| No running processes found | +| 0 N/A N/A 492 C /app/erts-14.2.4/bin/beam.smp 0MiB | +---------------------------------------------------------------------------------------+ ``` +There should not be a "MIG" section above the "Processes" one. A [longer-term fix to fully reset the GPU configuration is coming](https://community.fly.io/t/getting-error-when-trying-to-run-llama2chatmodel-in-gpu-machine/19127). As a short-term workaround, run the following command and then restart or redeploy the application. + +``` +$ fly ssh console +nvidia-smi -mig 0 +``` + The next layer to test is that Elixir has access to the GPU. For that, run the following: ```