Merge pull request #157 from huggingface/main
Merge changes
Skquark authored May 9, 2024
2 parents bbbc31e + caf9e98 commit d074b67
Showing 145 changed files with 4,532 additions and 2,034 deletions.
50 changes: 25 additions & 25 deletions .github/workflows/nightly_tests.yml
@@ -19,7 +19,7 @@ env:
jobs:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines Matrix
- runs-on: ubuntu-latest
+ runs-on: diffusers/diffusers-pytorch-cpu
outputs:
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
steps:
@@ -67,19 +67,19 @@ jobs:
fetch-depth: 2
- name: NVIDIA-SMI
run: nvidia-smi

- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
- - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
+ - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -88,9 +88,9 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
- --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
+ --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
tests/pipelines/${{ matrix.module }}
- name: Failure short reports
if: ${{ failure() }}
run: |
@@ -103,7 +103,7 @@ jobs:
with:
name: pipeline_${{ matrix.module }}_test_reports
path: reports

- name: Generate Report and Notify Channel
if: always()
run: |
@@ -112,7 +112,7 @@ jobs:
run_nightly_tests_for_other_torch_modules:
name: Torch Non-Pipelines CUDA Nightly Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -139,7 +139,7 @@ jobs:
run: python utils/print_env.py

- name: Run nightly PyTorch CUDA tests for non-pipeline modules
- if: ${{ matrix.module != 'examples'}}
+ if: ${{ matrix.module != 'examples'}}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -148,7 +148,7 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_${{ matrix.module }}_cuda \
- --report-log=tests_torch_${{ matrix.module }}_cuda.log \
+ --report-log=tests_torch_${{ matrix.module }}_cuda.log \
tests/${{ matrix.module }}
- name: Run nightly example tests with Torch
@@ -161,13 +161,13 @@ jobs:
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v --make-reports=examples_torch_cuda \
- --report-log=examples_torch_cuda.log \
+ --report-log=examples_torch_cuda.log \
examples/
- name: Failure short reports
if: ${{ failure() }}
run: |
- cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
+ cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt
cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt
- name: Test suite reports artifacts
@@ -185,7 +185,7 @@ jobs:
run_lora_nightly_tests:
name: Nightly LoRA Tests with PEFT and TORCH
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
@@ -218,13 +218,13 @@ jobs:
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_lora_cuda \
- --report-log=tests_torch_lora_cuda.log \
+ --report-log=tests_torch_lora_cuda.log \
tests/lora
- name: Failure short reports
if: ${{ failure() }}
run: |
- cat reports/tests_torch_lora_cuda_stats.txt
+ cat reports/tests_torch_lora_cuda_stats.txt
cat reports/tests_torch_lora_cuda_failures_short.txt
- name: Test suite reports artifacts
@@ -239,12 +239,12 @@ jobs:
run: |
pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
run_flax_tpu_tests:
name: Nightly Flax TPU Tests
runs-on: docker-tpu
if: github.event_name == 'schedule'

container:
image: diffusers/diffusers-flax-tpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
@@ -274,7 +274,7 @@ jobs:
python -m pytest -n 0 \
-s -v -k "Flax" \
--make-reports=tests_flax_tpu \
- --report-log=tests_flax_tpu.log \
+ --report-log=tests_flax_tpu.log \
tests/
- name: Failure short reports
@@ -298,11 +298,11 @@ jobs:
run_nightly_onnx_tests:
name: Nightly ONNXRuntime CUDA tests on Ubuntu
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-onnxruntime-cuda
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

steps:
- name: Checkout diffusers
uses: actions/checkout@v3
@@ -321,15 +321,15 @@ jobs:
- name: Environment
run: python utils/print_env.py

- name: Run nightly ONNXRuntime CUDA tests
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "Onnx" \
--make-reports=tests_onnx_cuda \
- --report-log=tests_onnx_cuda.log \
+ --report-log=tests_onnx_cuda.log \
tests/
- name: Failure short reports
@@ -344,7 +344,7 @@ jobs:
with:
name: ${{ matrix.config.report }}_test_reports
path: reports

- name: Generate Report and Notify Channel
if: always()
run: |
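Note: the recurring edit in this file replaces GitHub-hosted runners (ubuntu-latest, docker-gpu) with lists of labels targeting self-hosted machines; a job is scheduled only on a runner carrying every listed label. A minimal sketch of the pattern, with an illustrative job name not taken from this diff:

    jobs:
      gpu_smoke_test:
        # All four labels must match a registered self-hosted runner.
        runs-on: [single-gpu, nvidia-gpu, t4, ci]
        container:
          image: diffusers/diffusers-pytorch-cuda
          options: --gpus 0 --shm-size "16gb" --ipc host
        steps:
          - name: NVIDIA-SMI
            run: nvidia-smi  # confirms the GPU is visible inside the container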
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_fetcher.yml
Original file line number Diff line number Diff line change
@@ -15,7 +15,7 @@ concurrency:
jobs:
setup_pr_tests:
name: Setup PR Tests
- runs-on: docker-cpu
+ runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -73,7 +73,7 @@ jobs:
max-parallel: 2
matrix:
modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
- runs-on: docker-cpu
+ runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
container:
image: diffusers/diffusers-pytorch-cpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -123,7 +123,7 @@ jobs:
config:
- name: Hub tests for models, schedulers, and pipelines
framework: hub_tests_pytorch
- runner: docker-cpu
+ runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
image: diffusers/diffusers-pytorch-cpu
report: torch_hub

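Note: both jobs in this file consume a test matrix computed at runtime by setup_pr_tests and read back via fromJson(needs.setup_pr_tests.outputs.matrix). A sketch of that two-job pattern, with illustrative output and module names:

    jobs:
      setup_pr_tests:
        runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
        outputs:
          matrix: ${{ steps.set_matrix.outputs.matrix }}
        steps:
          - id: set_matrix
            # Job outputs are plain strings, so emit a JSON array for fromJson to parse.
            run: echo 'matrix=["models", "schedulers"]' >> "$GITHUB_OUTPUT"
      run_pr_tests:
        needs: setup_pr_tests
        runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
        strategy:
          matrix:
            modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
        steps:
          - run: echo "testing ${{ matrix.modules }}"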
43 changes: 22 additions & 21 deletions .github/workflows/push_tests.yml
Original file line number Diff line number Diff line change
@@ -21,22 +21,23 @@ env:
jobs:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines CUDA Slow Tests Matrix
- runs-on: ubuntu-latest
+ runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
+ container:
+ image: diffusers/diffusers-pytorch-cpu
outputs:
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- - name: Set up Python
- uses: actions/setup-python@v4
- with:
- python-version: "3.8"
- name: Install dependencies
run: |
- pip install -e .
- pip install huggingface_hub
+ python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+ python -m uv pip install -e [quality,test]
+ - name: Environment
+ run: |
+ python utils/print_env.py
- name: Fetch Pipeline Matrix
id: fetch_pipeline_matrix
run: |
@@ -60,7 +61,7 @@ jobs:
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --privileged
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
@@ -114,10 +115,10 @@ jobs:

torch_cuda_tests:
name: Torch CUDA Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
@@ -166,10 +167,10 @@ jobs:

peft_cuda_tests:
name: PEFT CUDA Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-pytorch-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
@@ -219,7 +220,7 @@ jobs:
runs-on: docker-tpu
container:
image: diffusers/diffusers-flax-tpu
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
defaults:
run:
shell: bash
@@ -263,10 +264,10 @@ jobs:

onnx_cuda_tests:
name: ONNX CUDA Tests
- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: diffusers/diffusers-onnxruntime-cuda
- options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+ options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
@@ -311,11 +312,11 @@ jobs:
run_torch_compile_tests:
name: PyTorch Compile CUDA tests

- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]

container:
image: diffusers/diffusers-pytorch-compile-cuda
- options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
- name: Checkout diffusers
@@ -352,11 +353,11 @@ jobs:
run_xformers_tests:
name: PyTorch xformers CUDA tests

- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]

container:
image: diffusers/diffusers-pytorch-xformers-cuda
- options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
- name: Checkout diffusers
@@ -393,11 +394,11 @@ jobs:
run_examples_tests:
name: Examples PyTorch CUDA tests on Ubuntu

- runs-on: docker-gpu
+ runs-on: [single-gpu, nvidia-gpu, t4, ci]

container:
image: diffusers/diffusers-pytorch-cuda
- options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
- name: Checkout diffusers
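Note: alongside the runner swap, every container in this file now mounts the host's Hugging Face cache from /mnt/cache/.cache/huggingface instead of /mnt/hf_cache, so downloaded checkpoints persist across CI runs. The container options correspond roughly to this docker invocation (a sketch; HF_HOME=/mnt/cache matches the env block in ssh-runner.yml below):

    docker run --gpus 0 --shm-size "16gb" --ipc host \
      -v /mnt/cache/.cache/huggingface:/mnt/cache/ \
      -e HF_HOME=/mnt/cache \
      diffusers/diffusers-pytorch-cuda \
      python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile tests/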
46 changes: 46 additions & 0 deletions .github/workflows/ssh-runner.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
name: SSH into runners

on:
workflow_dispatch:
inputs:
runner_type:
description: 'Type of runner to test (a10 or t4)'
required: true
docker_image:
description: 'Name of the Docker image'
required: true

env:
IS_GITHUB_CI: "1"
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
HF_HOME: /mnt/cache
DIFFUSERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
RUN_SLOW: yes

jobs:
ssh_runner:
name: "SSH"
runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
container:
image: ${{ github.event.inputs.docker_image }}
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Tailscale # In order to be able to SSH when a test fails
uses: huggingface/tailscale-action@v1
with:
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
waitForSSH: true
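Note: because the new workflow triggers only on workflow_dispatch, it must be started manually, e.g. with the GitHub CLI (the image name below is just one of the images used elsewhere in this diff):

    gh workflow run ssh-runner.yml \
      -f runner_type=t4 \
      -f docker_image=diffusers/diffusers-pytorch-cuda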