diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c0ae87521fdc..718c67731bd5 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -31,7 +31,6 @@ jobs: nvidia-smi - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install pandas peft diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 3d061f88c241..82ef885b240e 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -20,7 +20,7 @@ env: jobs: test-build-docker-images: - runs-on: ubuntu-latest + runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ] if: github.event_name == 'pull_request' steps: - name: Set up Docker Buildx @@ -50,7 +50,7 @@ jobs: if: steps.file_changes.outputs.all != '' build-and-push-docker-images: - runs-on: ubuntu-latest + runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ] if: github.event_name != 'pull_request' permissions: @@ -73,13 +73,13 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v3 - + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 - name: Login to Docker Hub uses: docker/login-action@v2 with: username: ${{ env.REGISTRY }} password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build and push uses: docker/build-push-action@v3 with: diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index d489da8e48eb..2f73c66de829 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -70,7 +70,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git @@ -131,7 +130,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git @@ -202,7 +200,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git @@ -262,7 +259,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git diff --git a/.github/workflows/pr_test_fetcher.yml b/.github/workflows/pr_test_fetcher.yml index 6a7da2cb73d0..4dbb118c6092 100644 --- a/.github/workflows/pr_test_fetcher.yml +++ b/.github/workflows/pr_test_fetcher.yml @@ -32,7 +32,6 @@ jobs: fetch-depth: 0 - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - name: Environment @@ -89,7 +88,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev 
libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pip install -e [quality,test] python -m pip install accelerate @@ -147,7 +145,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m pip install -e [quality,test] diff --git a/.github/workflows/pr_test_peft_backend.yml b/.github/workflows/pr_test_peft_backend.yml index c7a6ea4fb7c7..b4915a3bf4d2 100644 --- a/.github/workflows/pr_test_peft_backend.yml +++ b/.github/workflows/pr_test_peft_backend.yml @@ -32,9 +32,7 @@ jobs: python -m pip install --upgrade pip pip install .[quality] - name: Check quality - run: | - ruff check examples tests src utils scripts - ruff format examples tests src utils scripts --check + run: make quality - name: Check if failure if: ${{ failure() }} run: | @@ -53,7 +51,7 @@ jobs: run: | python -m pip install --upgrade pip pip install .[quality] - - name: Check quality + - name: Check repo consistency run: | python utils/check_copies.py python utils/check_dummies.py @@ -73,7 +71,7 @@ jobs: name: LoRA - ${{ matrix.lib-versions }} - runs-on: docker-cpu + runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ] container: image: diffusers/diffusers-pytorch-cpu @@ -91,11 +89,10 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] if [ "${{ matrix.lib-versions }}" == "main" ]; then - python -m uv pip install -U peft@git+https://github.com/huggingface/peft.git + python -m pip install -U peft@git+https://github.com/huggingface/peft.git python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git else @@ -110,7 +107,7 @@ jobs: - name: Run fast PyTorch LoRA CPU tests with PEFT backend run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ + python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \ -s -v \ --make-reports=tests_${{ matrix.config.report }} \ tests/lora/ diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 7ec4ffa713b8..b1bed6568aa4 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -40,9 +40,7 @@ jobs: python -m pip install --upgrade pip pip install .[quality] - name: Check quality - run: | - ruff check examples tests src utils scripts - ruff format examples tests src utils scripts --check + run: make quality - name: Check if failure if: ${{ failure() }} run: | @@ -61,7 +59,7 @@ jobs: run: | python -m pip install --upgrade pip pip install .[quality] - - name: Check quality + - name: Check repo consistency run: | python utils/check_copies.py python utils/check_dummies.py @@ -79,22 +77,22 @@ jobs: config: - name: Fast PyTorch Pipeline CPU tests framework: pytorch_pipelines - runner: docker-cpu + runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ] image: diffusers/diffusers-pytorch-cpu report: torch_cpu_pipelines - name: Fast PyTorch Models & Schedulers CPU tests framework: pytorch_models - runner: docker-cpu + runner: [ self-hosted, intel-cpu, 8-cpu, ci ] image: diffusers/diffusers-pytorch-cpu report: torch_cpu_models_schedulers - name: Fast Flax CPU tests framework: flax - runner: docker-cpu + runner: [ self-hosted, intel-cpu, 8-cpu, ci ] image: 
diffusers/diffusers-flax-cpu report: flax_cpu - name: PyTorch Example CPU tests framework: pytorch_examples - runner: docker-cpu + runner: [ self-hosted, intel-cpu, 8-cpu, ci ] image: diffusers/diffusers-pytorch-cpu report: torch_example_cpu @@ -118,7 +116,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate @@ -132,7 +129,7 @@ jobs: if: ${{ matrix.config.framework == 'pytorch_pipelines' }} run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx" \ --make-reports=tests_${{ matrix.config.report }} \ tests/pipelines @@ -141,7 +138,7 @@ jobs: if: ${{ matrix.config.framework == 'pytorch_models' }} run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx and not Dependency" \ --make-reports=tests_${{ matrix.config.report }} \ tests/models tests/schedulers tests/others @@ -150,7 +147,7 @@ jobs: if: ${{ matrix.config.framework == 'flax' }} run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Flax" \ --make-reports=tests_${{ matrix.config.report }} \ tests @@ -160,7 +157,7 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install peft - python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \ --make-reports=tests_${{ matrix.config.report }} \ examples @@ -183,7 +180,7 @@ jobs: config: - name: Hub tests for models, schedulers, and pipelines framework: hub_tests_pytorch - runner: docker-cpu + runner: [ self-hosted, intel-cpu, 8-cpu, ci ] image: diffusers/diffusers-pytorch-cpu report: torch_hub @@ -207,7 +204,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index 0a316c90dfed..a6cb123a7035 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -60,7 +60,7 @@ jobs: runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --privileged steps: - name: Checkout diffusers uses: actions/checkout@v3 @@ -69,9 +69,14 @@ jobs: - name: NVIDIA-SMI run: | nvidia-smi + - name: Tailscale + uses: huggingface/tailscale-action@v1 + with: + authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }} + slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }} + slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install 
accelerate@git+https://github.com/huggingface/accelerate.git @@ -88,6 +93,12 @@ jobs: -s -v -k "not Flax and not Onnx" \ --make-reports=tests_pipeline_${{ matrix.module }}_cuda \ tests/pipelines/${{ matrix.module }} + - name: Tailscale Wait + if: ${{ failure() || runner.debug == '1' }} + uses: huggingface/tailscale-action@v1 + with: + waitForSSH: true + authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }} - name: Failure short reports if: ${{ failure() }} run: | @@ -121,7 +132,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git @@ -171,11 +181,10 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git - python -m uv pip install peft@git+https://github.com/huggingface/peft.git + python -m pip install -U peft@git+https://github.com/huggingface/peft.git - name: Environment run: | @@ -222,7 +231,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git @@ -270,7 +278,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git @@ -430,4 +437,4 @@ jobs: uses: actions/upload-artifact@v2 with: name: examples_test_reports - path: reports \ No newline at end of file + path: reports diff --git a/.github/workflows/push_tests_fast.yml b/.github/workflows/push_tests_fast.yml index 6b01577041b2..7c50da7b5c34 100644 --- a/.github/workflows/push_tests_fast.yml +++ b/.github/workflows/push_tests_fast.yml @@ -29,22 +29,22 @@ jobs: config: - name: Fast PyTorch CPU tests on Ubuntu framework: pytorch - runner: docker-cpu + runner: [ self-hosted, intel-cpu, 8-cpu, ci ] image: diffusers/diffusers-pytorch-cpu report: torch_cpu - name: Fast Flax CPU tests on Ubuntu framework: flax - runner: docker-cpu + runner: [ self-hosted, intel-cpu, 8-cpu, ci ] image: diffusers/diffusers-flax-cpu report: flax_cpu - name: Fast ONNXRuntime CPU tests on Ubuntu framework: onnxruntime - runner: docker-cpu + runner: [ self-hosted, intel-cpu, 8-cpu, ci ] image: diffusers/diffusers-onnxruntime-cpu report: onnx_cpu - name: PyTorch Example CPU tests on Ubuntu framework: pytorch_examples - runner: docker-cpu + runner: [ self-hosted, intel-cpu, 8-cpu, ci ] image: diffusers/diffusers-pytorch-cpu report: torch_example_cpu @@ -68,7 +68,6 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] @@ -81,7 +80,7 @@ jobs: if: ${{ matrix.config.framework == 'pytorch' }} run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + python -m pytest -n 4 --max-worker-restart=0 
--dist=loadfile \ -s -v -k "not Flax and not Onnx" \ --make-reports=tests_${{ matrix.config.report }} \ tests/ @@ -90,7 +89,7 @@ jobs: if: ${{ matrix.config.framework == 'flax' }} run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Flax" \ --make-reports=tests_${{ matrix.config.report }} \ tests/ @@ -99,7 +98,7 @@ jobs: if: ${{ matrix.config.framework == 'onnxruntime' }} run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Onnx" \ --make-reports=tests_${{ matrix.config.report }} \ tests/ @@ -109,7 +108,7 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install peft - python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \ --make-reports=tests_${{ matrix.config.report }} \ examples diff --git a/.github/workflows/update_metadata.yml b/.github/workflows/update_metadata.yml new file mode 100644 index 000000000000..f91fa29a1ab9 --- /dev/null +++ b/.github/workflows/update_metadata.yml @@ -0,0 +1,30 @@ +name: Update Diffusers metadata + +on: + workflow_dispatch: + push: + branches: + - main + - update_diffusers_metadata* + +jobs: + update_metadata: + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -l {0} + + steps: + - uses: actions/checkout@v3 + + - name: Setup environment + run: | + pip install --upgrade pip + pip install datasets pandas + pip install .[torch] + + - name: Update metadata + env: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }} + run: | + python utils/update_metadata.py --commit_sha ${{ github.sha }} diff --git a/Makefile b/Makefile index c92285b48c71..9af2e8b1a5c9 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,7 @@ repo-consistency: quality: ruff check $(check_dirs) setup.py ruff format --check $(check_dirs) setup.py + doc-builder style src/diffusers docs/source --max_len 119 --check_only python utils/check_doc_toc.py # Format source code automatically and check is there are any problems left that need manual fixing @@ -55,6 +56,7 @@ extra_style_checks: style: ruff check $(check_dirs) setup.py --fix ruff format $(check_dirs) setup.py + doc-builder style src/diffusers docs/source --max_len 119 ${MAKE} autogenerate_code ${MAKE} extra_style_checks diff --git a/docker/diffusers-flax-cpu/Dockerfile b/docker/diffusers-flax-cpu/Dockerfile index 36d036e34e5f..005c0f9caacf 100644 --- a/docker/diffusers-flax-cpu/Dockerfile +++ b/docker/diffusers-flax-cpu/Dockerfile @@ -12,6 +12,7 @@ RUN apt update && \ curl \ ca-certificates \ libsndfile1-dev \ + libgl1 \ python3.8 \ python3-pip \ python3.8-venv && \ diff --git a/docker/diffusers-flax-tpu/Dockerfile b/docker/diffusers-flax-tpu/Dockerfile index 78d5f972a753..05ea22488ab9 100644 --- a/docker/diffusers-flax-tpu/Dockerfile +++ b/docker/diffusers-flax-tpu/Dockerfile @@ -12,6 +12,7 @@ RUN apt update && \ curl \ ca-certificates \ libsndfile1-dev \ + libgl1 \ python3.8 \ python3-pip \ python3.8-venv && \ diff --git a/docker/diffusers-onnxruntime-cpu/Dockerfile b/docker/diffusers-onnxruntime-cpu/Dockerfile index 0d032d91e5eb..b60b467b7485 100644 --- a/docker/diffusers-onnxruntime-cpu/Dockerfile +++ b/docker/diffusers-onnxruntime-cpu/Dockerfile @@ -12,6 +12,7 @@ RUN apt update && \ curl \ ca-certificates \ 
libsndfile1-dev \ + libgl1 \ python3.8 \ python3-pip \ python3.8-venv && \ diff --git a/docker/diffusers-onnxruntime-cuda/Dockerfile b/docker/diffusers-onnxruntime-cuda/Dockerfile index 34e611df7257..16a0d76460f4 100644 --- a/docker/diffusers-onnxruntime-cuda/Dockerfile +++ b/docker/diffusers-onnxruntime-cuda/Dockerfile @@ -12,6 +12,7 @@ RUN apt update && \ curl \ ca-certificates \ libsndfile1-dev \ + libgl1 \ python3.8 \ python3-pip \ python3.8-venv && \ diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 29bd65fb4dba..ea5d1a021c94 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -24,14 +24,12 @@ title: Tutorials - sections: - sections: - - local: using-diffusers/loading_overview - title: Overview - local: using-diffusers/loading - title: Load pipelines, models, and schedulers - - local: using-diffusers/schedulers - title: Load and compare different schedulers + title: Load pipelines - local: using-diffusers/custom_pipeline_overview title: Load community pipelines and components + - local: using-diffusers/schedulers + title: Load schedulers and models - local: using-diffusers/using_safetensors title: Load safetensors - local: using-diffusers/other-formats @@ -71,7 +69,7 @@ - local: using-diffusers/control_brightness title: Control image brightness - local: using-diffusers/weighted_prompts - title: Prompt weighting + title: Prompt techniques - local: using-diffusers/freeu title: Improve generation quality with FreeU title: Techniques @@ -86,6 +84,8 @@ title: Kandinsky - local: using-diffusers/controlnet title: ControlNet + - local: using-diffusers/t2i_adapter + title: T2I-Adapter - local: using-diffusers/shap-e title: Shap-E - local: using-diffusers/diffedit @@ -170,6 +170,8 @@ title: Token merging - local: optimization/deepcache title: DeepCache + - local: optimization/tgate + title: TGATE title: General optimizations - sections: - local: using-diffusers/stable_diffusion_jax_how_to @@ -280,6 +282,10 @@ title: ControlNet - local: api/pipelines/controlnet_sdxl title: ControlNet with Stable Diffusion XL + - local: api/pipelines/controlnetxs + title: ControlNet-XS + - local: api/pipelines/controlnetxs_sdxl + title: ControlNet-XS with Stable Diffusion XL - local: api/pipelines/dance_diffusion title: Dance Diffusion - local: api/pipelines/ddim @@ -358,7 +364,7 @@ - local: api/pipelines/stable_diffusion/ldm3d_diffusion title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler - local: api/pipelines/stable_diffusion/adapter - title: Stable Diffusion T2I-Adapter + title: T2I-Adapter - local: api/pipelines/stable_diffusion/gligen title: GLIGEN (Grounded Language-to-Image Generation) title: Stable Diffusion diff --git a/docs/source/en/api/pipelines/audioldm2.md b/docs/source/en/api/pipelines/audioldm2.md index b29bea96e324..ac4459c60706 100644 --- a/docs/source/en/api/pipelines/audioldm2.md +++ b/docs/source/en/api/pipelines/audioldm2.md @@ -20,7 +20,8 @@ The abstract of the paper is the following: *Although audio generation shares commonalities across different types of audio, such as speech, music, and sound effects, designing models for each type requires careful consideration of specific objectives and biases that can significantly differ from those of other types. To bring us closer to a unified perspective of audio generation, this paper proposes a framework that utilizes the same learning method for speech, music, and sound effect generation. 
Our framework introduces a general representation of audio, called "language of audio" (LOA). Any audio can be translated into LOA based on AudioMAE, a self-supervised pre-trained representation learning model. In the generation process, we translate any modalities into LOA by using a GPT-2 model, and we perform self-supervised audio generation learning with a latent diffusion model conditioned on LOA. The proposed framework naturally brings advantages such as in-context learning abilities and reusable self-supervised pretrained AudioMAE and latent diffusion models. Experiments on the major benchmarks of text-to-audio, text-to-music, and text-to-speech demonstrate state-of-the-art or competitive performance against previous approaches. Our code, pretrained model, and demo are available at [this https URL](https://audioldm.github.io/audioldm2).* -This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi). The original codebase can be found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2). +This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi) and [Nguyễn Công Tú Anh](https://github.com/tuanh123789). The original codebase can be +found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2). ## Tips @@ -36,6 +37,8 @@ See table below for details on the three checkpoints: | [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 350M | 1.1B | 1150k | | [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 750M | 1.5B | 1150k | | [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 350M | 1.1B | 665k | +| [audioldm2-gigaspeech](https://huggingface.co/anhnct/audioldm2_gigaspeech) | Text-to-speech | 350M | 1.1B |10k | +| [audioldm2-ljspeech](https://huggingface.co/anhnct/audioldm2_ljspeech) | Text-to-speech | 350M | 1.1B | | ### Constructing a prompt @@ -53,7 +56,7 @@ See table below for details on the three checkpoints: * The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation. * Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly. -The following example demonstrates how to construct good music generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example). +The following example demonstrates how to construct good music and speech generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example). diff --git a/examples/research_projects/controlnetxs/README.md b/docs/source/en/api/pipelines/controlnetxs.md similarity index 61% rename from examples/research_projects/controlnetxs/README.md rename to docs/source/en/api/pipelines/controlnetxs.md index 72ed91c01db2..2d4ae7b8ce46 100644 --- a/examples/research_projects/controlnetxs/README.md +++ b/docs/source/en/api/pipelines/controlnetxs.md @@ -1,3 +1,15 @@ + + # ControlNet-XS ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. 
It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results. @@ -12,5 +24,16 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️ + + +Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. + + + +## StableDiffusionControlNetXSPipeline +[[autodoc]] StableDiffusionControlNetXSPipeline + - all + - __call__ -> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. \ No newline at end of file +## StableDiffusionPipelineOutput +[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput diff --git a/examples/research_projects/controlnetxs/README_sdxl.md b/docs/source/en/api/pipelines/controlnetxs_sdxl.md similarity index 56% rename from examples/research_projects/controlnetxs/README_sdxl.md rename to docs/source/en/api/pipelines/controlnetxs_sdxl.md index d401c1e76698..31075c0ef96a 100644 --- a/examples/research_projects/controlnetxs/README_sdxl.md +++ b/docs/source/en/api/pipelines/controlnetxs_sdxl.md @@ -1,3 +1,15 @@ + + # ControlNet-XS with Stable Diffusion XL ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results. @@ -12,4 +24,22 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️ -> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. \ No newline at end of file + + +🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve! + + + + + +Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. 
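A rough usage sketch for the SDXL pipeline documented below. The adapter class (`ControlNetXSAdapter`), the checkpoint id, and the image URL are assumptions drawn from the current Diffusers ControlNet-XS integration rather than anything stated on this page, so verify them against the API reference that follows:

```py
import torch
import numpy as np
import cv2
from PIL import Image
from diffusers import StableDiffusionXLControlNetXSPipeline, ControlNetXSAdapter
from diffusers.utils import load_image

# Assumed checkpoint id for a canny-conditioned ControlNet-XS adapter
controlnet = ControlNetXSAdapter.from_pretrained(
    "UmerHA/Testing-ConrolNetXS-SDXL-canny", torch_dtype=torch.float16
)
pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

# Build a canny edge map to use as the control image
image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
)
edges = cv2.Canny(np.array(image), 100, 200)
canny_image = Image.fromarray(np.stack([edges] * 3, axis=-1))

out_image = pipe(
    "aerial view, a futuristic research complex, bright and modern",
    image=canny_image,
    controlnet_conditioning_scale=0.5,
).images[0]
```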
+ + + +## StableDiffusionXLControlNetXSPipeline +[[autodoc]] StableDiffusionXLControlNetXSPipeline + - all + - __call__ + +## StableDiffusionPipelineOutput +[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput diff --git a/docs/source/en/api/pipelines/stable_diffusion/adapter.md b/docs/source/en/api/pipelines/stable_diffusion/adapter.md index aa38e3d9741f..ca42fdc83984 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/adapter.md +++ b/docs/source/en/api/pipelines/stable_diffusion/adapter.md @@ -10,9 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Text-to-Image Generation with Adapter Conditioning - -## Overview +# T2I-Adapter [T2I-Adapter: Learning Adapters to Dig out More Controllable Ability for Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.08453) by Chong Mou, Xintao Wang, Liangbin Xie, Jian Zhang, Zhongang Qi, Ying Shan, Xiaohu Qie. @@ -24,236 +22,26 @@ The abstract of the paper is the following: This model was contributed by the community contributor [HimariO](https://github.com/HimariO) ❤️ . -## Available Pipelines: - -| Pipeline | Tasks | Demo -|---|---|:---:| -| [StableDiffusionAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning* | - -| [StableDiffusionXLAdapterPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py) | *Text-to-Image Generation with T2I-Adapter Conditioning on StableDiffusion-XL* | - - -## Usage example with the base model of StableDiffusion-1.4/1.5 - -In the following we give a simple example of how to use a *T2I-Adapter* checkpoint with Diffusers for inference based on StableDiffusion-1.4/1.5. -All adapters use the same pipeline. - - 1. Images are first converted into the appropriate *control image* format. - 2. The *control image* and *prompt* are passed to the [`StableDiffusionAdapterPipeline`]. - -Let's have a look at a simple example using the [Color Adapter](https://huggingface.co/TencentARC/t2iadapter_color_sd14v1). - -```python -from diffusers.utils import load_image, make_image_grid - -image = load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png") -``` - -![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png) - - -Then we can create our color palette by simply resizing it to 8 by 8 pixels and then scaling it back to original size. - -```python -from PIL import Image - -color_palette = image.resize((8, 8)) -color_palette = color_palette.resize((512, 512), resample=Image.Resampling.NEAREST) -``` - -Let's take a look at the processed image. 
- -![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_palette.png) - - -Next, create the adapter pipeline - -```py -import torch -from diffusers import StableDiffusionAdapterPipeline, T2IAdapter - -adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1", torch_dtype=torch.float16) -pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - adapter=adapter, - torch_dtype=torch.float16, -) -pipe.to("cuda") -``` - -Finally, pass the prompt and control image to the pipeline - -```py -# fix the random seed, so you will get the same result as the example -generator = torch.Generator("cuda").manual_seed(7) - -out_image = pipe( - "At night, glowing cubes in front of the beach", - image=color_palette, - generator=generator, -).images[0] -make_image_grid([image, color_palette, out_image], rows=1, cols=3) -``` - -![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_output.png) - -## Usage example with the base model of StableDiffusion-XL - -In the following we give a simple example of how to use a *T2I-Adapter* checkpoint with Diffusers for inference based on StableDiffusion-XL. -All adapters use the same pipeline. - - 1. Images are first downloaded into the appropriate *control image* format. - 2. The *control image* and *prompt* are passed to the [`StableDiffusionXLAdapterPipeline`]. - -Let's have a look at a simple example using the [Sketch Adapter](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0). - -```python -from diffusers.utils import load_image, make_image_grid - -sketch_image = load_image("https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png").convert("L") -``` - -![img](https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png) - -Then, create the adapter pipeline - -```py -import torch -from diffusers import ( - T2IAdapter, - StableDiffusionXLAdapterPipeline, - DDPMScheduler -) - -model_id = "stabilityai/stable-diffusion-xl-base-1.0" -adapter = T2IAdapter.from_pretrained("Adapter/t2iadapter", subfolder="sketch_sdxl_1.0", torch_dtype=torch.float16, adapter_type="full_adapter_xl") -scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler") - -pipe = StableDiffusionXLAdapterPipeline.from_pretrained( - model_id, adapter=adapter, safety_checker=None, torch_dtype=torch.float16, variant="fp16", scheduler=scheduler -) - -pipe.to("cuda") -``` - -Finally, pass the prompt and control image to the pipeline - -```py -# fix the random seed, so you will get the same result as the example -generator = torch.Generator().manual_seed(42) - -sketch_image_out = pipe( - prompt="a photo of a dog in real world, high quality", - negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality", - image=sketch_image, - generator=generator, - guidance_scale=7.5 -).images[0] -make_image_grid([sketch_image, sketch_image_out], rows=1, cols=2) -``` - -![img](https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch_output.png) - -## Available checkpoints - -Non-diffusers checkpoints can be found under [TencentARC/T2I-Adapter](https://huggingface.co/TencentARC/T2I-Adapter/tree/main/models). - -### T2I-Adapter with Stable Diffusion 1.4 - -| Model Name | Control Image Overview| Control Image Example | Generated Image Example | -|---|---|---|---| -|[TencentARC/t2iadapter_color_sd14v1](https://huggingface.co/TencentARC/t2iadapter_color_sd14v1)
*Trained with spatial color palette* | An image with 8x8 color palette.||| -|[TencentARC/t2iadapter_canny_sd14v1](https://huggingface.co/TencentARC/t2iadapter_canny_sd14v1)
*Trained with canny edge detection* | A monochrome image with white edges on a black background.||| -|[TencentARC/t2iadapter_sketch_sd14v1](https://huggingface.co/TencentARC/t2iadapter_sketch_sd14v1)
*Trained with [PidiNet](https://github.com/zhuoinoulu/pidinet) edge detection* | A hand-drawn monochrome image with white outlines on a black background.||| -|[TencentARC/t2iadapter_depth_sd14v1](https://huggingface.co/TencentARC/t2iadapter_depth_sd14v1)
*Trained with Midas depth estimation* | A grayscale image with black representing deep areas and white representing shallow areas.||| -|[TencentARC/t2iadapter_openpose_sd14v1](https://huggingface.co/TencentARC/t2iadapter_openpose_sd14v1)
*Trained with OpenPose bone image* | A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.||| -|[TencentARC/t2iadapter_keypose_sd14v1](https://huggingface.co/TencentARC/t2iadapter_keypose_sd14v1)
*Trained with mmpose skeleton image* | A [mmpose skeleton](https://github.com/open-mmlab/mmpose) image.||| -|[TencentARC/t2iadapter_seg_sd14v1](https://huggingface.co/TencentARC/t2iadapter_seg_sd14v1)
*Trained with semantic segmentation* | An [custom](https://github.com/TencentARC/T2I-Adapter/discussions/25) segmentation protocol image.|| | -|[TencentARC/t2iadapter_canny_sd15v2](https://huggingface.co/TencentARC/t2iadapter_canny_sd15v2)|| -|[TencentARC/t2iadapter_depth_sd15v2](https://huggingface.co/TencentARC/t2iadapter_depth_sd15v2)|| -|[TencentARC/t2iadapter_sketch_sd15v2](https://huggingface.co/TencentARC/t2iadapter_sketch_sd15v2)|| -|[TencentARC/t2iadapter_zoedepth_sd15v1](https://huggingface.co/TencentARC/t2iadapter_zoedepth_sd15v1)|| -|[Adapter/t2iadapter, subfolder='sketch_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/sketch_sdxl_1.0)|| -|[Adapter/t2iadapter, subfolder='canny_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/canny_sdxl_1.0)|| -|[Adapter/t2iadapter, subfolder='openpose_sdxl_1.0'](https://huggingface.co/Adapter/t2iadapter/tree/main/openpose_sdxl_1.0)|| - -## Combining multiple adapters - -[`MultiAdapter`] can be used for applying multiple conditionings at once. - -Here we use the keypose adapter for the character posture and the depth adapter for creating the scene. - -```py -from diffusers.utils import load_image, make_image_grid - -cond_keypose = load_image( - "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png" -) -cond_depth = load_image( - "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png" -) -cond = [cond_keypose, cond_depth] - -prompt = ["A man walking in an office room with a nice view"] -``` - -The two control images look as such: - -![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png) -![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png) - - -`MultiAdapter` combines keypose and depth adapters. - -`adapter_conditioning_scale` balances the relative influence of the different adapters. - -```py -import torch -from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter - -adapters = MultiAdapter( - [ - T2IAdapter.from_pretrained("TencentARC/t2iadapter_keypose_sd14v1"), - T2IAdapter.from_pretrained("TencentARC/t2iadapter_depth_sd14v1"), - ] -) -adapters = adapters.to(torch.float16) - -pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - torch_dtype=torch.float16, - adapter=adapters, -).to("cuda") - -image = pipe(prompt, cond, adapter_conditioning_scale=[0.8, 0.8]).images[0] -make_image_grid([cond_keypose, cond_depth, image], rows=1, cols=3) -``` - -![img](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_depth_sample_output.png) - - -## T2I-Adapter vs ControlNet - -T2I-Adapter is similar to [ControlNet](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet). -T2I-Adapter uses a smaller auxiliary network which is only run once for the entire diffusion process. -However, T2I-Adapter performs slightly worse than ControlNet. 
- ## StableDiffusionAdapterPipeline + [[autodoc]] StableDiffusionAdapterPipeline - - all - - __call__ - - enable_attention_slicing - - disable_attention_slicing - - enable_vae_slicing - - disable_vae_slicing - - enable_xformers_memory_efficient_attention - - disable_xformers_memory_efficient_attention + - all + - __call__ + - enable_attention_slicing + - disable_attention_slicing + - enable_vae_slicing + - disable_vae_slicing + - enable_xformers_memory_efficient_attention + - disable_xformers_memory_efficient_attention ## StableDiffusionXLAdapterPipeline + [[autodoc]] StableDiffusionXLAdapterPipeline - - all - - __call__ - - enable_attention_slicing - - disable_attention_slicing - - enable_vae_slicing - - disable_vae_slicing - - enable_xformers_memory_efficient_attention - - disable_xformers_memory_efficient_attention + - all + - __call__ + - enable_attention_slicing + - disable_attention_slicing + - enable_vae_slicing + - disable_vae_slicing + - enable_xformers_memory_efficient_attention + - disable_xformers_memory_efficient_attention diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md new file mode 100644 index 000000000000..d208ddfa8411 --- /dev/null +++ b/docs/source/en/optimization/tgate.md @@ -0,0 +1,182 @@ +# T-GATE + +[T-GATE](https://github.com/HaozheLiu-ST/T-GATE/tree/main) accelerates inference for [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [PixArt](../api/pipelines/pixart), and [Latency Consistency Model](../api/pipelines/latent_consistency_models.md) pipelines by skipping the cross-attention calculation once it converges. This method doesn't require any additional training and it can speed up inference from 10-50%. T-GATE is also compatible with other optimization methods like [DeepCache](./deepcache). + +Before you begin, make sure you install T-GATE. + +```bash +pip install tgate +pip install -U pytorch diffusers transformers accelerate DeepCache +``` + + +To use T-GATE with a pipeline, you need to use its corresponding loader. + +| Pipeline | T-GATE Loader | +|---|---| +| PixArt | TgatePixArtLoader | +| Stable Diffusion XL | TgateSDXLLoader | +| Stable Diffusion XL + DeepCache | TgateSDXLDeepCacheLoader | +| Stable Diffusion | TgateSDLoader | +| Stable Diffusion + DeepCache | TgateSDDeepCacheLoader | + +Next, create a `TgateLoader` with a pipeline, the gate step (the time step to stop calculating the cross attention), and the number of inference steps. Then call the `tgate` method on the pipeline with a prompt, gate step, and the number of inference steps. + +Let's see how to enable this for several different pipelines. 
+ + + + +Accelerate `PixArtAlphaPipeline` with T-GATE: + +```py +import torch +from diffusers import PixArtAlphaPipeline +from tgate import TgatePixArtLoader + +pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16) + +gate_step = 8 +inference_step = 25 +pipe = TgatePixArtLoader( + pipe, + gate_step=gate_step, + num_inference_steps=inference_step, +).to("cuda") + +image = pipe.tgate( + "An alpaca made of colorful building blocks, cyberpunk.", + gate_step=gate_step, + num_inference_steps=inference_step, +).images[0] +``` + + + +Accelerate `StableDiffusionXLPipeline` with T-GATE: + +```py +import torch +from diffusers import StableDiffusionXLPipeline +from diffusers import DPMSolverMultistepScheduler +from tgate import TgateSDXLLoader + +pipe = StableDiffusionXLPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True, +) +pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + +gate_step = 10 +inference_step = 25 +pipe = TgateSDXLLoader( + pipe, + gate_step=gate_step, + num_inference_steps=inference_step, +).to("cuda") + +image = pipe.tgate( + "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.", + gate_step=gate_step, + num_inference_steps=inference_step +).images[0] +``` + + + +Accelerate `StableDiffusionXLPipeline` with [DeepCache](https://github.com/horseee/DeepCache) and T-GATE: + +```py +import torch +from diffusers import StableDiffusionXLPipeline +from diffusers import DPMSolverMultistepScheduler +from tgate import TgateSDXLDeepCacheLoader + +pipe = StableDiffusionXLPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True, +) +pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + +gate_step = 10 +inference_step = 25 +pipe = TgateSDXLDeepCacheLoader( + pipe, + cache_interval=3, + cache_branch_id=0, +).to("cuda") + +image = pipe.tgate( + "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.", + gate_step=gate_step, + num_inference_steps=inference_step +).images[0] +``` + + + +Accelerate `latent-consistency/lcm-sdxl` with T-GATE: + +```py +import torch +from diffusers import StableDiffusionXLPipeline +from diffusers import UNet2DConditionModel, LCMScheduler +from diffusers import DPMSolverMultistepScheduler +from tgate import TgateSDXLLoader + +unet = UNet2DConditionModel.from_pretrained( + "latent-consistency/lcm-sdxl", + torch_dtype=torch.float16, + variant="fp16", +) +pipe = StableDiffusionXLPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + unet=unet, + torch_dtype=torch.float16, + variant="fp16", +) +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +gate_step = 1 +inference_step = 4 +pipe = TgateSDXLLoader( + pipe, + gate_step=gate_step, + num_inference_steps=inference_step, + lcm=True +).to("cuda") + +image = pipe.tgate( + "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.", + gate_step=gate_step, + num_inference_steps=inference_step +).images[0] +``` + + + +T-GATE also supports [`StableDiffusionPipeline`] and [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://hf.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS). 
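T-GATE follows the same pattern for the plain [`StableDiffusionPipeline`]. The sketch below assumes `TgateSDLoader` (from the loader table above) takes the same arguments as `TgateSDXLLoader`; treat the exact signature as an assumption and check the T-GATE repository if it differs:

```py
import torch
from diffusers import StableDiffusionPipeline
from tgate import TgateSDLoader  # assumed to mirror TgateSDXLLoader's signature

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
)

gate_step = 10
inference_step = 25
# Wrap the pipeline so cross-attention stops being recomputed after `gate_step` steps
pipe = TgateSDLoader(
    pipe,
    gate_step=gate_step,
    num_inference_steps=inference_step,
).to("cuda")

image = pipe.tgate(
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
    gate_step=gate_step,
    num_inference_steps=inference_step,
).images[0]
```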
+ +## Benchmarks +| Model | MACs | Param | Latency | Zero-shot 10K-FID on MS-COCO | +|-----------------------|----------|-----------|---------|---------------------------| +| SD-1.5 | 16.938T | 859.520M | 7.032s | 23.927 | +| SD-1.5 w/ T-GATE | 9.875T | 815.557M | 4.313s | 20.789 | +| SD-2.1 | 38.041T | 865.785M | 16.121s | 22.609 | +| SD-2.1 w/ T-GATE | 22.208T | 815.433 M | 9.878s | 19.940 | +| SD-XL | 149.438T | 2.570B | 53.187s | 24.628 | +| SD-XL w/ T-GATE | 84.438T | 2.024B | 27.932s | 22.738 | +| Pixart-Alpha | 107.031T | 611.350M | 61.502s | 38.669 | +| Pixart-Alpha w/ T-GATE | 65.318T | 462.585M | 37.867s | 35.825 | +| DeepCache (SD-XL) | 57.888T | - | 19.931s | 23.755 | +| DeepCache w/ T-GATE | 43.868T | - | 14.666s | 23.999 | +| LCM (SD-XL) | 11.955T | 2.570B | 3.805s | 25.044 | +| LCM w/ T-GATE | 11.171T | 2.024B | 3.533s | 25.028 | +| LCM (Pixart-Alpha) | 8.563T | 611.350M | 4.733s | 36.086 | +| LCM w/ T-GATE | 7.623T | 462.585M | 4.543s | 37.048 | + +The latency is tested on an NVIDIA 1080TI, MACs and Params are calculated with [calflops](https://github.com/MrYxJ/calculate-flops.pytorch), and the FID is calculated with [PytorchFID](https://github.com/mseitzer/pytorch-fid). diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md index 008dc3002bb5..40876a26e6a3 100644 --- a/docs/source/en/training/distributed_inference.md +++ b/docs/source/en/training/distributed_inference.md @@ -52,6 +52,76 @@ To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](h
+### Device placement + +> [!WARNING] +> This feature is experimental and its APIs might change in the future. + +With Accelerate, you can use the `device_map` to determine how to distribute the models of a pipeline across multiple devices. This is useful in situations where you have more than one GPU. + +For example, if you have two 8GB GPUs, then using [`~DiffusionPipeline.enable_model_cpu_offload`] may not work so well because: + +* it only works on a single GPU +* a single model might not fit on a single GPU ([`~DiffusionPipeline.enable_sequential_cpu_offload`] might work but it will be extremely slow and it is also limited to a single GPU) + +To make use of both GPUs, you can use the "balanced" device placement strategy which splits the models across all available GPUs. + +> [!WARNING] +> Only the "balanced" strategy is supported at the moment, and we plan to support additional mapping strategies in the future. + +```diff +from diffusers import DiffusionPipeline +import torch + +pipeline = DiffusionPipeline.from_pretrained( +- "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, ++ "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced" +) +image = pipeline("a dog").images[0] +image +``` + +You can also pass a dictionary to enforce the maximum GPU memory that can be used on each device: + +```diff +from diffusers import DiffusionPipeline +import torch + +max_memory = {0:"1GB", 1:"1GB"} +pipeline = DiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, + use_safetensors=True, + device_map="balanced", ++ max_memory=max_memory +) +image = pipeline("a dog").images[0] +image +``` + +If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement. + +By default, Diffusers uses the maximum memory of all devices. If the models don't fit on the GPUs, they are offloaded to the CPU. If the CPU doesn't have enough memory, then you might see an error. In that case, you could defer to using [`~DiffusionPipeline.enable_sequential_cpu_offload`] and [`~DiffusionPipeline.enable_model_cpu_offload`]. + +Call [`~DiffusionPipeline.reset_device_map`] to reset the `device_map` of a pipeline. This is also necessary if you want to use methods like `to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped. + +```py +pipeline.reset_device_map() +``` + +Once a pipeline has been device-mapped, you can also access its device map via `hf_device_map`: + +```py +print(pipeline.hf_device_map) +``` + +An example device map would look like so: + + +```bash +{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0} +``` + ## PyTorch Distributed PyTorch supports [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) which enables data parallelism. diff --git a/docs/source/en/using-diffusers/callback.md b/docs/source/en/using-diffusers/callback.md index 296245c3abe2..3f3e8dae9f2d 100644 --- a/docs/source/en/using-diffusers/callback.md +++ b/docs/source/en/using-diffusers/callback.md @@ -148,9 +148,9 @@ pipeline = AutoPipelineForText2Image.from_pretrained( use_safetensors=True ).to("cuda") -image = pipe( - prompt = "A croissant shaped like a cute bear." 
- negative_prompt = "Deformed, ugly, bad anatomy" +image = pipeline( + prompt="A croissant shaped like a cute bear.", + negative_prompt="Deformed, ugly, bad anatomy", callback_on_step_end=decode_tensors, callback_on_step_end_tensor_inputs=["latents"], ).images[0] diff --git a/docs/source/en/using-diffusers/custom_pipeline_overview.md b/docs/source/en/using-diffusers/custom_pipeline_overview.md index 3c03ddc732f1..0b6bb53f10d6 100644 --- a/docs/source/en/using-diffusers/custom_pipeline_overview.md +++ b/docs/source/en/using-diffusers/custom_pipeline_overview.md @@ -16,17 +16,19 @@ specific language governing permissions and limitations under the License. ## Community pipelines -Community pipelines are any [`DiffusionPipeline`] class that are different from the original implementation as specified in their paper (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline. +Community pipelines are any [`DiffusionPipeline`] class that are different from the original paper implementation (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline. -There are many cool community pipelines like [Speech to Image](https://github.com/huggingface/diffusers/tree/main/examples/community#speech-to-image) or [Composable Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#composable-stable-diffusion), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community). +There are many cool community pipelines like [Marigold Depth Estimation](https://github.com/huggingface/diffusers/tree/main/examples/community#marigold-depth-estimation) or [InstantID](https://github.com/huggingface/diffusers/tree/main/examples/community#instantid-pipeline), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community). -To load any community pipeline on the Hub, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [`hf-internal-testing/diffusers-dummy-pipeline`](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32): +There are two types of community pipelines, those stored on the Hugging Face Hub and those stored on Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are only limited to custom pipeline code. Refer to this [table](./contribute_pipeline#share-your-pipeline) for a more detailed comparison of Hub vs GitHub community pipelines. - + + -🔒 By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically! 
+To load a Hugging Face Hub community pipeline, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [hf-internal-testing/diffusers-dummy-pipeline](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [google/ddpm-cifar10-32](https://huggingface.co/google/ddpm-cifar10-32): - +> [!WARNING] +> By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically! ```py from diffusers import DiffusionPipeline @@ -36,7 +38,10 @@ pipeline = DiffusionPipeline.from_pretrained( ) ``` -Loading an official community pipeline is similar, but you can mix loading weights from an official repository id and pass pipeline components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline, and you can pass the CLIP model components directly to it: + + + +To load a GitHub community pipeline, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you you'd like to load the pipeline weights and components from. You can also load model components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline and the CLIP model components. ```py from diffusers import DiffusionPipeline @@ -56,9 +61,12 @@ pipeline = DiffusionPipeline.from_pretrained( ) ``` + + + ### Load from a local file -Community pipelines can also be loaded from a local file if you pass a file path instead. The path to the passed directory must contain a `pipeline.py` file that contains the pipeline class in order to successfully load it. +Community pipelines can also be loaded from a local file if you pass a file path instead. The path to the passed directory must contain a pipeline.py file that contains the pipeline class. ```py pipeline = DiffusionPipeline.from_pretrained( @@ -77,7 +85,7 @@ By default, community pipelines are loaded from the latest stable version of Dif -For example, to load from the `main` branch: +For example, to load from the main branch: ```py pipeline = DiffusionPipeline.from_pretrained( @@ -93,7 +101,7 @@ pipeline = DiffusionPipeline.from_pretrained( -For example, to load from a previous version of Diffusers like `v0.25.0`: +For example, to load from a previous version of Diffusers like v0.25.0: ```py pipeline = DiffusionPipeline.from_pretrained( @@ -109,8 +117,49 @@ pipeline = DiffusionPipeline.from_pretrained( +### Load with from_pipe -For more information about community pipelines, take a look at the [Community pipelines](custom_pipeline_examples) guide for how to use them and if you're interested in adding a community pipeline check out the [How to contribute a community pipeline](contribute_pipeline) guide! +Community pipelines can also be loaded with the [`~DiffusionPipeline.from_pipe`] method which allows you to load and reuse multiple pipelines without any additional memory overhead (learn more in the [Reuse a pipeline](./loading#reuse-a-pipeline) guide). 
The memory requirement is determined by the largest single pipeline loaded. + +For example, let's load a community pipeline that supports [long prompts with weighting](https://github.com/huggingface/diffusers/tree/main/examples/community#long-prompt-weighting-stable-diffusion) from a Stable Diffusion pipeline. + +```py +import torch +from diffusers import DiffusionPipeline + +pipe_sd = DiffusionPipeline.from_pretrained("emilianJR/CyberRealistic_V3", torch_dtype=torch.float16) +pipe_sd.to("cuda") +# load long prompt weighting pipeline +pipe_lpw = DiffusionPipeline.from_pipe( + pipe_sd, + custom_pipeline="lpw_stable_diffusion", +).to("cuda") + +prompt = "cat, hiding in the leaves, ((rain)), zazie rainyday, beautiful eyes, macro shot, colorful details, natural lighting, amazing composition, subsurface scattering, amazing textures, filmic, soft light, ultra-detailed eyes, intricate details, detailed texture, light source contrast, dramatic shadows, cinematic light, depth of field, film grain, noise, dark background, hyperrealistic dslr film still, dim volumetric cinematic lighting" +neg_prompt = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers:1.4), (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" +generator = torch.Generator(device="cpu").manual_seed(20) +out_lpw = pipe_lpw( + prompt, + negative_prompt=neg_prompt, + width=512, + height=512, + max_embeddings_multiples=3, + num_inference_steps=50, + generator=generator, + ).images[0] +out_lpw +``` + +
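To reproduce the plain Stable Diffusion side of the comparison shown below, rerun the same prompt through the already-loaded `pipe_sd` from the snippet above with a freshly seeded generator (a minimal sketch; the plain pipeline truncates prompts past 77 tokens, which is exactly what long prompt weighting works around):

```py
generator = torch.Generator(device="cpu").manual_seed(20)  # re-seed for a fair comparison
out_sd = pipe_sd(
    prompt,
    negative_prompt=neg_prompt,
    width=512,
    height=512,
    num_inference_steps=50,
    generator=generator,
).images[0]
out_sd
```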
+[comparison images: "Stable Diffusion with long prompt weighting" vs. "Stable Diffusion"]
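Because [`~DiffusionPipeline.from_pipe`] reuses the components of the original pipeline rather than copying them, the second pipeline adds no extra weights in memory. A quick sanity check you could run, assuming the `pipe_sd` and `pipe_lpw` objects from the example above:

```py
# the two pipelines point at the same underlying models, so memory stays flat
assert pipe_lpw.unet is pipe_sd.unet
assert pipe_lpw.vae is pipe_sd.vae
assert pipe_lpw.text_encoder is pipe_sd.text_encoder
```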
## Community components @@ -118,7 +167,7 @@ Community components allow users to build pipelines that may have customized com This section shows how users should use community components to build a community pipeline. -You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example. So, let's start loading the components: +You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example. 1. Import and load the text encoder from Transformers: @@ -152,17 +201,17 @@ In steps 4 and 5, the custom [UNet](https://github.com/showlab/Show-1/blob/main/ -4. Now you'll load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py), which in this example, has already been implemented in the `showone_unet_3d_condition.py` [script](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) for your convenience. You'll notice the `UNet3DConditionModel` class name is changed to `ShowOneUNet3DConditionModel` because [`UNet3DConditionModel`] already exists in Diffusers. Any components needed for the `ShowOneUNet3DConditionModel` class should be placed in the `showone_unet_3d_condition.py` script. +4. Now you'll load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py), which in this example, has already been implemented in [showone_unet_3d_condition.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) for your convenience. You'll notice the [`UNet3DConditionModel`] class name is changed to `ShowOneUNet3DConditionModel` because [`UNet3DConditionModel`] already exists in Diffusers. Any components needed for the `ShowOneUNet3DConditionModel` class should be placed in showone_unet_3d_condition.py. -Once this is done, you can initialize the UNet: + Once this is done, you can initialize the UNet: -```python -from showone_unet_3d_condition import ShowOneUNet3DConditionModel + ```python + from showone_unet_3d_condition import ShowOneUNet3DConditionModel -unet = ShowOneUNet3DConditionModel.from_pretrained(pipe_id, subfolder="unet") -``` + unet = ShowOneUNet3DConditionModel.from_pretrained(pipe_id, subfolder="unet") + ``` -5. Finally, you'll load the custom pipeline code. For this example, it has already been created for you in the `pipeline_t2v_base_pixel.py` [script](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Just like the custom UNet, any code needed for the custom pipeline to work should go in the `pipeline_t2v_base_pixel.py` script. +5. Finally, you'll load the custom pipeline code. For this example, it has already been created for you in [pipeline_t2v_base_pixel.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Just like the custom UNet, any code needed for the custom pipeline to work should go in pipeline_t2v_base_pixel.py. Once everything is in place, you can initialize the `TextToVideoIFPipeline` with the `ShowOneUNet3DConditionModel`: @@ -187,13 +236,16 @@ Push the pipeline to the Hub to share with the community! 
pipeline.push_to_hub("custom-t2v-pipeline") ``` -After the pipeline is successfully pushed, you need a couple of changes: +After the pipeline is successfully pushed, you need to make a few changes: -1. Change the `_class_name` attribute in [`model_index.json`](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2) to `"pipeline_t2v_base_pixel"` and `"TextToVideoIFPipeline"`. -2. Upload `showone_unet_3d_condition.py` to the `unet` [directory](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py). -3. Upload `pipeline_t2v_base_pixel.py` to the pipeline base [directory](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py). +1. Change the `_class_name` attribute in [model_index.json](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2) to `"pipeline_t2v_base_pixel"` and `"TextToVideoIFPipeline"`. +2. Upload `showone_unet_3d_condition.py` to the [unet](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) subfolder. +3. Upload `pipeline_t2v_base_pixel.py` to the pipeline [repository](https://huggingface.co/sayakpaul/show-1-base-with-code/tree/main). -To run inference, simply add the `trust_remote_code` argument while initializing the pipeline to handle all the "magic" behind the scenes. +To run inference, add the `trust_remote_code` argument while initializing the pipeline to handle all the "magic" behind the scenes. + +> [!WARNING] +> As an additional precaution with `trust_remote_code=True`, we strongly encourage you to pass a commit hash to the `revision` parameter in [`~DiffusionPipeline.from_pretrained`] to make sure the code hasn't been updated with some malicious new lines of code (unless you fully trust the model owners). ```python from diffusers import DiffusionPipeline @@ -221,10 +273,9 @@ video_frames = pipeline( ).frames ``` -As an additional reference example, you can refer to the repository structure of [stabilityai/japanese-stable-diffusion-xl](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl/), that makes use of the `trust_remote_code` feature: +As an additional reference, take a look at the repository structure of [stabilityai/japanese-stable-diffusion-xl](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl/) which also uses the `trust_remote_code` feature. ```python - from diffusers import DiffusionPipeline import torch @@ -232,14 +283,4 @@ pipeline = DiffusionPipeline.from_pretrained( "stabilityai/japanese-stable-diffusion-xl", trust_remote_code=True ) pipeline.to("cuda") - -# if using torch < 2.0 -# pipeline.enable_xformers_memory_efficient_attention() - -prompt = "柴犬、カラフルアート" - -image = pipeline(prompt=prompt).images[0] ``` - -> [!TIP] -> When using `trust_remote_code=True`, it is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not update the code with some malicious new lines (unless you fully trust the authors of the models). \ No newline at end of file diff --git a/docs/source/en/using-diffusers/ip_adapter.md b/docs/source/en/using-diffusers/ip_adapter.md index 4ae403538d2b..dc64b2548529 100644 --- a/docs/source/en/using-diffusers/ip_adapter.md +++ b/docs/source/en/using-diffusers/ip_adapter.md @@ -362,14 +362,12 @@ IP-Adapter's image prompting and compatibility with other adapters and models ma ### Face model -Generating accurate faces is challenging because they are complex and nuanced. 
Diffusers supports two IP-Adapter checkpoints specifically trained to generate faces: +Generating accurate faces is challenging because they are complex and nuanced. Diffusers supports two IP-Adapter checkpoints specifically trained to generate faces from the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) repository: * [ip-adapter-full-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-full-face_sd15.safetensors) is conditioned with images of cropped faces and removed backgrounds * [ip-adapter-plus-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-plus-face_sd15.safetensors) uses patch embeddings and is conditioned with images of cropped faces -> [!TIP] -> -> [IP-Adapter-FaceID](https://huggingface.co/h94/IP-Adapter-FaceID) is a face-specific IP-Adapter trained with face ID embeddings instead of CLIP image embeddings, allowing you to generate more consistent faces in different contexts and styles. Try out this popular [community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community#ip-adapter-face-id) and see how it compares to the other face IP-Adapters. +Additionally, Diffusers supports all IP-Adapter checkpoints trained with face embeddings extracted by `insightface` face models. Supported models are from the [h94/IP-Adapter-FaceID](https://huggingface.co/h94/IP-Adapter-FaceID) repository. For face models, use the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) checkpoint. It is also recommended to use [`DDIMScheduler`] or [`EulerDiscreteScheduler`] for face models. @@ -411,6 +409,56 @@ image +To use IP-Adapter FaceID models, first extract face embeddings with `insightface`. Then pass the list of tensors to the pipeline as `ip_adapter_image_embeds`. + +```py +import cv2 +import numpy as np +import torch +from diffusers import StableDiffusionPipeline, DDIMScheduler +from diffusers.utils import load_image +from insightface.app import FaceAnalysis + +pipeline = StableDiffusionPipeline.from_pretrained( +    "runwayml/stable-diffusion-v1-5", +    torch_dtype=torch.float16, +).to("cuda") +pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) +pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid_sd15.bin", image_encoder_folder=None) +pipeline.set_ip_adapter_scale(0.6) + +image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png") + +ref_images_embeds = [] +app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) +app.prepare(ctx_id=0, det_size=(640, 640)) +# convert the PIL image to an OpenCV-style array for insightface +image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB) +faces = app.get(image) +image = torch.from_numpy(faces[0].normed_embedding) +ref_images_embeds.append(image.unsqueeze(0)) +ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0) +neg_ref_images_embeds = torch.zeros_like(ref_images_embeds) +id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda") + +generator = torch.Generator(device="cpu").manual_seed(42) + +images = pipeline( +    prompt="A photo of a girl", +    ip_adapter_image_embeds=[id_embeds], +    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", +    num_inference_steps=20, num_images_per_prompt=1, +    generator=generator +).images +``` + +Both IP-Adapter FaceID Plus and Plus v2 models require CLIP image embeddings.
You can prepare face embeddings as shown previously, then you can extract and pass CLIP embeddings to the hidden image projection layers. + +```py +clip_embeds = pipeline.prepare_ip_adapter_image_embeds([ip_adapter_images], None, torch.device("cuda"), num_images, True)[0] + +pipeline.unet.encoder_hid_proj.image_projection_layers[0].clip_embeds = clip_embeds.to(dtype=torch.float16) +pipeline.unet.encoder_hid_proj.image_projection_layers[0].shortcut = False # True if Plus v2 +``` + + ### Multi IP-Adapter More than one IP-Adapter can be used at the same time to generate specific images in more diverse styles. For example, you can use IP-Adapter-Face to generate consistent faces and characters, and IP-Adapter Plus to generate those faces in a specific style. diff --git a/docs/source/en/using-diffusers/loading.md b/docs/source/en/using-diffusers/loading.md index 9d5534154fc8..e7b2c4b7acb3 100644 --- a/docs/source/en/using-diffusers/loading.md +++ b/docs/source/en/using-diffusers/loading.md @@ -10,57 +10,75 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Load pipelines, models, and schedulers +# Load pipelines [[open-in-colab]] -Having an easy way to use a diffusion system for inference is essential to 🧨 Diffusers. Diffusion systems often consist of multiple components like parameterized models, tokenizers, and schedulers that interact in complex ways. That is why we designed the [`DiffusionPipeline`] to wrap the complexity of the entire diffusion system into an easy-to-use API, while remaining flexible enough to be adapted for other use cases, such as loading each component individually as building blocks to assemble your own diffusion system. - -Everything you need for inference or training is accessible with the `from_pretrained()` method. +Diffusion systems consist of multiple components like parameterized models and schedulers that interact in complex ways. That is why we designed the [`DiffusionPipeline`] to wrap the complexity of the entire diffusion system into an easy-to-use API. At the same time, the [`DiffusionPipeline`] is entirely customizable so you can modify each component to build a diffusion system for your use case. This guide will show you how to load: - pipelines from the Hub and locally - different components into a pipeline +- multiple pipelines without increasing memory usage - checkpoint variants such as different floating point types or non-exponential mean averaged (EMA) weights -- models and schedulers -## Diffusion Pipeline +## Load a pipeline + +> [!TIP] +> Skip to the [DiffusionPipeline explained](#diffusionpipeline-explained) section if you're interested in an explanation about how the [`DiffusionPipeline`] class works. - +There are two ways to load a pipeline for a task: -💡 Skip to the [DiffusionPipeline explained](#diffusionpipeline-explained) section if you are interested in learning in more detail about how the [`DiffusionPipeline`] class works. +1. Load the generic [`DiffusionPipeline`] class and allow it to automatically detect the correct pipeline class from the checkpoint. +2. Load a specific pipeline class for a specific task. - + + -The [`DiffusionPipeline`] class is the simplest and most generic way to load the latest trending diffusion model from the [Hub](https://huggingface.co/models?library=diffusers&sort=trending). 
The [`DiffusionPipeline.from_pretrained`] method automatically detects the correct pipeline class from the checkpoint, downloads, and caches all the required configuration and weight files, and returns a pipeline instance ready for inference. +The [`DiffusionPipeline`] class is a simple and generic way to load the latest trending diffusion model from the [Hub](https://huggingface.co/models?library=diffusers&sort=trending). It uses the [`~DiffusionPipeline.from_pretrained`] method to automatically detect the correct pipeline class for a task from the checkpoint, download and cache all the required configuration and weight files, and return a pipeline ready for inference. ```python from diffusers import DiffusionPipeline -repo_id = "runwayml/stable-diffusion-v1-5" -pipe = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True) +pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True) ``` -You can also load a checkpoint with its specific pipeline class. The example above loaded a Stable Diffusion model; to get the same result, use the [`StableDiffusionPipeline`] class: +The same checkpoint can also be used for an image-to-image task. The [`DiffusionPipeline`] class can handle any task as long as you provide the appropriate inputs. For example, for an image-to-image task, you need to pass an initial image to the pipeline. + +```py +from diffusers import DiffusionPipeline +from diffusers.utils import load_image + +pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True) + +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png") +prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" +image = pipeline(prompt, image=init_image).images[0] +``` + + + + +Checkpoints can be loaded by their specific pipeline class if you already know it. For example, to load a Stable Diffusion model, use the [`StableDiffusionPipeline`] class. ```python from diffusers import StableDiffusionPipeline -repo_id = "runwayml/stable-diffusion-v1-5" -pipe = StableDiffusionPipeline.from_pretrained(repo_id, use_safetensors=True) +pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True) ``` -A checkpoint (such as [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) or [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) may also be used for more than one task, like text-to-image or image-to-image. To differentiate what task you want to use the checkpoint for, you have to load it directly with its corresponding task-specific pipeline class: +The same checkpoint may also be used for another task like image-to-image. To differentiate what task you want to use the checkpoint for, you have to use the corresponding task-specific pipeline class. For example, to use the same checkpoint for image-to-image, use the [`StableDiffusionImg2ImgPipeline`] class.
-```python +```py from diffusers import StableDiffusionImg2ImgPipeline -repo_id = "runwayml/stable-diffusion-v1-5" -pipe = StableDiffusionImg2ImgPipeline.from_pretrained(repo_id) +pipeline = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True) ``` -You can use the Space below to gauge the memory requirements of a pipeline you want to load beforehand without downloading the pipeline checkpoints: + + + +Use the Space below to gauge a pipeline's memory requirements before you download and load it, and to check whether it will run on your hardware.
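If you'd rather get a rough number from code, here is a sketch using the `huggingface_hub` client that sums the on-disk size of a repository's default safetensors weights before downloading; it assumes a diffusers-format checkpoint, and actual runtime memory also depends on the dtype and any offloading you enable.

```py
from huggingface_hub import HfApi

# rough estimate: sum the sizes of the default (non-variant) safetensors weights in the repository
info = HfApi().model_info("runwayml/stable-diffusion-v1-5", files_metadata=True)
total_bytes = sum(
    sibling.size or 0
    for sibling in info.siblings
    if sibling.rfilename.endswith(".safetensors")
    and "/" in sibling.rfilename           # component weights live in subfolders
    and ".fp16." not in sibling.rfilename  # skip half-precision variants
)
print(f"~{total_bytes / 1e9:.1f} GB of weights")
```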