diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 718c67731bd5..b01e55f8d19e 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -39,7 +39,7 @@ jobs: python utils/print_env.py - name: Diffusers Benchmarking env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }} + HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }} BASE_PATH: benchmark_outputs run: | export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))") diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 82ef885b240e..386ebd0649ca 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -25,17 +25,17 @@ jobs: steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v1 - + - name: Check out code uses: actions/checkout@v3 - + - name: Find Changed Dockerfiles id: file_changes uses: jitterbit/get-changed-files@v1 with: format: 'space-delimited' token: ${{ secrets.GITHUB_TOKEN }} - + - name: Build Changed Docker Images run: | CHANGED_FILES="${{ steps.file_changes.outputs.all }}" @@ -52,7 +52,7 @@ jobs: build-and-push-docker-images: runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ] if: github.event_name != 'pull_request' - + permissions: contents: read packages: write @@ -69,6 +69,7 @@ jobs: - diffusers-flax-tpu - diffusers-onnxruntime-cpu - diffusers-onnxruntime-cuda + - diffusers-doc-builder steps: - name: Checkout repository @@ -90,24 +91,11 @@ jobs: - name: Post to a Slack channel id: slack - uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 + uses: huggingface/hf-workflows/.github/actions/post-slack@main with: # Slack channel id, channel name, or user id to post message. 
# See also: https://api.slack.com/methods/chat.postMessage#channels - channel-id: ${{ env.CI_SLACK_CHANNEL }} - # For posting a rich message using Block Kit - payload: | - { - "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}", - "blocks": [ - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}" - } - } - ] - } - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + slack_channel: ${{ env.CI_SLACK_CHANNEL }} + title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build" + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index d9054928ed8d..6d4193e3cccc 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -21,7 +21,7 @@ jobs: package: diffusers notebook_folder: diffusers_doc languages: en ko zh ja pt - + custom_container: diffusers/diffusers-doc-builder secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 8e19d8fafbe3..52e075733163 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -20,3 +20,4 @@ jobs: install_libgl1: true package: diffusers languages: en ko zh ja pt + custom_container: diffusers/diffusers-doc-builder diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index 2e9ac33d6b00..7ca065f69cd6 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -81,7 +81,7 @@ jobs: - name: Nightly PyTorch CUDA checkpoint (pipelines) tests env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms CUBLAS_WORKSPACE_CONFIG: :16:8 run: | @@ -141,7 +141,7 @@ jobs: - name: Run nightly PyTorch CUDA tests for non-pipeline modules if: ${{ matrix.module != 'examples'}} env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms CUBLAS_WORKSPACE_CONFIG: :16:8 run: | @@ -154,7 +154,7 @@ jobs: - name: Run nightly example tests with Torch if: ${{ matrix.module == 'examples' }} env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms CUBLAS_WORKSPACE_CONFIG: :16:8 run: | @@ -211,7 +211,7 @@ jobs: - name: Run nightly LoRA tests with PEFT and Torch env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms CUBLAS_WORKSPACE_CONFIG: :16:8 run: | @@ -269,7 +269,7 @@ jobs: - name: Run nightly Flax TPU tests env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | python -m pytest -n 0 \ -s -v -k "Flax" \ @@ -324,7 +324,7 @@ jobs: - name: Run nightly ONNXRuntime CUDA tests env: - HUGGING_FACE_HUB_TOKEN: ${{ 
secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Onnx" \ @@ -390,7 +390,7 @@ jobs: shell: arch -arch arm64 bash {0} env: HF_HOME: /System/Volumes/Data/mnt/cache - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \ --report-log=tests_torch_mps.log \ diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index b1bed6568aa4..d5d1fc719305 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -156,7 +156,7 @@ jobs: if: ${{ matrix.config.framework == 'pytorch_examples' }} run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m uv pip install peft + python -m uv pip install peft timm python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \ --make-reports=tests_${{ matrix.config.report }} \ examples diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index d071af2b0be2..7a194f717895 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -56,6 +56,7 @@ jobs: needs: setup_torch_cuda_pipeline_matrix strategy: fail-fast: false + max-parallel: 8 matrix: module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }} runs-on: [single-gpu, nvidia-gpu, t4, ci] @@ -86,7 +87,7 @@ jobs: python utils/print_env.py - name: Slow PyTorch CUDA checkpoint tests on Ubuntu env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms CUBLAS_WORKSPACE_CONFIG: :16:8 run: | @@ -124,7 +125,7 @@ jobs: shell: bash strategy: matrix: - module: [models, schedulers, lora, others] + module: [models, schedulers, lora, others, single_file] steps: - name: Checkout diffusers uses: actions/checkout@v3 @@ -143,7 +144,7 @@ jobs: - name: Run slow PyTorch CUDA tests env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms CUBLAS_WORKSPACE_CONFIG: :16:8 run: | @@ -193,7 +194,7 @@ jobs: - name: Run slow PEFT CUDA tests env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms CUBLAS_WORKSPACE_CONFIG: :16:8 run: | @@ -242,7 +243,7 @@ jobs: - name: Run slow Flax TPU tests env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | python -m pytest -n 0 \ -s -v -k "Flax" \ @@ -289,7 +290,7 @@ jobs: - name: Run slow ONNXRuntime CUDA tests env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Onnx" \ @@ -336,7 +337,7 @@ jobs: python utils/print_env.py - name: Run example tests on GPU env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/ - name: Failure short reports @@ -377,7 +378,7 @@ jobs: python utils/print_env.py - name: Run example tests on GPU env: - 
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/ - name: Failure short reports @@ -422,9 +423,10 @@ jobs: - name: Run example tests on GPU env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install timm python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/ - name: Failure short reports diff --git a/.github/workflows/push_tests_fast.yml b/.github/workflows/push_tests_fast.yml index 7c50da7b5c34..54ff48993768 100644 --- a/.github/workflows/push_tests_fast.yml +++ b/.github/workflows/push_tests_fast.yml @@ -107,7 +107,7 @@ jobs: if: ${{ matrix.config.framework == 'pytorch_examples' }} run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - python -m uv pip install peft + python -m uv pip install peft timm python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \ --make-reports=tests_${{ matrix.config.report }} \ examples diff --git a/.github/workflows/push_tests_mps.yml b/.github/workflows/push_tests_mps.yml index 3a14f856346b..5a7560d34d43 100644 --- a/.github/workflows/push_tests_mps.yml +++ b/.github/workflows/push_tests_mps.yml @@ -23,7 +23,7 @@ concurrency: jobs: run_fast_tests_apple_m1: name: Fast PyTorch MPS tests on MacOS - runs-on: [ self-hosted, apple-m1 ] + runs-on: macos-13-xlarge steps: - name: Checkout diffusers @@ -59,7 +59,7 @@ jobs: shell: arch -arch arm64 bash {0} env: HF_HOME: /System/Volumes/Data/mnt/cache - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/ diff --git a/.github/workflows/update_metadata.yml b/.github/workflows/update_metadata.yml index f91fa29a1ab9..92aea0369ba8 100644 --- a/.github/workflows/update_metadata.yml +++ b/.github/workflows/update_metadata.yml @@ -25,6 +25,6 @@ jobs: - name: Update metadata env: - HUGGING_FACE_HUB_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }} + HF_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }} run: | python utils/update_metadata.py --commit_sha ${{ github.sha }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 887e4dd43c45..59d39155952a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -355,7 +355,7 @@ You will need basic `git` proficiency to be able to contribute to manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a very good reference. -Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/main/setup.py#L265)): +Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/42f25d601a910dceadaee6c44345896b4cfa9928/setup.py#L270)): 1. Fork the [repository](https://github.com/huggingface/diffusers) by clicking on the 'Fork' button on the repository's page. This creates a copy of the code diff --git a/README.md b/README.md index 5c9b53b5160e..bdbf7e6858e3 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi ## Quickstart -Generating outputs is super easy with 🤗 Diffusers. 
To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 22000+ checkpoints): +Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 25.000+ checkpoints): ```python from diffusers import DiffusionPipeline @@ -221,7 +221,7 @@ Also, say 👋 in our public Discord channel =1.13.1" \ --extra-index-url https://download.pytorch.org/whl/cu117 && \ - python3 -m uv pip install --no-cache-dir \ + python3.10 -m uv pip install --no-cache-dir \ accelerate \ datasets \ hf-doc-builder \ diff --git a/docker/diffusers-pytorch-compile-cuda/Dockerfile b/docker/diffusers-pytorch-compile-cuda/Dockerfile index b4f507fcf511..fbee3389edc2 100644 --- a/docker/diffusers-pytorch-compile-cuda/Dockerfile +++ b/docker/diffusers-pytorch-compile-cuda/Dockerfile @@ -4,8 +4,11 @@ LABEL repository="diffusers" ENV DEBIAN_FRONTEND=noninteractive -RUN apt update && \ - apt install -y bash \ +RUN apt-get -y update \ + && apt-get install -y software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa + +RUN apt install -y bash \ build-essential \ git \ git-lfs \ @@ -13,24 +16,23 @@ RUN apt update && \ ca-certificates \ libsndfile1-dev \ libgl1 \ - python3.9 \ - python3.9-dev \ + python3.10 \ python3-pip \ - python3.9-venv && \ + python3.10-venv && \ rm -rf /var/lib/apt/lists # make sure to use venv -RUN python3.9 -m venv /opt/venv +RUN python3.10 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3.9 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ - python3.9 -m uv pip install --no-cache-dir \ +RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ + python3.10 -m uv pip install --no-cache-dir \ torch \ torchvision \ torchaudio \ invisible_watermark && \ - python3.9 -m pip install --no-cache-dir \ + python3.10 -m pip install --no-cache-dir \ accelerate \ datasets \ hf-doc-builder \ diff --git a/docker/diffusers-pytorch-cpu/Dockerfile b/docker/diffusers-pytorch-cpu/Dockerfile index 213662fa2fe9..af48ac415ec6 100644 --- a/docker/diffusers-pytorch-cpu/Dockerfile +++ b/docker/diffusers-pytorch-cpu/Dockerfile @@ -4,33 +4,36 @@ LABEL repository="diffusers" ENV DEBIAN_FRONTEND=noninteractive -RUN apt update && \ - apt install -y bash \ +RUN apt-get -y update \ + && apt-get install -y software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa + +RUN apt install -y bash \ build-essential \ git \ git-lfs \ curl \ ca-certificates \ libsndfile1-dev \ - python3.8 \ + python3.10 \ python3-pip \ libgl1 \ - python3.8-venv && \ + python3.10-venv && \ rm -rf /var/lib/apt/lists # make sure to use venv -RUN python3 -m venv /opt/venv +RUN python3.10 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ - python3 -m uv pip install --no-cache-dir \ +RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ + python3.10 -m uv pip install --no-cache-dir \ torch \ torchvision \ torchaudio \ invisible_watermark \ --extra-index-url https://download.pytorch.org/whl/cpu && \ - python3 -m uv pip install 
--no-cache-dir \ + python3.10 -m uv pip install --no-cache-dir \ accelerate \ datasets \ hf-doc-builder \ diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile index 8e0e56bafa02..88805f2140d7 100644 --- a/docker/diffusers-pytorch-cuda/Dockerfile +++ b/docker/diffusers-pytorch-cuda/Dockerfile @@ -4,8 +4,11 @@ LABEL repository="diffusers" ENV DEBIAN_FRONTEND=noninteractive -RUN apt update && \ - apt install -y bash \ +RUN apt-get -y update \ + && apt-get install -y software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa + +RUN apt install -y bash \ build-essential \ git \ git-lfs \ @@ -13,23 +16,23 @@ RUN apt update && \ ca-certificates \ libsndfile1-dev \ libgl1 \ - python3.8 \ + python3.10 \ python3-pip \ - python3.8-venv && \ + python3.10-venv && \ rm -rf /var/lib/apt/lists # make sure to use venv -RUN python3 -m venv /opt/venv +RUN python3.10 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ - python3 -m uv pip install --no-cache-dir \ +RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ + python3.10 -m uv pip install --no-cache-dir \ torch \ torchvision \ torchaudio \ invisible_watermark && \ - python3 -m pip install --no-cache-dir \ + python3.10 -m pip install --no-cache-dir \ accelerate \ datasets \ hf-doc-builder \ diff --git a/docker/diffusers-pytorch-xformers-cuda/Dockerfile b/docker/diffusers-pytorch-xformers-cuda/Dockerfile index 90ac0d1ebeaf..503b01a66019 100644 --- a/docker/diffusers-pytorch-xformers-cuda/Dockerfile +++ b/docker/diffusers-pytorch-xformers-cuda/Dockerfile @@ -4,8 +4,11 @@ LABEL repository="diffusers" ENV DEBIAN_FRONTEND=noninteractive -RUN apt update && \ - apt install -y bash \ +RUN apt-get -y update \ + && apt-get install -y software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa + +RUN apt install -y bash \ build-essential \ git \ git-lfs \ @@ -13,23 +16,23 @@ RUN apt update && \ ca-certificates \ libsndfile1-dev \ libgl1 \ - python3.8 \ + python3.10 \ python3-pip \ - python3.8-venv && \ + python3.10-venv && \ rm -rf /var/lib/apt/lists # make sure to use venv -RUN python3 -m venv /opt/venv +RUN python3.10 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ - python3 -m pip install --no-cache-dir \ +RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \ + python3.10 -m pip install --no-cache-dir \ torch \ torchvision \ torchaudio \ invisible_watermark && \ - python3 -m uv pip install --no-cache-dir \ + python3.10 -m uv pip install --no-cache-dir \ accelerate \ datasets \ hf-doc-builder \ diff --git a/docs/README.md b/docs/README.md index e7aa8c4f682b..f36b76fb0789 100644 --- a/docs/README.md +++ b/docs/README.md @@ -242,10 +242,10 @@ Here's an example of a tuple return, comprising several objects: ``` Returns: - `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs: - - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` -- + `tuple(torch.Tensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs: + - ** loss** (*optional*, returned when `masked_lm_labels` is provided) 
`torch.Tensor` of shape `(1,)` -- Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) -- + - **prediction_scores** (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`) -- Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). ``` diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 1c21d4cd9f74..2f4651ba3417 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -305,6 +305,8 @@ title: Personalized Image Animator (PIA) - local: api/pipelines/pixart title: PixArt-α + - local: api/pipelines/pixart_sigma + title: PixArt-Σ - local: api/pipelines/self_attention_guidance title: Self-Attention Guidance - local: api/pipelines/semantic_stable_diffusion @@ -439,6 +441,8 @@ title: Utilities - local: api/image_processor title: VAE Image Processor + - local: api/video_processor + title: Video Processor title: Internal classes isExpanded: false title: API diff --git a/docs/source/en/api/image_processor.md b/docs/source/en/api/image_processor.md index e5aba851bb5c..e633a936103d 100644 --- a/docs/source/en/api/image_processor.md +++ b/docs/source/en/api/image_processor.md @@ -25,3 +25,11 @@ All pipelines with [`VaeImageProcessor`] accept PIL Image, PyTorch tensor, or Nu The [`VaeImageProcessorLDM3D`] accepts RGB and depth inputs and returns RGB and depth outputs. [[autodoc]] image_processor.VaeImageProcessorLDM3D + +## PixArtImageProcessor + +[[autodoc]] image_processor.PixArtImageProcessor + +## IPAdapterMaskProcessor + +[[autodoc]] image_processor.IPAdapterMaskProcessor diff --git a/docs/source/en/api/loaders/single_file.md b/docs/source/en/api/loaders/single_file.md index 359ed60a996b..34190f0244f4 100644 --- a/docs/source/en/api/loaders/single_file.md +++ b/docs/source/en/api/loaders/single_file.md @@ -10,13 +10,134 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Single files +# Loading Pipelines and Models via `from_single_file` -Diffusers supports loading pretrained pipeline (or model) weights stored in a single file, such as a `ckpt` or `safetensors` file. These single file types are typically produced from community trained models. There are three classes for loading single file weights: +The `from_single_file` method allows you to load supported pipelines using a single checkpoint file as opposed to Diffusers' multiple folders format. This is useful if you are working with Stable Diffusion Web UI's (such as A1111) that rely on a single file format to distribute all the components of a model. -- [`FromSingleFileMixin`] supports loading pretrained pipeline weights stored in a single file, which can either be a `ckpt` or `safetensors` file. -- [`FromOriginalVAEMixin`] supports loading a pretrained [`AutoencoderKL`] from pretrained ControlNet weights stored in a single file, which can either be a `ckpt` or `safetensors` file. -- [`FromOriginalControlnetMixin`] supports loading pretrained ControlNet weights stored in a single file, which can either be a `ckpt` or `safetensors` file. +The `from_single_file` method also supports loading models in their originally distributed format. 
This means that supported models that have been finetuned with other services can be loaded directly into Diffusers model objects and pipelines. + +## Pipelines that currently support `from_single_file` loading + +- [`StableDiffusionPipeline`] +- [`StableDiffusionImg2ImgPipeline`] +- [`StableDiffusionInpaintPipeline`] +- [`StableDiffusionControlNetPipeline`] +- [`StableDiffusionControlNetImg2ImgPipeline`] +- [`StableDiffusionControlNetInpaintPipeline`] +- [`StableDiffusionUpscalePipeline`] +- [`StableDiffusionXLPipeline`] +- [`StableDiffusionXLImg2ImgPipeline`] +- [`StableDiffusionXLInpaintPipeline`] +- [`StableDiffusionXLInstructPix2PixPipeline`] +- [`StableDiffusionXLControlNetPipeline`] +- [`StableDiffusionXLKDiffusionPipeline`] +- [`LatentConsistencyModelPipeline`] +- [`LatentConsistencyModelImg2ImgPipeline`] +- [`StableDiffusionControlNetXSPipeline`] +- [`StableDiffusionXLControlNetXSPipeline`] +- [`LEditsPPPipelineStableDiffusion`] +- [`LEditsPPPipelineStableDiffusionXL`] +- [`PIAPipeline`] + +## Models that currently support `from_single_file` loading + +- [`UNet2DConditionModel`] +- [`StableCascadeUNet`] +- [`AutoencoderKL`] +- [`ControlNetModel`] + +## Usage Examples + +## Loading a Pipeline using `from_single_file` + +```python +from diffusers import StableDiffusionXLPipeline + +ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors" +pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path) +``` + +## Setting components in a Pipeline using `from_single_file` + +Set components of a pipeline by passing them directly to the `from_single_file` method. For example, here we are swapping out the pipeline's default scheduler with the `DDIMScheduler`. + +```python +from diffusers import StableDiffusionXLPipeline, DDIMScheduler + +ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors" + +scheduler = DDIMScheduler() +pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, scheduler=scheduler) + +``` + +Here we are passing in a ControlNet model to the `StableDiffusionControlNetPipeline`. + +```python +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel + +ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" + +controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") +pipe = StableDiffusionControlNetPipeline.from_single_file(ckpt_path, controlnet=controlnet) + +``` + +## Loading a Model using `from_single_file` + +```python +from diffusers import StableCascadeUNet + +ckpt_path = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_lite.safetensors" +model = StableCascadeUNet.from_single_file(ckpt_path) + +``` + +## Using a Diffusers model repository to configure single file loading + +Under the hood, `from_single_file` will try to automatically determine a model repository to use to configure the components of a pipeline. You can also explicitly set the model repository to configure the pipeline with the `config` argument. 
+ +```python +from diffusers import StableDiffusionXLPipeline + +ckpt_path = "https://huggingface.co/segmind/SSD-1B/blob/main/SSD-1B.safetensors" +repo_id = "segmind/SSD-1B" + +pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, config=repo_id) + +``` + +In the example above, since we explicitly passed `repo_id="segmind/SSD-1B"` to the `config` argument, it will use this [configuration file](https://huggingface.co/segmind/SSD-1B/blob/main/unet/config.json) from the `unet` subfolder in `"segmind/SSD-1B"` to configure the `unet` component of the pipeline; Similarly, it will use the `config.json` file from `vae` subfolder to configure the `vae` model, `config.json` file from `text_encoder` folder to configure `text_encoder` and so on. + + + +Most of the time you do not need to explicitly set a `config` argument. `from_single_file` will automatically map the checkpoint to the appropriate model repository. However, this option can be useful in cases where model components in the checkpoint might have been changed from what was originally distributed, or in cases where a checkpoint file might not have the necessary metadata to correctly determine the configuration to use for the pipeline. + + + +## Override configuration options when using single file loading + +Override the default model or pipeline configuration options by providing the relevant arguments directly to the `from_single_file` method. Any argument supported by the model or pipeline class can be configured in this way: + +### Setting a pipeline configuration option + +```python +from diffusers import StableDiffusionXLInstructPix2PixPipeline + +ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors" +pipe = StableDiffusionXLInstructPix2PixPipeline.from_single_file(ckpt_path, config="diffusers/sdxl-instructpix2pix-768", is_cosxl_edit=True) + +``` + +### Setting a model configuration option + +```python +from diffusers import UNet2DConditionModel + +ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors" +model = UNet2DConditionModel.from_single_file(ckpt_path, upcast_attention=True) + +``` @@ -24,14 +145,116 @@ To learn more about how to load single file weights, see the [Load different Sta -## FromSingleFileMixin +## Working with local files -[[autodoc]] loaders.single_file.FromSingleFileMixin +As of `diffusers>=0.28.0` the `from_single_file` method will attempt to configure a pipeline or model by first inferring the model type from the keys in the checkpoint file. This inferred model type is then used to determine the appropriate model repository on the Hugging Face Hub to configure the model or pipeline. + +For example, any single file checkpoint based on the Stable Diffusion XL base model will use the [`stabilityai/stable-diffusion-xl-base-1.0`](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) model repository to configure the pipeline. + +If you are working in an environment with restricted internet access, it is recommended that you download the config files and checkpoints for the model to your preferred directory and pass the local paths to the `pretrained_model_link_or_path` and `config` arguments of the `from_single_file` method. 
+ +```python +from huggingface_hub import hf_hub_download, snapshot_download + +my_local_checkpoint_path = hf_hub_download( + repo_id="segmind/SSD-1B", + filename="SSD-1B.safetensors" +) + +my_local_config_path = snapshot_download( + repo_id="segmind/SSD-1B", + allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"] +) + +pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True) + +``` + +By default this will download the checkpoints and config files to the [Hugging Face Hub cache directory](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache). You can also specify a local directory to download the files to by passing the `local_dir` argument to the `hf_hub_download` and `snapshot_download` functions. + +```python +from huggingface_hub import hf_hub_download, snapshot_download + +my_local_checkpoint_path = hf_hub_download( + repo_id="segmind/SSD-1B", + filename="SSD-1B.safetensors" + local_dir="my_local_checkpoints" +) + +my_local_config_path = snapshot_download( + repo_id="segmind/SSD-1B", + allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"] + local_dir="my_local_config" +) + +pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True) + +``` + +## Working with local files on file systems that do not support symlinking + +By default the `from_single_file` method relies on the `huggingface_hub` caching mechanism to fetch and store checkpoints and config files for models and pipelines. If you are working with a file system that does not support symlinking, it is recommended that you first download the checkpoint file to a local directory and disable symlinking by passing the `local_dir_use_symlink=False` argument to the `hf_hub_download` and `snapshot_download` functions. + +```python +from huggingface_hub import hf_hub_download, snapshot_download + +my_local_checkpoint_path = hf_hub_download( + repo_id="segmind/SSD-1B", + filename="SSD-1B.safetensors" + local_dir="my_local_checkpoints", + local_dir_use_symlinks=False +) +print("My local checkpoint: ", my_local_checkpoint_path) + +my_local_config_path = snapshot_download( + repo_id="segmind/SSD-1B", + allowed_patterns=["*.json", "**/*.json", "*.txt", "**/*.txt"] + local_dir_use_symlinks=False, +) +print("My local config: ", my_local_config_path) + +``` -## FromOriginalVAEMixin +Then pass the local paths to the `pretrained_model_link_or_path` and `config` arguments of the `from_single_file` method. -[[autodoc]] loaders.autoencoder.FromOriginalVAEMixin +```python +pipe = StableDiffusionXLPipeline.from_single_file(my_local_checkpoint_path, config=my_local_config_path, local_files_only=True) + +``` + + + +As of `huggingface_hub>=0.23.0` the `local_dir_use_symlinks` argument isn't necessary for the `hf_hub_download` and `snapshot_download` functions. + + + +## Using the original configuration file of a model + +If you would like to configure the model components in a pipeline using the orignal YAML configuration file, you can pass a local path or url to the original configuration file via the `original_config` argument. 
+ +```python +from diffusers import StableDiffusionXLPipeline + +ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors" +repo_id = "stabilityai/stable-diffusion-xl-base-1.0" +original_config = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" + +pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, original_config=original_config) +``` + + + +When using `original_config` with `local_files_only=True`, Diffusers will attempt to infer the components of the pipeline based on the type signatures of pipeline class, rather than attempting to fetch the configuration files from a model repository on the Hugging Face Hub. This is to prevent backward breaking changes in existing code that might not be able to connect to the internet to fetch the necessary configuration files. + +This is not as reliable as providing a path to a local model repository using the `config` argument and might lead to errors when configuring the pipeline. To avoid this, please run the pipeline with `local_files_only=False` once to download the appropriate pipeline configuration files to the local cache. + + + + +## FromSingleFileMixin + +[[autodoc]] loaders.single_file.FromSingleFileMixin -## FromOriginalControlnetMixin +## FromOriginalModelMixin -[[autodoc]] loaders.controlnet.FromOriginalControlNetMixin \ No newline at end of file +[[autodoc]] loaders.single_file_model.FromOriginalModelMixin diff --git a/docs/source/en/api/pipelines/pixart.md b/docs/source/en/api/pipelines/pixart.md index ef50b1744d71..b0152570b9b1 100644 --- a/docs/source/en/api/pipelines/pixart.md +++ b/docs/source/en/api/pipelines/pixart.md @@ -31,7 +31,7 @@ Some notes about this pipeline: -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. +Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. diff --git a/docs/source/en/api/pipelines/pixart_sigma.md b/docs/source/en/api/pipelines/pixart_sigma.md new file mode 100644 index 000000000000..854b9781e92e --- /dev/null +++ b/docs/source/en/api/pipelines/pixart_sigma.md @@ -0,0 +1,151 @@ + + +# PixArt-Σ + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pixart/header_collage_sigma.jpg) + +[PixArt-Σ: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation](https://huggingface.co/papers/2403.04692) is Junsong Chen, Jincheng Yu, Chongjian Ge, Lewei Yao, Enze Xie, Yue Wu, Zhongdao Wang, James Kwok, Ping Luo, Huchuan Lu, and Zhenguo Li. + +The abstract from the paper is: + +*In this paper, we introduce PixArt-Σ, a Diffusion Transformer model (DiT) capable of directly generating images at 4K resolution. PixArt-Σ represents a significant advancement over its predecessor, PixArt-α, offering images of markedly higher fidelity and improved alignment with text prompts. A key feature of PixArt-Σ is its training efficiency. 
Leveraging the foundational pre-training of PixArt-α, it evolves from the ‘weaker’ baseline to a ‘stronger’ model via incorporating higher quality data, a process we term “weak-to-strong training”. The advancements in PixArt-Σ are twofold: (1) High-Quality Training Data: PixArt-Σ incorporates superior-quality image data, paired with more precise and detailed image captions. (2) Efficient Token Compression: we propose a novel attention module within the DiT framework that compresses both keys and values, significantly improving efficiency and facilitating ultra-high-resolution image generation. Thanks to these improvements, PixArt-Σ achieves superior image quality and user prompt adherence capabilities with significantly smaller model size (0.6B parameters) than existing text-to-image diffusion models, such as SDXL (2.6B parameters) and SD Cascade (5.1B parameters). Moreover, PixArt-Σ’s capability to generate 4K images supports the creation of high-resolution posters and wallpapers, efficiently bolstering the production of highquality visual content in industries such as film and gaming.* + +You can find the original codebase at [PixArt-alpha/PixArt-sigma](https://github.com/PixArt-alpha/PixArt-sigma) and all the available checkpoints at [PixArt-alpha](https://huggingface.co/PixArt-alpha). + +Some notes about this pipeline: + +* It uses a Transformer backbone (instead of a UNet) for denoising. As such it has a similar architecture as [DiT](https://hf.co/docs/transformers/model_doc/dit). +* It was trained using text conditions computed from T5. This aspect makes the pipeline better at following complex text prompts with intricate details. +* It is good at producing high-resolution images at different aspect ratios. To get the best results, the authors recommend some size brackets which can be found [here](https://github.com/PixArt-alpha/PixArt-sigma/blob/master/diffusion/data/datasets/utils.py). +* It rivals the quality of state-of-the-art text-to-image generation systems (as of this writing) such as PixArt-α, Stable Diffusion XL, Playground V2.0 and DALL-E 3, while being more efficient than them. +* It shows the ability of generating super high resolution images, such as 2048px or even 4K. +* It shows that text-to-image models can grow from a weak model to a stronger one through several improvements (VAEs, datasets, and so on.) + + + +Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. + + + +## Inference with under 8GB GPU VRAM + +Run the [`PixArtSigmaPipeline`] with under 8GB GPU VRAM by loading the text encoder in 8-bit precision. Let's walk through a full-fledged example. 
+ +First, install the [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) library: + +```bash +pip install -U bitsandbytes +``` + +Then load the text encoder in 8-bit: + +```python +from transformers import T5EncoderModel +from diffusers import PixArtSigmaPipeline +import torch + +text_encoder = T5EncoderModel.from_pretrained( + "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", + subfolder="text_encoder", + load_in_8bit=True, + device_map="auto", + +) +pipe = PixArtSigmaPipeline.from_pretrained( + "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", + text_encoder=text_encoder, + transformer=None, + device_map="balanced" +) +``` + +Now, use the `pipe` to encode a prompt: + +```python +with torch.no_grad(): + prompt = "cute cat" + prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt) +``` + +Since text embeddings have been computed, remove the `text_encoder` and `pipe` from the memory, and free up som GPU VRAM: + +```python +import gc + +def flush(): + gc.collect() + torch.cuda.empty_cache() + +del text_encoder +del pipe +flush() +``` + +Then compute the latents with the prompt embeddings as inputs: + +```python +pipe = PixArtSigmaPipeline.from_pretrained( + "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", + text_encoder=None, + torch_dtype=torch.float16, +).to("cuda") + +latents = pipe( + negative_prompt=None, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + prompt_attention_mask=prompt_attention_mask, + negative_prompt_attention_mask=negative_prompt_attention_mask, + num_images_per_prompt=1, + output_type="latent", +).images + +del pipe.transformer +flush() +``` + + + +Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded. + + + +Once the latents are computed, pass it off to the VAE to decode into a real image: + +```python +with torch.no_grad(): + image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0] +image = pipe.image_processor.postprocess(image, output_type="pil")[0] +image.save("cat.png") +``` + +By deleting components you aren't using and flushing the GPU VRAM, you should be able to run [`PixArtSigmaPipeline`] with under 8GB GPU VRAM. + +![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pixart/8bits_cat.png) + +If you want a report of your memory-usage, run this [script](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e). + + + +Text embeddings computed in 8-bit can impact the quality of the generated images because of the information loss in the representation space caused by the reduced precision. It's recommended to compare the outputs with and without 8-bit. + + + +While loading the `text_encoder`, you set `load_in_8bit` to `True`. You could also specify `load_in_4bit` to bring your memory requirements down even further to under 7GB. + +## PixArtSigmaPipeline + +[[autodoc]] PixArtSigmaPipeline + - all + - __call__ + \ No newline at end of file diff --git a/docs/source/en/api/video_processor.md b/docs/source/en/api/video_processor.md new file mode 100644 index 000000000000..6461c46c286f --- /dev/null +++ b/docs/source/en/api/video_processor.md @@ -0,0 +1,21 @@ + + +# Video Processor + +The [`VideoProcessor`] provides a unified API for video pipelines to prepare inputs for VAE encoding and post-processing outputs once they're decoded. 
The class inherits [`VaeImageProcessor`] so it includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays. + +## VideoProcessor + +[[autodoc]] video_processor.VideoProcessor.preprocess_video + +[[autodoc]] video_processor.VideoProcessor.postprocess_video diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index e34db1bed49b..74cfa70d70fc 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -112,7 +112,7 @@ pip install -e ".[flax]" These commands will link the folder you cloned the repository to and your Python library paths. Python will now look inside the folder you cloned to in addition to the normal library paths. -For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.8/site-packages/`, Python will also search the `~/diffusers/` folder you cloned to. +For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.10/site-packages/`, Python will also search the `~/diffusers/` folder you cloned to. diff --git a/docs/source/en/optimization/fp16.md b/docs/source/en/optimization/fp16.md index b21b61368826..90a7233b8d3d 100644 --- a/docs/source/en/optimization/fp16.md +++ b/docs/source/en/optimization/fp16.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # Speed up inference -There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attetntion](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times. +There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times. > [!TIP] > Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide. 
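A minimal sketch of the half-precision loading described above (the checkpoint and prompt are illustrative); on PyTorch 2.0 and later, scaled dot product attention is used automatically, so no extra call is needed:

```python
import torch
from diffusers import DiffusionPipeline

# Load the weights in float16 instead of the default float32 to reduce memory and speed up inference.
pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    variant="fp16",  # fetch the fp16 weights if the repository provides them
).to("cuda")

# PyTorch 2.0+ dispatches to scaled dot product attention by default; no additional configuration is required.
image = pipeline("a photo of an astronaut riding a horse on mars").images[0]
```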
diff --git a/docs/source/en/optimization/memory.md b/docs/source/en/optimization/memory.md index 6b2a22b315c8..e3f4d2652d0f 100644 --- a/docs/source/en/optimization/memory.md +++ b/docs/source/en/optimization/memory.md @@ -261,7 +261,7 @@ from dataclasses import dataclass @dataclass class UNet2DConditionOutput: - sample: torch.FloatTensor + sample: torch.Tensor pipe = StableDiffusionPipeline.from_pretrained( diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md index d208ddfa8411..90e0bc32f71b 100644 --- a/docs/source/en/optimization/tgate.md +++ b/docs/source/en/optimization/tgate.md @@ -6,7 +6,7 @@ Before you begin, make sure you install T-GATE. ```bash pip install tgate -pip install -U pytorch diffusers transformers accelerate DeepCache +pip install -U torch diffusers transformers accelerate DeepCache ``` @@ -46,12 +46,12 @@ pipe = TgatePixArtLoader( image = pipe.tgate( "An alpaca made of colorful building blocks, cyberpunk.", - gate_step=gate_step, + gate_step=gate_step, num_inference_steps=inference_step, ).images[0] ``` - + Accelerate `StableDiffusionXLPipeline` with T-GATE: @@ -78,9 +78,9 @@ pipe = TgateSDXLLoader( ).to("cuda") image = pipe.tgate( - "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.", - gate_step=gate_step, - num_inference_steps=inference_step + "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.", + gate_step=gate_step, + num_inference_steps=inference_step ).images[0] ``` @@ -111,9 +111,9 @@ pipe = TgateSDXLDeepCacheLoader( ).to("cuda") image = pipe.tgate( - "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.", - gate_step=gate_step, - num_inference_steps=inference_step + "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.", + gate_step=gate_step, + num_inference_steps=inference_step ).images[0] ``` @@ -151,9 +151,9 @@ pipe = TgateSDXLLoader( ).to("cuda") image = pipe.tgate( - "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.", - gate_step=gate_step, - num_inference_steps=inference_step + "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.", + gate_step=gate_step, + num_inference_steps=inference_step ).images[0] ``` diff --git a/docs/source/en/tutorials/basic_training.md b/docs/source/en/tutorials/basic_training.md index c97ae2d62f3b..4f7c29cfe513 100644 --- a/docs/source/en/tutorials/basic_training.md +++ b/docs/source/en/tutorials/basic_training.md @@ -260,7 +260,7 @@ Then, you'll need a way to evaluate the model. For evaluation, you can use the [ ... # The default pipeline output type is `List[PIL.Image]` ... images = pipeline( ... batch_size=config.eval_batch_size, -... generator=torch.manual_seed(config.seed), +... generator=torch.Generator(device='cpu').manual_seed(config.seed), # Use a separate torch generator to avoid rewinding the random state of the main training loop ... ).images ... # Make a grid out of the images diff --git a/docs/source/en/using-diffusers/callback.md b/docs/source/en/using-diffusers/callback.md index 3f3e8dae9f2d..7445513dbf4b 100644 --- a/docs/source/en/using-diffusers/callback.md +++ b/docs/source/en/using-diffusers/callback.md @@ -19,13 +19,74 @@ The denoising loop of a pipeline can be modified with custom defined functions u This guide will demonstrate how callbacks work by a few features you can implement with them. +## Official callbacks + +We provide a list of callbacks you can plug into an existing pipeline and modify the denoising loop. 
This is the current list of official callbacks: + +- `SDCFGCutoffCallback`: Disables the CFG after a certain number of steps for all SD 1.5 pipelines, including text-to-image, image-to-image, inpaint, and controlnet. +- `SDXLCFGCutoffCallback`: Disables the CFG after a certain number of steps for all SDXL pipelines, including text-to-image, image-to-image, inpaint, and controlnet. +- `IPAdapterScaleCutoffCallback`: Disables the IP Adapter after a certain number of steps for all pipelines supporting IP-Adapter. + +> [!TIP] +> If you want to add a new official callback, feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) or [submit a PR](https://huggingface.co/docs/diffusers/main/en/conceptual/contribution#how-to-open-a-pr). + +To set up a callback, you need to specify the number of denoising steps after which the callback comes into effect. You can do so by using either one of these two arguments + +- `cutoff_step_ratio`: Float number with the ratio of the steps. +- `cutoff_step_index`: Integer number with the exact number of the step. + +```python +import torch + +from diffusers import DPMSolverMultistepScheduler, StableDiffusionXLPipeline +from diffusers.callbacks import SDXLCFGCutoffCallback + + +callback = SDXLCFGCutoffCallback(cutoff_step_ratio=0.4) +# can also be used with cutoff_step_index +# callback = SDXLCFGCutoffCallback(cutoff_step_ratio=None, cutoff_step_index=10) + +pipeline = StableDiffusionXLPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + torch_dtype=torch.float16, + variant="fp16", +).to("cuda") +pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, use_karras_sigmas=True) + +prompt = "a sports car at the road, best quality, high quality, high detail, 8k resolution" + +generator = torch.Generator(device="cpu").manual_seed(2628670641) + +out = pipeline( + prompt=prompt, + negative_prompt="", + guidance_scale=6.5, + num_inference_steps=25, + generator=generator, + callback_on_step_end=callback, +) + +out.images[0].save("official_callback.png") +``` + +
+<figure>
+  <!-- generated image of a sports car at the road -->
+  <figcaption>without SDXLCFGCutoffCallback</figcaption>
+</figure>
+<figure>
+  <!-- generated image of a sports car at the road with cfg callback -->
+  <figcaption>with SDXLCFGCutoffCallback</figcaption>
+</figure>
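The other official callbacks are wired up the same way. As a minimal sketch (the checkpoint, adapter weights, reference image, scale, and cutoff step below are illustrative values, not part of this diff), `IPAdapterScaleCutoffCallback` can be attached to an IP-Adapter-enabled pipeline so the adapter only influences the early denoising steps:

```python
import torch
from PIL import Image
from diffusers import AutoPipelineForText2Image
from diffusers.callbacks import IPAdapterScaleCutoffCallback

# Disable the IP-Adapter influence after the 5th denoising step (illustrative value).
callback = IPAdapterScaleCutoffCallback(cutoff_step_ratio=None, cutoff_step_index=5)

pipeline = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
pipeline.set_ip_adapter_scale(0.6)

reference_image = Image.open("reference.png")  # any reference image for the adapter to follow

image = pipeline(
    prompt="a cat, masterpiece, best quality, high quality",
    ip_adapter_image=reference_image,
    num_inference_steps=25,
    callback_on_step_end=callback,
).images[0]
image.save("ip_adapter_cutoff_callback.png")
```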
+ ## Dynamic classifier-free guidance Dynamic classifier-free guidance (CFG) is a feature that allows you to disable CFG after a certain number of inference steps which can help you save compute with minimal cost to performance. The callback function for this should have the following arguments: -* `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`. -* `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`. -* `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly. +- `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`. +- `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`. +- `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly. Your callback function should look something like this: diff --git a/docs/source/en/using-diffusers/inference_with_tcd_lora.md b/docs/source/en/using-diffusers/inference_with_tcd_lora.md index 10ad674e73ac..df49fc8475ad 100644 --- a/docs/source/en/using-diffusers/inference_with_tcd_lora.md +++ b/docs/source/en/using-diffusers/inference_with_tcd_lora.md @@ -78,7 +78,7 @@ image = pipe( prompt=prompt, num_inference_steps=4, guidance_scale=0, - eta=0.3, + eta=0.3, generator=torch.Generator(device=device).manual_seed(0), ).images[0] ``` @@ -156,14 +156,14 @@ image = pipe( prompt=prompt, num_inference_steps=8, guidance_scale=0, - eta=0.3, + eta=0.3, generator=torch.Generator(device=device).manual_seed(0), ).images[0] ``` ![](https://github.com/jabir-zheng/TCD/raw/main/assets/animagine_xl.png) -TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method. +TCD-LoRA also supports other LoRAs trained on different styles. 
For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method. > [!TIP] > Check out the [Merge LoRAs](merge_loras) guide to learn more about efficient merging methods. @@ -171,7 +171,7 @@ TCD-LoRA also supports other LoRAs trained on different styles. For example, let ```python import torch from diffusers import StableDiffusionXLPipeline -from scheduling_tcd import TCDScheduler +from scheduling_tcd import TCDScheduler device = "cuda" base_model_id = "stabilityai/stable-diffusion-xl-base-1.0" @@ -191,7 +191,7 @@ image = pipe( prompt=prompt, num_inference_steps=4, guidance_scale=0, - eta=0.3, + eta=0.3, generator=torch.Generator(device=device).manual_seed(0), ).images[0] ``` @@ -215,7 +215,7 @@ from PIL import Image from transformers import DPTFeatureExtractor, DPTForDepthEstimation from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline from diffusers.utils import load_image, make_image_grid -from scheduling_tcd import TCDScheduler +from scheduling_tcd import TCDScheduler device = "cuda" depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device) @@ -249,13 +249,13 @@ controlnet = ControlNetModel.from_pretrained( controlnet_id, torch_dtype=torch.float16, variant="fp16", -).to(device) +) pipe = StableDiffusionXLControlNetPipeline.from_pretrained( base_model_id, controlnet=controlnet, torch_dtype=torch.float16, variant="fp16", -).to(device) +) pipe.enable_model_cpu_offload() pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config) @@ -271,9 +271,9 @@ depth_image = get_depth_map(image) controlnet_conditioning_scale = 0.5 # recommended for good generalization image = pipe( - prompt, - image=depth_image, - num_inference_steps=4, + prompt, + image=depth_image, + num_inference_steps=4, guidance_scale=0, eta=0.3, controlnet_conditioning_scale=controlnet_conditioning_scale, @@ -290,7 +290,7 @@ grid_image = make_image_grid([depth_image, image], rows=1, cols=2) import torch from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline from diffusers.utils import load_image, make_image_grid -from scheduling_tcd import TCDScheduler +from scheduling_tcd import TCDScheduler device = "cuda" base_model_id = "stabilityai/stable-diffusion-xl-base-1.0" @@ -301,13 +301,13 @@ controlnet = ControlNetModel.from_pretrained( controlnet_id, torch_dtype=torch.float16, variant="fp16", -).to(device) +) pipe = StableDiffusionXLControlNetPipeline.from_pretrained( base_model_id, controlnet=controlnet, torch_dtype=torch.float16, variant="fp16", -).to(device) +) pipe.enable_model_cpu_offload() pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config) @@ -322,9 +322,9 @@ canny_image = load_image("https://huggingface.co/datasets/hf-internal-testing/di controlnet_conditioning_scale = 0.5 # recommended for good generalization image = pipe( - prompt, - image=canny_image, - num_inference_steps=4, + prompt, + image=canny_image, + num_inference_steps=4, guidance_scale=0, eta=0.3, controlnet_conditioning_scale=controlnet_conditioning_scale, @@ -336,7 +336,7 @@ grid_image = make_image_grid([canny_image, image], rows=1, cols=2) ![](https://github.com/jabir-zheng/TCD/raw/main/assets/controlnet_canny_tcd.png) -The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and 
`cross_attention_kwargs` parameters and choose the best one. +The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. @@ -350,7 +350,7 @@ from diffusers import StableDiffusionXLPipeline from diffusers.utils import load_image, make_image_grid from ip_adapter import IPAdapterXL -from scheduling_tcd import TCDScheduler +from scheduling_tcd import TCDScheduler device = "cuda" base_model_path = "stabilityai/stable-diffusion-xl-base-1.0" @@ -359,8 +359,8 @@ ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin" tcd_lora_id = "h1t/TCD-SDXL-LoRA" pipe = StableDiffusionXLPipeline.from_pretrained( - base_model_path, - torch_dtype=torch.float16, + base_model_path, + torch_dtype=torch.float16, variant="fp16" ) pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config) @@ -375,13 +375,13 @@ ref_image = load_image("https://raw.githubusercontent.com/tencent-ailab/IP-Adapt prompt = "best quality, high quality, wearing sunglasses" image = ip_model.generate( - pil_image=ref_image, + pil_image=ref_image, prompt=prompt, scale=0.5, - num_samples=1, - num_inference_steps=4, + num_samples=1, + num_inference_steps=4, guidance_scale=0, - eta=0.3, + eta=0.3, seed=0, )[0] diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md index 193f5a6d9fe4..ba43325f5305 100644 --- a/docs/source/en/using-diffusers/inpaint.md +++ b/docs/source/en/using-diffusers/inpaint.md @@ -230,7 +230,7 @@ from diffusers.utils import load_image, make_image_grid pipeline = AutoPipelineForInpainting.from_pretrained( "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16" -).to("cuda") +) pipeline.enable_model_cpu_offload() # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed pipeline.enable_xformers_memory_efficient_attention() @@ -255,7 +255,7 @@ from diffusers.utils import load_image, make_image_grid pipeline = AutoPipelineForInpainting.from_pretrained( "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16" -).to("cuda") +) pipeline.enable_model_cpu_offload() # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed pipeline.enable_xformers_memory_efficient_attention() @@ -296,7 +296,7 @@ from diffusers.utils import load_image, make_image_grid pipeline = AutoPipelineForInpainting.from_pretrained( "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16" -).to("cuda") +) pipeline.enable_model_cpu_offload() # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed pipeline.enable_xformers_memory_efficient_attention() @@ -319,7 +319,7 @@ from diffusers.utils import load_image, make_image_grid pipeline = AutoPipelineForInpainting.from_pretrained( "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16" -).to("cuda") +) pipeline.enable_model_cpu_offload() # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed pipeline.enable_xformers_memory_efficient_attention() diff --git a/docs/source/en/using-diffusers/schedulers.md b/docs/source/en/using-diffusers/schedulers.md index 01dab2bed7fe..bfc8aa1a2108 100644 --- a/docs/source/en/using-diffusers/schedulers.md +++ b/docs/source/en/using-diffusers/schedulers.md @@ -212,6 +212,62 @@ images = pipeline(prompt_ids, 
params, prng_seed, num_inference_steps, jit=True).images
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
```
+## Custom Timestep Schedules
+
+With all our schedulers, you can choose one of the popular timestep schedules using configurations such as `timestep_spacing`, `interpolation_type`, and `use_karras_sigmas`. Some schedulers also provide the flexibility to use a custom timestep schedule. You can use any list of arbitrary timesteps; here we will use the AYS timestep schedule as an example. AYS is a set of 10-step optimized timestep schedules released by researchers from Nvidia that can achieve significantly better quality than the preset timestep schedules. You can read more about their research [here](https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/).
+
+```python
+from diffusers.schedulers import AysSchedules
+sampling_schedule = AysSchedules["StableDiffusionXLTimesteps"]
+print(sampling_schedule)
+```
+```
+[999, 845, 730, 587, 443, 310, 193, 116, 53, 13]
+```
+
+You can then create a pipeline and pass this custom timestep schedule to it as `timesteps`.
+
+```python
+import torch
+from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "SG161222/RealVisXL_V4.0",
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda")
+
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, algorithm_type="sde-dpmsolver++")
+
+prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
+
+generator = torch.Generator(device="cpu").manual_seed(2487854446)
+
+image = pipe(
+    prompt=prompt,
+    negative_prompt="",
+    generator=generator,
+    timesteps=sampling_schedule,
+).images[0]
+```
+The generated image has better quality than one produced with the default linearly-spaced schedule for the same number of steps, and is comparable to the default schedule run for 25 steps.
+
+
+[Figure: AYS timestep schedule 10 steps]
+[Figure: Linearly-spaced timestep schedule 10 steps]
+[Figure: Linearly-spaced timestep schedule 25 steps]
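A minimal sketch of how the comparison shown in the figures above could be reproduced; it is an illustrative aside rather than part of the documented example, and it reuses the `pipe`, `prompt`, and `sampling_schedule` objects defined in the snippet earlier in this section (the seed is the one used there).

```python
# Baseline: same prompt and seed with the scheduler's default 10-step spacing.
default_image = pipe(
    prompt=prompt,
    negative_prompt="",
    generator=torch.Generator(device="cpu").manual_seed(2487854446),
    num_inference_steps=10,
).images[0]

# AYS: same prompt and seed, but with the 10-step AYS schedule passed as `timesteps`.
ays_image = pipe(
    prompt=prompt,
    negative_prompt="",
    generator=torch.Generator(device="cpu").manual_seed(2487854446),
    timesteps=sampling_schedule,
).images[0]

# Hypothetical output paths, just for side-by-side inspection.
default_image.save("rabbit_default_10_steps.png")
ays_image.save("rabbit_ays_10_steps.png")
```

Creating a fresh generator with the same seed for each call keeps the two runs directly comparable.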
+ +> [!TIP] +> 🤗 Diffusers currently only supports `timesteps` and `sigmas` for a selected list of schedulers and pipelines, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to extend feature to a scheduler and pipeline that does not currently support it! + + ## Models Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them. diff --git a/docs/source/ja/installation.md b/docs/source/ja/installation.md index 392d42db6bda..ef0445635f4d 100644 --- a/docs/source/ja/installation.md +++ b/docs/source/ja/installation.md @@ -106,7 +106,7 @@ pip install -e ".[flax]" これらのコマンドは、リポジトリをクローンしたフォルダと Python のライブラリパスをリンクします。 Python は通常のライブラリパスに加えて、クローンしたフォルダの中を探すようになります。 -例えば、Python パッケージが通常 `~/anaconda3/envs/main/lib/python3.8/site-packages/` にインストールされている場合、Python はクローンした `~/diffusers/` フォルダも同様に参照します。 +例えば、Python パッケージが通常 `~/anaconda3/envs/main/lib/python3.10/site-packages/` にインストールされている場合、Python はクローンした `~/diffusers/` フォルダも同様に参照します。 diff --git a/docs/source/ko/installation.md b/docs/source/ko/installation.md index aff32e3f21fa..edb8b81c4fa2 100644 --- a/docs/source/ko/installation.md +++ b/docs/source/ko/installation.md @@ -105,7 +105,7 @@ pip install -e ".[flax]" 이러한 명령어들은 저장소를 복제한 폴더와 Python 라이브러리 경로를 연결합니다. Python은 이제 일반 라이브러리 경로에 더하여 복제한 폴더 내부를 살펴봅니다. -예를들어 Python 패키지가 `~/anaconda3/envs/main/lib/python3.8/site-packages/`에 설치되어 있는 경우 Python은 복제한 폴더인 `~/diffusers/`도 검색합니다. +예를들어 Python 패키지가 `~/anaconda3/envs/main/lib/python3.10/site-packages/`에 설치되어 있는 경우 Python은 복제한 폴더인 `~/diffusers/`도 검색합니다. diff --git a/docs/source/ko/optimization/fp16.md b/docs/source/ko/optimization/fp16.md index 2e58421c35b8..f7b2cf809552 100644 --- a/docs/source/ko/optimization/fp16.md +++ b/docs/source/ko/optimization/fp16.md @@ -339,7 +339,7 @@ from dataclasses import dataclass @dataclass class UNet2DConditionOutput: - sample: torch.FloatTensor + sample: torch.Tensor pipe = StableDiffusionPipeline.from_pretrained( diff --git a/docs/source/pt/installation.md b/docs/source/pt/installation.md index 0e45707acd23..3c263299920a 100644 --- a/docs/source/pt/installation.md +++ b/docs/source/pt/installation.md @@ -102,7 +102,7 @@ pip install -e ".[flax]" Esses comandos irá linkar a pasta que você clonou o repositório e os caminhos das suas bibliotecas Python. Python então irá procurar dentro da pasta que você clonou além dos caminhos normais das bibliotecas. -Por exemplo, se o pacote python for tipicamente instalado no `~/anaconda3/envs/main/lib/python3.8/site-packages/`, o Python também irá procurar na pasta `~/diffusers/` que você clonou. +Por exemplo, se o pacote python for tipicamente instalado no `~/anaconda3/envs/main/lib/python3.10/site-packages/`, o Python também irá procurar na pasta `~/diffusers/` que você clonou. 
diff --git a/docs/source/zh/installation.md b/docs/source/zh/installation.md index ab9be4f8a17b..2f53e18ef61b 100644 --- a/docs/source/zh/installation.md +++ b/docs/source/zh/installation.md @@ -107,7 +107,7 @@ pip install -e ".[flax]" 这些命令将连接到你克隆的版本库和你的 Python 库路径。 现在,不只是在通常的库路径,Python 还会在你克隆的文件夹内寻找包。 -例如,如果你的 Python 包通常安装在 `~/anaconda3/envs/main/lib/python3.8/Site-packages/`,Python 也会搜索你克隆到的文件夹。`~/diffusers/`。 +例如,如果你的 Python 包通常安装在 `~/anaconda3/envs/main/lib/python3.10/Site-packages/`,Python 也会搜索你克隆到的文件夹。`~/diffusers/`。 diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py index 6cdf2e7b21ab..143f67f9cb81 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py @@ -981,7 +981,7 @@ def collate_fn(examples, with_prior_preservation=False): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py index 0699ac17077d..8b5b00f3bd88 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py @@ -1136,7 +1136,7 @@ def collate_fn(examples, with_prior_preservation=False): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
+ """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/community/README.md b/examples/community/README.md index 5cebc4f9f049..600761aae730 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -68,6 +68,8 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif | InstantID Pipeline | Stable Diffusion XL Pipeline that supports InstantID | [InstantID Pipeline](#instantid-pipeline) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/InstantX/InstantID) | [Haofan Wang](https://github.com/haofanwang) | | UFOGen Scheduler | Scheduler for UFOGen Model (compatible with Stable Diffusion pipelines) | [UFOGen Scheduler](#ufogen-scheduler) | - | [dg845](https://github.com/dg845) | | Stable Diffusion XL IPEX Pipeline | Accelerate Stable Diffusion XL inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion XL on IPEX](#stable-diffusion-xl-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) | +| Stable Diffusion BoxDiff Pipeline | Training-free controlled generation with bounding boxes using [BoxDiff](https://github.com/showlab/BoxDiff) | [Stable Diffusion BoxDiff Pipeline](#stable-diffusion-boxdiff) | - | [Jingyang Zhang](https://github.com/zjysteven/) | +| FRESCO V2V Pipeline | Implementation of [[CVPR 2024] FRESCO: Spatial-Temporal Correspondence for Zero-Shot Video Translation](https://arxiv.org/abs/2403.12962) | [FRESCO V2V Pipeline](#fresco) | - | [Yifan Zhou](https://github.com/SingleZombie) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. @@ -238,12 +240,12 @@ pipeline_output = pipe( # denoising_steps=10, # (optional) Number of denoising steps of each inference pass. Default: 10. # ensemble_size=10, # (optional) Number of inference passes in the ensemble. Default: 10. # ------------------------------------------------ - + # ----- recommended setting for LCM version ------ # denoising_steps=4, # ensemble_size=5, # ------------------------------------------------- - + # processing_res=768, # (optional) Maximum resolution of processing. If set to 0: will not resize at all. Defaults to 768. # match_input_res=True, # (optional) Resize depth prediction to match input resolution. # batch_size=0, # (optional) Inference batch size, no bigger than `num_ensemble`. If set to 0, the script will automatically decide the proper batch size. Defaults to 0. @@ -1030,7 +1032,7 @@ image = pipe().images[0] Make sure you have @crowsonkb's installed: -``` +```sh pip install k-diffusion ``` @@ -1676,6 +1678,68 @@ image = pipe(prompt, image=input_image, strength=0.75,).images[0] image.save('tensorrt_img2img_new_zealand_hills.png') ``` +### Stable Diffusion BoxDiff +BoxDiff is a training-free method for controlled generation with bounding box coordinates. It shoud work with any Stable Diffusion model. Below shows an example with `stable-diffusion-2-1-base`. 
+```py +import torch +from PIL import Image, ImageDraw +from copy import deepcopy + +from examples.community.pipeline_stable_diffusion_boxdiff import StableDiffusionBoxDiffPipeline + +def draw_box_with_text(img, boxes, names): + colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"] + img_new = deepcopy(img) + draw = ImageDraw.Draw(img_new) + + W, H = img.size + for bid, box in enumerate(boxes): + draw.rectangle([box[0] * W, box[1] * H, box[2] * W, box[3] * H], outline=colors[bid % len(colors)], width=4) + draw.text((box[0] * W, box[1] * H), names[bid], fill=colors[bid % len(colors)]) + return img_new + +pipe = StableDiffusionBoxDiffPipeline.from_pretrained( + "stabilityai/stable-diffusion-2-1-base", + torch_dtype=torch.float16, +) +pipe.to("cuda") + +# example 1 +prompt = "as the aurora lights up the sky, a herd of reindeer leisurely wanders on the grassy meadow, admiring the breathtaking view, a serene lake quietly reflects the magnificent display, and in the distance, a snow-capped mountain stands majestically, fantasy, 8k, highly detailed" +phrases = [ + "aurora", + "reindeer", + "meadow", + "lake", + "mountain" +] +boxes = [[1,3,512,202], [75,344,421,495], [1,327,508,507], [2,217,507,341], [1,135,509,242]] + +# example 2 +# prompt = "A rabbit wearing sunglasses looks very proud" +# phrases = ["rabbit", "sunglasses"] +# boxes = [[67,87,366,512], [66,130,364,262]] + +boxes = [[x / 512 for x in box] for box in boxes] + +images = pipe( + prompt, + boxdiff_phrases=phrases, + boxdiff_boxes=boxes, + boxdiff_kwargs={ + "attention_res": 16, + "normalize_eot": True + }, + num_inference_steps=50, + guidance_scale=7.5, + generator=torch.manual_seed(42), + safety_checker=None +).images + +draw_box_with_text(images[0], boxes, phrases).save("output.png") +``` + + ### Stable Diffusion Reference This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280). @@ -1790,13 +1854,13 @@ To use this pipeline, you need to: You can simply use pip to install IPEX with the latest version. -```python +```sh python -m pip install intel_extension_for_pytorch ``` **Note:** To install a specific version, run with the following command: -``` +```sh python -m pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu ``` @@ -1894,13 +1958,13 @@ To use this pipeline, you need to: You can simply use pip to install IPEX with the latest version. 
-```python +```sh python -m pip install intel_extension_for_pytorch ``` **Note:** To install a specific version, run with the following command: -``` +```sh python -m pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu ``` @@ -2946,8 +3010,8 @@ This code implements a pipeline for the Stable Diffusion model, enabling the div ### Sample Code -``` -from from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline +```py +from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline pipe = RegionalPromptingStableDiffusionPipeline.from_single_file(model_path, vae=vae) rp_args = { @@ -3972,6 +4036,93 @@ onestep_image = pipe(prompt, num_inference_steps=1).images[0] multistep_image = pipe(prompt, num_inference_steps=4).images[0] ``` +### FRESCO + +This is the Diffusers implementation of zero-shot video-to-video translation pipeline [FRESCO](https://github.com/williamyang1991/FRESCO) (without Ebsynth postprocessing and background smooth). To run the code, please install gmflow. Then modify the path in `gmflow_dir`. After that, you can run the pipeline with: + +```py +from PIL import Image +import cv2 +import torch +import numpy as np + +from diffusers import ControlNetModel,DDIMScheduler, DiffusionPipeline +import sys +gmflow_dir = "/path/to/gmflow" +sys.path.insert(0, gmflow_dir) + +def video_to_frame(video_path: str, interval: int): + vidcap = cv2.VideoCapture(video_path) + success = True + + count = 0 + res = [] + while success: + count += 1 + success, image = vidcap.read() + if count % interval != 1: + continue + if image is not None: + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + res.append(image) + if len(res) >= 8: + break + + vidcap.release() + return res + + +input_video_path = 'https://github.com/williamyang1991/FRESCO/raw/main/data/car-turn.mp4' +output_video_path = 'car.gif' + +# You can use any fintuned SD here +model_path = 'SG161222/Realistic_Vision_V2.0' + +prompt = 'a red car turns in the winter' +a_prompt = ', RAW photo, subject, (high detailed skin:1.2), 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3, ' +n_prompt = '(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers:1.4), (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation' + +input_interval = 5 +frames = video_to_frame( + input_video_path, input_interval) + +control_frames = [] +# get canny image +for frame in frames: + image = cv2.Canny(frame, 50, 100) + np_image = np.array(image) + np_image = np_image[:, :, None] + np_image = np.concatenate([np_image, np_image, np_image], axis=2) + canny_image = Image.fromarray(np_image) + control_frames.append(canny_image) + +# You can use any ControlNet here +controlnet = ControlNetModel.from_pretrained( + "lllyasviel/sd-controlnet-canny").to('cuda') + +pipe = DiffusionPipeline.from_pretrained( + model_path, controlnet=controlnet, custom_pipeline='fresco_v2v').to('cuda') +pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + +generator = torch.manual_seed(0) +frames = [Image.fromarray(frame) for frame in frames] + +output_frames = pipe( + prompt + a_prompt, + frames, + control_frames, + num_inference_steps=20, + strength=0.75, + controlnet_conditioning_scale=0.7, + generator=generator, + negative_prompt=n_prompt 
+).images + +output_frames[0].save(output_video_path, save_all=True, + append_images=output_frames[1:], duration=100, loop=0) + +``` + # Perturbed-Attention Guidance [Project](https://ku-cvlab.github.io/Perturbed-Attention-Guidance/) / [arXiv](https://arxiv.org/abs/2403.17377) / [GitHub](https://github.com/KU-CVLAB/Perturbed-Attention-Guidance) @@ -3980,7 +4131,7 @@ This implementation is based on [Diffusers](https://huggingface.co/docs/diffuser ## Example Usage -``` +```py import os import torch diff --git a/examples/community/bit_diffusion.py b/examples/community/bit_diffusion.py index 18d5fca5619e..71d8f31163f2 100644 --- a/examples/community/bit_diffusion.py +++ b/examples/community/bit_diffusion.py @@ -44,9 +44,9 @@ def bits_to_decimal(x, bits=BITS): # modified scheduler step functions for clamping the predicted x_0 between -bit_scale and +bit_scale def ddim_bit_scheduler_step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = True, generator=None, @@ -56,9 +56,9 @@ def ddim_bit_scheduler_step( Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): TODO @@ -134,9 +134,9 @@ def ddim_bit_scheduler_step( def ddpm_bit_scheduler_step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, prediction_type="epsilon", generator=None, return_dict: bool = True, @@ -145,9 +145,9 @@ def ddpm_bit_scheduler_step( Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. prediction_type (`str`, default `epsilon`): indicates whether the model predicts the noise (epsilon), or the samples (`sample`). 
diff --git a/examples/community/clip_guided_images_mixing_stable_diffusion.py b/examples/community/clip_guided_images_mixing_stable_diffusion.py index 16dcecd7b22a..75b7df84dc77 100644 --- a/examples/community/clip_guided_images_mixing_stable_diffusion.py +++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py @@ -233,8 +233,8 @@ def cond_fn( @torch.no_grad() def __call__( self, - style_image: Union[torch.FloatTensor, PIL.Image.Image], - content_image: Union[torch.FloatTensor, PIL.Image.Image], + style_image: Union[torch.Tensor, PIL.Image.Image], + content_image: Union[torch.Tensor, PIL.Image.Image], style_prompt: Optional[str] = None, content_prompt: Optional[str] = None, height: Optional[int] = 512, diff --git a/examples/community/clip_guided_stable_diffusion.py b/examples/community/clip_guided_stable_diffusion.py index 4205718802de..1350650113da 100644 --- a/examples/community/clip_guided_stable_diffusion.py +++ b/examples/community/clip_guided_stable_diffusion.py @@ -180,7 +180,7 @@ def __call__( num_cutouts: Optional[int] = 4, use_cutouts: Optional[bool] = True, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, ): diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py index c8e0a9094f22..9a77458be5ad 100644 --- a/examples/community/clip_guided_stable_diffusion_img2img.py +++ b/examples/community/clip_guided_stable_diffusion_img2img.py @@ -306,7 +306,7 @@ def __call__( prompt: Union[str, List[str]], height: Optional[int] = 512, width: Optional[int] = 512, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -317,7 +317,7 @@ def __call__( num_cutouts: Optional[int] = 4, use_cutouts: Optional[bool] = True, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, ): diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py index c2a60bed6426..1ef257fe78a8 100644 --- a/examples/community/composable_stable_diffusion.py +++ b/examples/community/composable_stable_diffusion.py @@ -354,10 +354,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, weights: Optional[str] = "", ): @@ -391,7 +391,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -403,7 +403,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/ddim_noise_comparative_analysis.py b/examples/community/ddim_noise_comparative_analysis.py index 482c0a5826d2..829106c47f65 100644 --- a/examples/community/ddim_noise_comparative_analysis.py +++ b/examples/community/ddim_noise_comparative_analysis.py @@ -103,7 +103,7 @@ def prepare_latents(self, image, timestep, batch_size, dtype, device, generator= @torch.no_grad() def __call__( self, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 0.8, batch_size: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -115,7 +115,7 @@ def __call__( ) -> Union[ImagePipelineOutput, Tuple]: r""" Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): diff --git a/examples/community/fresco_v2v.py b/examples/community/fresco_v2v.py new file mode 100644 index 000000000000..bf6a31c32fa8 --- /dev/null +++ b/examples/community/fresco_v2v.py @@ -0,0 +1,2511 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +import torch.utils.model_zoo +from einops import rearrange, repeat +from gmflow.gmflow import GMFlow +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel +from diffusers.models.attention_processor import AttnProcessor2_0 +from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput +from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel +from diffusers.pipelines.controlnet.pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + scale_lora_layers, + unscale_lora_layers, +) +from diffusers.utils.torch_utils import is_compiled_module, randn_tensor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def clear_cache(): + gc.collect() + torch.cuda.empty_cache() + + +def coords_grid(b, h, w, homogeneous=False, device=None): + y, x = torch.meshgrid(torch.arange(h), torch.arange(w)) # [H, W] + + stacks = [x, y] + + if homogeneous: + ones = torch.ones_like(x) # [H, W] + stacks.append(ones) + + grid = torch.stack(stacks, dim=0).float() # [2, H, W] or [3, H, W] + + grid = grid[None].repeat(b, 1, 1, 1) # [B, 2, H, W] or [B, 3, H, W] + + if device is not None: + grid = grid.to(device) + + return grid + + +def bilinear_sample(img, sample_coords, mode="bilinear", padding_mode="zeros", return_mask=False): + # img: [B, C, H, W] + # sample_coords: [B, 2, H, W] in image scale + if sample_coords.size(1) != 2: # [B, H, W, 2] + sample_coords = sample_coords.permute(0, 3, 1, 2) + + b, _, h, w = sample_coords.shape + + # Normalize to [-1, 1] + x_grid = 2 * sample_coords[:, 0] / (w - 1) - 1 + y_grid = 2 * sample_coords[:, 1] / (h - 1) - 1 + + grid = torch.stack([x_grid, y_grid], dim=-1) # [B, H, W, 2] + + img = F.grid_sample(img, grid, mode=mode, padding_mode=padding_mode, align_corners=True) + + if return_mask: + mask = (x_grid >= -1) & (y_grid >= -1) & (x_grid <= 1) & (y_grid <= 1) # [B, H, W] + + return img, mask + + return img + + +class Dilate: + def __init__(self, kernel_size=7, channels=1, device="cpu"): + self.kernel_size = kernel_size + self.channels = channels + gaussian_kernel = torch.ones(1, 1, self.kernel_size, self.kernel_size) + gaussian_kernel = gaussian_kernel.repeat(self.channels, 1, 1, 1) + self.mean = (self.kernel_size - 1) // 2 + gaussian_kernel = gaussian_kernel.to(device) + self.gaussian_filter = gaussian_kernel + + def __call__(self, x): + x = F.pad(x, (self.mean, self.mean, self.mean, self.mean), "replicate") + return torch.clamp(F.conv2d(x, self.gaussian_filter, bias=None), 0, 1) + + +def flow_warp(feature, flow, mask=False, mode="bilinear", padding_mode="zeros"): + b, c, h, w = feature.size() + assert flow.size(1) == 2 + + grid = coords_grid(b, h, 
w).to(flow.device) + flow # [B, 2, H, W] + grid = grid.to(feature.dtype) + return bilinear_sample(feature, grid, mode=mode, padding_mode=padding_mode, return_mask=mask) + + +def forward_backward_consistency_check(fwd_flow, bwd_flow, alpha=0.01, beta=0.5): + # fwd_flow, bwd_flow: [B, 2, H, W] + # alpha and beta values are following UnFlow + # (https://arxiv.org/abs/1711.07837) + assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4 + assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2 + flow_mag = torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1) # [B, H, W] + + warped_bwd_flow = flow_warp(bwd_flow, fwd_flow) # [B, 2, H, W] + warped_fwd_flow = flow_warp(fwd_flow, bwd_flow) # [B, 2, H, W] + + diff_fwd = torch.norm(fwd_flow + warped_bwd_flow, dim=1) # [B, H, W] + diff_bwd = torch.norm(bwd_flow + warped_fwd_flow, dim=1) + + threshold = alpha * flow_mag + beta + + fwd_occ = (diff_fwd > threshold).float() # [B, H, W] + bwd_occ = (diff_bwd > threshold).float() + + return fwd_occ, bwd_occ + + +def numpy2tensor(img): + x0 = torch.from_numpy(img.copy()).float().cuda() / 255.0 * 2.0 - 1.0 + x0 = torch.stack([x0], dim=0) + # einops.rearrange(x0, 'b h w c -> b c h w').clone() + return x0.permute(0, 3, 1, 2) + + +def calc_mean_std(feat, eps=1e-5, chunk=1): + size = feat.size() + assert len(size) == 4 + if chunk == 2: + feat = torch.cat(feat.chunk(2), dim=3) + N, C = size[:2] + feat_var = feat.view(N // chunk, C, -1).var(dim=2) + eps + feat_std = feat_var.sqrt().view(N, C, 1, 1) + feat_mean = feat.view(N // chunk, C, -1).mean(dim=2).view(N // chunk, C, 1, 1) + return feat_mean.repeat(chunk, 1, 1, 1), feat_std.repeat(chunk, 1, 1, 1) + + +def adaptive_instance_normalization(content_feat, style_feat, chunk=1): + assert content_feat.size()[:2] == style_feat.size()[:2] + size = content_feat.size() + style_mean, style_std = calc_mean_std(style_feat, chunk) + content_mean, content_std = calc_mean_std(content_feat) + + normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size) + return normalized_feat * style_std.expand(size) + style_mean.expand(size) + + +def optimize_feature( + sample, flows, occs, correlation_matrix=[], intra_weight=1e2, iters=20, unet_chunk_size=2, optimize_temporal=True +): + """ + FRESO-guided latent feature optimization + * optimize spatial correspondence (match correlation_matrix) + * optimize temporal correspondence (match warped_image) + """ + if (flows is None or occs is None or (not optimize_temporal)) and ( + intra_weight == 0 or len(correlation_matrix) == 0 + ): + return sample + # flows=[fwd_flows, bwd_flows]: (N-1)*2*H1*W1 + # occs=[fwd_occs, bwd_occs]: (N-1)*H1*W1 + # sample: 2N*C*H*W + torch.cuda.empty_cache() + video_length = sample.shape[0] // unet_chunk_size + latent = rearrange(sample.to(torch.float32), "(b f) c h w -> b f c h w", f=video_length) + + cs = torch.nn.Parameter((latent.detach().clone())) + optimizer = torch.optim.Adam([cs], lr=0.2) + + # unify resolution + if flows is not None and occs is not None: + scale = sample.shape[2] * 1.0 / flows[0].shape[2] + kernel = int(1 / scale) + bwd_flow_ = F.interpolate(flows[1] * scale, scale_factor=scale, mode="bilinear").repeat( + unet_chunk_size, 1, 1, 1 + ) + bwd_occ_ = F.max_pool2d(occs[1].unsqueeze(1), kernel_size=kernel).repeat( + unet_chunk_size, 1, 1, 1 + ) # 2(N-1)*1*H1*W1 + fwd_flow_ = F.interpolate(flows[0] * scale, scale_factor=scale, mode="bilinear").repeat( + unet_chunk_size, 1, 1, 1 + ) + fwd_occ_ = F.max_pool2d(occs[0].unsqueeze(1), kernel_size=kernel).repeat( + 
unet_chunk_size, 1, 1, 1 + ) # 2(N-1)*1*H1*W1 + # match frame 0,1,2,3 and frame 1,2,3,0 + reshuffle_list = list(range(1, video_length)) + [0] + + # attention_probs is the GRAM matrix of the normalized feature + attention_probs = None + for tmp in correlation_matrix: + if sample.shape[2] * sample.shape[3] == tmp.shape[1]: + attention_probs = tmp # 2N*HW*HW + break + + n_iter = [0] + while n_iter[0] < iters: + + def closure(): + optimizer.zero_grad() + + loss = 0 + + # temporal consistency loss + if optimize_temporal and flows is not None and occs is not None: + c1 = rearrange(cs[:, :], "b f c h w -> (b f) c h w") + c2 = rearrange(cs[:, reshuffle_list], "b f c h w -> (b f) c h w") + warped_image1 = flow_warp(c1, bwd_flow_) + warped_image2 = flow_warp(c2, fwd_flow_) + loss = ( + abs((c2 - warped_image1) * (1 - bwd_occ_)) + abs((c1 - warped_image2) * (1 - fwd_occ_)) + ).mean() * 2 + + # spatial consistency loss + if attention_probs is not None and intra_weight > 0: + cs_vector = rearrange(cs, "b f c h w -> (b f) (h w) c") + # attention_scores = torch.bmm(cs_vector, cs_vector.transpose(-1, -2)) + # cs_attention_probs = attention_scores.softmax(dim=-1) + cs_vector = cs_vector / ((cs_vector**2).sum(dim=2, keepdims=True) ** 0.5) + cs_attention_probs = torch.bmm(cs_vector, cs_vector.transpose(-1, -2)) + tmp = F.l1_loss(cs_attention_probs, attention_probs) * intra_weight + loss = tmp + loss + + loss.backward() + n_iter[0] += 1 + + return loss + + optimizer.step(closure) + + torch.cuda.empty_cache() + return adaptive_instance_normalization(rearrange(cs.data.to(sample.dtype), "b f c h w -> (b f) c h w"), sample) + + +@torch.no_grad() +def warp_tensor(sample, flows, occs, saliency, unet_chunk_size): + """ + Warp images or features based on optical flow + Fuse the warped imges or features based on occusion masks and saliency map + """ + scale = sample.shape[2] * 1.0 / flows[0].shape[2] + kernel = int(1 / scale) + bwd_flow_ = F.interpolate(flows[1] * scale, scale_factor=scale, mode="bilinear") + bwd_occ_ = F.max_pool2d(occs[1].unsqueeze(1), kernel_size=kernel) # (N-1)*1*H1*W1 + if scale == 1: + bwd_occ_ = Dilate(kernel_size=13, device=sample.device)(bwd_occ_) + fwd_flow_ = F.interpolate(flows[0] * scale, scale_factor=scale, mode="bilinear") + fwd_occ_ = F.max_pool2d(occs[0].unsqueeze(1), kernel_size=kernel) # (N-1)*1*H1*W1 + if scale == 1: + fwd_occ_ = Dilate(kernel_size=13, device=sample.device)(fwd_occ_) + scale2 = sample.shape[2] * 1.0 / saliency.shape[2] + saliency = F.interpolate(saliency, scale_factor=scale2, mode="bilinear") + latent = sample.to(torch.float32) + video_length = sample.shape[0] // unet_chunk_size + warp_saliency = flow_warp(saliency, bwd_flow_) + warp_saliency_ = flow_warp(saliency[0:1], fwd_flow_[video_length - 1 : video_length]) + + for j in range(unet_chunk_size): + for ii in range(video_length - 1): + i = video_length * j + ii + warped_image = flow_warp(latent[i : i + 1], bwd_flow_[ii : ii + 1]) + mask = (1 - bwd_occ_[ii : ii + 1]) * saliency[ii + 1 : ii + 2] * warp_saliency[ii : ii + 1] + latent[i + 1 : i + 2] = latent[i + 1 : i + 2] * (1 - mask) + warped_image * mask + i = video_length * j + ii = video_length - 1 + warped_image = flow_warp(latent[i : i + 1], fwd_flow_[ii : ii + 1]) + mask = (1 - fwd_occ_[ii : ii + 1]) * saliency[ii : ii + 1] * warp_saliency_ + latent[ii + i : ii + i + 1] = latent[ii + i : ii + i + 1] * (1 - mask) + warped_image * mask + + return latent.to(sample.dtype) + + +def my_forward( + self, + steps=[], + layers=[0, 1, 2, 3], + flows=None, + occs=None, + 
correlation_matrix=[], + intra_weight=1e2, + iters=20, + optimize_temporal=True, + saliency=None, +): + """ + Hacked pipe.unet.forward() + copied from https://github.com/huggingface/diffusers/blob/v0.19.3/src/diffusers/models/unet_2d_condition.py#L700 + if you are using a new version of diffusers, please copy the source code and modify it accordingly (find [HACK] in the code) + * restore and return the decoder features + * optimize the decoder features + * perform background smoothing + """ + + def forward( + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: + r""" + The [`UNet2DConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + encoder_attention_mask (`torch.Tensor`): + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + + Returns: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. 
+ default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info("Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. 
+ class_labels = class_labels.to(dtype=sample.dtype) + + class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype) + + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) + else: + emb = emb + class_emb + + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + elif self.config.addition_embed_type == "text_image": + # Kandinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) + aug_emb = self.add_embedding(text_embs, image_embs) + elif self.config.addition_embed_type == "text_time": + # SDXL - style + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + elif self.config.addition_embed_type == "image": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + elif self.config.addition_embed_type == "image_hint": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb, hint = self.add_embedding(image_embs, hint) + sample = torch.cat([sample, hint], dim=1) + + emb = emb + aug_emb if aug_emb is not None else emb + + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) + 
elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) + # 2. pre-process + sample = self.conv_in(sample) + + # 3. down + + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None + + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + # For t2i-adapter CrossAttnDownBlock2D + additional_residuals = {} + if is_adapter and len(down_block_additional_residuals) > 0: + additional_residuals["additional_residuals"] = down_block_additional_residuals.pop(0) + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + + if is_adapter and len(down_block_additional_residuals) > 0: + sample += down_block_additional_residuals.pop(0) + down_block_res_samples += res_samples + + if is_controlnet: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + + if is_controlnet: + sample = sample + mid_block_additional_residual + + # 5. 
up + """ + [HACK] restore the decoder features in up_samples + """ + up_samples = () + # down_samples = () + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + """ + [HACK] restore the decoder features in up_samples + [HACK] optimize the decoder features + [HACK] perform background smoothing + """ + if i in layers: + up_samples += (sample,) + if timestep in steps and i in layers: + sample = optimize_feature( + sample, flows, occs, correlation_matrix, intra_weight, iters, optimize_temporal=optimize_temporal + ) + if saliency is not None: + sample = warp_tensor(sample, flows, occs, saliency, 2) + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = upsample_block( + hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size + ) + + # 6. post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + """ + [HACK] return the output feature as well as the decoder features + """ + if not return_dict: + return (sample,) + up_samples + + return UNet2DConditionOutput(sample=sample) + + return forward + + +@torch.no_grad() +def get_single_mapping_ind(bwd_flow, bwd_occ, imgs, scale=1.0): + """ + FLATTEN: Optical fLow-guided attention (Temoporal-guided attention) + Find the correspondence between every pixels in a pair of frames + + [input] + bwd_flow: 1*2*H*W + bwd_occ: 1*H*W i.e., f2 = warp(f1, bwd_flow) * bwd_occ + imgs: 2*3*H*W i.e., [f1,f2] + + [output] + mapping_ind: pixel index correspondence + unlinkedmask: indicate whether a pixel has no correspondence + i.e., f2 = f1[mapping_ind] * unlinkedmask + """ + flows = F.interpolate(bwd_flow, scale_factor=1.0 / scale, mode="bilinear")[0][[1, 0]] / scale # 2*H*W + _, H, W = flows.shape + masks = torch.logical_not(F.interpolate(bwd_occ[None], scale_factor=1.0 / scale, mode="bilinear") > 0.5)[ + 0 + ] # 1*H*W + frames = F.interpolate(imgs, scale_factor=1.0 / scale, mode="bilinear").view(2, 3, -1) # 2*3*HW + grid = torch.stack(torch.meshgrid([torch.arange(H), torch.arange(W)]), dim=0).to(flows.device) # 2*H*W + warp_grid = torch.round(grid + flows) + mask = torch.logical_and( + torch.logical_and( + torch.logical_and(torch.logical_and(warp_grid[0] >= 0, warp_grid[0] < H), warp_grid[1] >= 0), + warp_grid[1] < W, + ), + masks[0], + ).view(-1) # HW + warp_grid = warp_grid.view(2, -1) # 2*HW + warp_ind = (warp_grid[0] * W + warp_grid[1]).to(torch.long) # HW + mapping_ind = torch.zeros_like(warp_ind) - 1 # HW + + for f0ind, f1ind in enumerate(warp_ind): + if mask[f0ind]: + if mapping_ind[f1ind] == -1: + mapping_ind[f1ind] = f0ind + else: + targetv = frames[0, :, f1ind] + pref0ind = mapping_ind[f1ind] + prev = frames[1, :, pref0ind] + v = frames[1, :, f0ind] + if ((prev 
- targetv) ** 2).mean() > ((v - targetv) ** 2).mean(): + mask[pref0ind] = False + mapping_ind[f1ind] = f0ind + else: + mask[f0ind] = False + + unusedind = torch.arange(len(mask)).to(mask.device)[~mask] + unlinkedmask = mapping_ind == -1 + mapping_ind[unlinkedmask] = unusedind + return mapping_ind, unlinkedmask + + +@torch.no_grad() +def get_mapping_ind(bwd_flows, bwd_occs, imgs, scale=1.0): + """ + FLATTEN: Optical fLow-guided attention (Temoporal-guided attention) + Find pixel correspondence between every consecutive frames in a batch + + [input] + bwd_flow: (N-1)*2*H*W + bwd_occ: (N-1)*H*W + imgs: N*3*H*W + + [output] + fwd_mappings: N*1*HW + bwd_mappings: N*1*HW + flattn_mask: HW*1*N*N + i.e., imgs[i,:,fwd_mappings[i]] corresponds to imgs[0] + i.e., imgs[i,:,fwd_mappings[i]][:,bwd_mappings[i]] restore the original imgs[i] + """ + N, H, W = imgs.shape[0], int(imgs.shape[2] // scale), int(imgs.shape[3] // scale) + iterattn_mask = torch.ones(H * W, N, N, dtype=torch.bool).to(imgs.device) + for i in range(len(imgs) - 1): + one_mask = torch.ones(N, N, dtype=torch.bool).to(imgs.device) + one_mask[: i + 1, i + 1 :] = False + one_mask[i + 1 :, : i + 1] = False + mapping_ind, unlinkedmask = get_single_mapping_ind( + bwd_flows[i : i + 1], bwd_occs[i : i + 1], imgs[i : i + 2], scale + ) + if i == 0: + fwd_mapping = [torch.arange(len(mapping_ind)).to(mapping_ind.device)] + bwd_mapping = [torch.arange(len(mapping_ind)).to(mapping_ind.device)] + iterattn_mask[unlinkedmask[fwd_mapping[-1]]] = torch.logical_and( + iterattn_mask[unlinkedmask[fwd_mapping[-1]]], one_mask + ) + fwd_mapping += [mapping_ind[fwd_mapping[-1]]] + bwd_mapping += [torch.sort(fwd_mapping[-1])[1]] + fwd_mappings = torch.stack(fwd_mapping, dim=0).unsqueeze(1) + bwd_mappings = torch.stack(bwd_mapping, dim=0).unsqueeze(1) + return fwd_mappings, bwd_mappings, iterattn_mask.unsqueeze(1) + + +def apply_FRESCO_opt( + pipe, + steps=[], + layers=[0, 1, 2, 3], + flows=None, + occs=None, + correlation_matrix=[], + intra_weight=1e2, + iters=20, + optimize_temporal=True, + saliency=None, +): + """ + Apply FRESCO-based optimization to a StableDiffusionPipeline + """ + pipe.unet.forward = my_forward( + pipe.unet, steps, layers, flows, occs, correlation_matrix, intra_weight, iters, optimize_temporal, saliency + ) + + +@torch.no_grad() +def get_intraframe_paras(pipe, imgs, frescoProc, prompt_embeds, do_classifier_free_guidance=True, generator=None): + """ + Get parameters for spatial-guided attention and optimization + * perform one step denoising + * collect attention feature, stored in frescoProc.controller.stored_attn['decoder_attn'] + * compute the gram matrix of the normalized feature for spatial consistency loss + """ + + noise_scheduler = pipe.scheduler + timestep = noise_scheduler.timesteps[-1] + device = pipe._execution_device + B, C, H, W = imgs.shape + + frescoProc.controller.disable_controller() + apply_FRESCO_opt(pipe) + frescoProc.controller.clear_store() + frescoProc.controller.enable_store() + + latents = pipe.prepare_latents( + imgs.to(pipe.unet.dtype), timestep, B, 1, prompt_embeds.dtype, device, generator=generator, repeat_noise=False + ) + + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + model_output = pipe.unet( + latent_model_input, + timestep, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=None, + return_dict=False, + ) + + frescoProc.controller.disable_store() + + # gram matrix of the normalized feature for spatial consistency loss + correlation_matrix = [] + for 
tmp in model_output[1:]: + latent_vector = rearrange(tmp, "b c h w -> b (h w) c") + latent_vector = latent_vector / ((latent_vector**2).sum(dim=2, keepdims=True) ** 0.5) + attention_probs = torch.bmm(latent_vector, latent_vector.transpose(-1, -2)) + correlation_matrix += [attention_probs.detach().clone().to(torch.float32)] + del attention_probs, latent_vector, tmp + del model_output + + clear_cache() + + return correlation_matrix + + +@torch.no_grad() +def get_flow_and_interframe_paras(flow_model, imgs): + """ + Get parameters for temporal-guided attention and optimization + * predict optical flow and occlusion mask + * compute pixel index correspondence for FLATTEN + """ + images = torch.stack([torch.from_numpy(img).permute(2, 0, 1).float() for img in imgs], dim=0).cuda() + imgs_torch = torch.cat([numpy2tensor(img) for img in imgs], dim=0) + + reshuffle_list = list(range(1, len(images))) + [0] + + results_dict = flow_model( + images, + images[reshuffle_list], + attn_splits_list=[2], + corr_radius_list=[-1], + prop_radius_list=[-1], + pred_bidir_flow=True, + ) + flow_pr = results_dict["flow_preds"][-1] # [2*B, 2, H, W] + fwd_flows, bwd_flows = flow_pr.chunk(2) # [B, 2, H, W] + fwd_occs, bwd_occs = forward_backward_consistency_check(fwd_flows, bwd_flows) # [B, H, W] + + warped_image1 = flow_warp(images, bwd_flows) + bwd_occs = torch.clamp( + bwd_occs + (abs(images[reshuffle_list] - warped_image1).mean(dim=1) > 255 * 0.25).float(), 0, 1 + ) + + warped_image2 = flow_warp(images[reshuffle_list], fwd_flows) + fwd_occs = torch.clamp(fwd_occs + (abs(images - warped_image2).mean(dim=1) > 255 * 0.25).float(), 0, 1) + + attn_mask = [] + for scale in [8.0, 16.0, 32.0]: + bwd_occs_ = F.interpolate(bwd_occs[:-1].unsqueeze(1), scale_factor=1.0 / scale, mode="bilinear") + attn_mask += [ + torch.cat((bwd_occs_[0:1].reshape(1, -1) > -1, bwd_occs_.reshape(bwd_occs_.shape[0], -1) > 0.5), dim=0) + ] + + fwd_mappings = [] + bwd_mappings = [] + interattn_masks = [] + for scale in [8.0, 16.0]: + fwd_mapping, bwd_mapping, interattn_mask = get_mapping_ind(bwd_flows, bwd_occs, imgs_torch, scale=scale) + fwd_mappings += [fwd_mapping] + bwd_mappings += [bwd_mapping] + interattn_masks += [interattn_mask] + + interattn_paras = {} + interattn_paras["fwd_mappings"] = fwd_mappings + interattn_paras["bwd_mappings"] = bwd_mappings + interattn_paras["interattn_masks"] = interattn_masks + + clear_cache() + + return [fwd_flows, bwd_flows], [fwd_occs, bwd_occs], attn_mask, interattn_paras + + +class AttentionControl: + """ + Control FRESCO-based attention + * enable/diable spatial-guided attention + * enable/diable temporal-guided attention + * enable/diable cross-frame attention + * collect intermediate attention feature (for spatial-guided attention) + """ + + def __init__(self): + self.stored_attn = self.get_empty_store() + self.store = False + self.index = 0 + self.attn_mask = None + self.interattn_paras = None + self.use_interattn = False + self.use_cfattn = False + self.use_intraattn = False + self.intraattn_bias = 0 + self.intraattn_scale_factor = 0.2 + self.interattn_scale_factor = 0.2 + + @staticmethod + def get_empty_store(): + return { + "decoder_attn": [], + } + + def clear_store(self): + del self.stored_attn + torch.cuda.empty_cache() + gc.collect() + self.stored_attn = self.get_empty_store() + self.disable_intraattn() + + # store attention feature of the input frame for spatial-guided attention + def enable_store(self): + self.store = True + + def disable_store(self): + self.store = False + + # spatial-guided 
attention + def enable_intraattn(self): + self.index = 0 + self.use_intraattn = True + self.disable_store() + if len(self.stored_attn["decoder_attn"]) == 0: + self.use_intraattn = False + + def disable_intraattn(self): + self.index = 0 + self.use_intraattn = False + self.disable_store() + + def disable_cfattn(self): + self.use_cfattn = False + + # cross frame attention + def enable_cfattn(self, attn_mask=None): + if attn_mask: + if self.attn_mask: + del self.attn_mask + torch.cuda.empty_cache() + self.attn_mask = attn_mask + self.use_cfattn = True + else: + if self.attn_mask: + self.use_cfattn = True + else: + print("Warning: no valid cross-frame attention parameters available!") + self.disable_cfattn() + + def disable_interattn(self): + self.use_interattn = False + + # temporal-guided attention + def enable_interattn(self, interattn_paras=None): + if interattn_paras: + if self.interattn_paras: + del self.interattn_paras + torch.cuda.empty_cache() + self.interattn_paras = interattn_paras + self.use_interattn = True + else: + if self.interattn_paras: + self.use_interattn = True + else: + print("Warning: no valid temporal-guided attention parameters available!") + self.disable_interattn() + + def disable_controller(self): + self.disable_intraattn() + self.disable_interattn() + self.disable_cfattn() + + def enable_controller(self, interattn_paras=None, attn_mask=None): + self.enable_intraattn() + self.enable_interattn(interattn_paras) + self.enable_cfattn(attn_mask) + + def forward(self, context): + if self.store: + self.stored_attn["decoder_attn"].append(context.detach()) + if self.use_intraattn and len(self.stored_attn["decoder_attn"]) > 0: + tmp = self.stored_attn["decoder_attn"][self.index] + self.index = self.index + 1 + if self.index >= len(self.stored_attn["decoder_attn"]): + self.index = 0 + self.disable_store() + return tmp + return context + + def __call__(self, context): + context = self.forward(context) + return context + + +class FRESCOAttnProcessor2_0: + """ + Hack self attention to FRESCO-based attention + * adding spatial-guided attention + * adding temporal-guided attention + * adding cross-frame attention + + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). 
+ Usage + frescoProc = FRESCOAttnProcessor2_0(2, attn_mask) + attnProc = AttnProcessor2_0() + + attn_processor_dict = {} + for k in pipe.unet.attn_processors.keys(): + if k.startswith("up_blocks.2") or k.startswith("up_blocks.3"): + attn_processor_dict[k] = frescoProc + else: + attn_processor_dict[k] = attnProc + pipe.unet.set_attn_processor(attn_processor_dict) + """ + + def __init__(self, unet_chunk_size=2, controller=None): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + self.unet_chunk_size = unet_chunk_size + self.controller = controller + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + ): + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + crossattn = False + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + if self.controller and self.controller.store: + self.controller(hidden_states.detach().clone()) + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + # BC * HW * 8D + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query_raw, key_raw = None, None + if self.controller and self.controller.use_interattn and (not crossattn): + query_raw, key_raw = query.clone(), key.clone() + + inner_dim = key.shape[-1] # 8D + head_dim = inner_dim // attn.heads # D + + """for efficient cross-frame attention""" + if self.controller and self.controller.use_cfattn and (not crossattn): + video_length = key.size()[0] // self.unet_chunk_size + former_frame_index = [0] * video_length + attn_mask = None + if self.controller.attn_mask is not None: + for m in self.controller.attn_mask: + if m.shape[1] == key.shape[1]: + attn_mask = m + # BC * HW * 8D --> B * C * HW * 8D + key = rearrange(key, "(b f) d c -> b f d c", f=video_length) + # B * C * HW * 8D --> B * C * HW * 8D + if attn_mask is None: + key = key[:, former_frame_index] + else: + key = repeat(key[:, attn_mask], "b d c -> b f d c", f=video_length) + # B * C * HW * 8D --> BC * HW * 8D + key = rearrange(key, "b f d c -> (b f) d c").detach() + value = rearrange(value, "(b f) d c -> b f d c", f=video_length) + if attn_mask is None: + value = value[:, former_frame_index] + else: + value = repeat(value[:, attn_mask], "b d c -> b f d c", f=video_length) + value = rearrange(value, "b f d c -> (b f) d c").detach() + + # BC * HW * 8D --> BC * HW * 8 * D --> BC * 8 * HW * D + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + # BC * 8 
* HW2 * D + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + # BC * 8 * HW2 * D2 + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + """for spatial-guided intra-frame attention""" + if self.controller and self.controller.use_intraattn and (not crossattn): + ref_hidden_states = self.controller(None) + assert ref_hidden_states.shape == encoder_hidden_states.shape + query_ = attn.to_q(ref_hidden_states) + key_ = attn.to_k(ref_hidden_states) + + # BC * 8 * HW * D + query_ = query_.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key_ = key_.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + query = F.scaled_dot_product_attention( + query_, + key_ * self.controller.intraattn_scale_factor, + query, + attn_mask=torch.eye(query_.size(-2), key_.size(-2), dtype=query.dtype, device=query.device) + * self.controller.intraattn_bias, + ).detach() + + del query_, key_ + torch.cuda.empty_cache() + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + # output: BC * 8 * HW * D2 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + """for temporal-guided inter-frame attention (FLATTEN)""" + if self.controller and self.controller.use_interattn and (not crossattn): + del query, key, value + torch.cuda.empty_cache() + bwd_mapping = None + fwd_mapping = None + for i, f in enumerate(self.controller.interattn_paras["fwd_mappings"]): + if f.shape[2] == hidden_states.shape[2]: + fwd_mapping = f + bwd_mapping = self.controller.interattn_paras["bwd_mappings"][i] + interattn_mask = self.controller.interattn_paras["interattn_masks"][i] + video_length = key_raw.size()[0] // self.unet_chunk_size + # BC * HW * 8D --> C * 8BD * HW + key = rearrange(key_raw, "(b f) d c -> f (b c) d", f=video_length) + query = rearrange(query_raw, "(b f) d c -> f (b c) d", f=video_length) + # BC * 8 * HW * D --> C * 8BD * HW + # key = rearrange(hidden_states, "(b f) h d c -> f (b h c) d", f=video_length) ######## + # query = rearrange(hidden_states, "(b f) h d c -> f (b h c) d", f=video_length) ####### + + value = rearrange(hidden_states, "(b f) h d c -> f (b h c) d", f=video_length) + key = torch.gather(key, 2, fwd_mapping.expand(-1, key.shape[1], -1)) + query = torch.gather(query, 2, fwd_mapping.expand(-1, query.shape[1], -1)) + value = torch.gather(value, 2, fwd_mapping.expand(-1, value.shape[1], -1)) + # C * 8BD * HW --> BHW, C, 8D + key = rearrange(key, "f (b c) d -> (b d) f c", b=self.unet_chunk_size) + query = rearrange(query, "f (b c) d -> (b d) f c", b=self.unet_chunk_size) + value = rearrange(value, "f (b c) d -> (b d) f c", b=self.unet_chunk_size) + # BHW * C * 8D --> BHW * C * 8 * D--> BHW * 8 * C * D + query = query.view(-1, video_length, attn.heads, head_dim).transpose(1, 2).detach() + key = key.view(-1, video_length, attn.heads, head_dim).transpose(1, 2).detach() + value = value.view(-1, video_length, attn.heads, head_dim).transpose(1, 2).detach() + hidden_states_ = F.scaled_dot_product_attention( + query, + key * self.controller.interattn_scale_factor, + value, + # .to(query.dtype)-1.0) * 1e6 - + attn_mask=(interattn_mask.repeat(self.unet_chunk_size, 1, 1, 1)), + # torch.eye(interattn_mask.shape[2]).to(query.device).to(query.dtype) * 1e4, + ) + + # BHW * 8 * C * D --> C * 8BD * HW + hidden_states_ = rearrange(hidden_states_, "(b d) h f c -> f (b h c) d", b=self.unet_chunk_size) + hidden_states_ = 
torch.gather( + hidden_states_, 2, bwd_mapping.expand(-1, hidden_states_.shape[1], -1) + ).detach() + # C * 8BD * HW --> BC * 8 * HW * D + hidden_states = rearrange( + hidden_states_, "f (b h c) d -> (b f) h d c", b=self.unet_chunk_size, h=attn.heads + ) + + # BC * 8 * HW * D --> BC * HW * 8D + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +def apply_FRESCO_attn(pipe): + """ + Apply FRESCO-guided attention to a StableDiffusionPipeline + """ + frescoProc = FRESCOAttnProcessor2_0(2, AttentionControl()) + attnProc = AttnProcessor2_0() + attn_processor_dict = {} + for k in pipe.unet.attn_processors.keys(): + if k.startswith("up_blocks.2") or k.startswith("up_blocks.3"): + attn_processor_dict[k] = frescoProc + else: + attn_processor_dict[k] = attnProc + pipe.unet.set_attn_processor(attn_processor_dict) + return frescoProc + + +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +def prepare_image(image): + if isinstance(image, torch.Tensor): + # Batch single image + if image.ndim == 3: + image = image.unsqueeze(0) + + image = image.to(dtype=torch.float32) + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + return image + + +class FrescoV2VPipeline(StableDiffusionControlNetImg2ImgPipeline): + r""" + Pipeline for video-to-video translation using Stable Diffusion with FRESCO Algorithm. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
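# A minimal, hypothetical usage sketch of apply_FRESCO_attn() defined above (editorial
# example, not part of the patch): it installs FRESCOAttnProcessor2_0 on up_blocks.2/3
# of an existing Stable Diffusion pipeline and returns the AttentionControl instance
# used to toggle the FRESCO attention variants. The model id and the precomputed
# attn_mask / interattn_paras inputs are assumptions for illustration.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

frescoProc = apply_FRESCO_attn(pipe)  # swaps in FRESCO-guided attention processors

# All FRESCO guidance off again (plain self-attention):
frescoProc.controller.disable_controller()
# Or enable individual submodules, given precomputed inputs:
# frescoProc.controller.enable_cfattn(attn_mask)            # cross-frame attention
# frescoProc.controller.enable_intraattn()                  # spatial-guided attention (needs stored features)
# frescoProc.controller.enable_interattn(interattn_paras)   # temporal-guided attention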
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): + Provides additional conditioning to the `unet` during the denoising process. If you set multiple + ControlNets as a list, the outputs from each ControlNet are added together to create one combined + additional conditioning. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__( + vae, + text_encoder, + tokenizer, + unet, + controlnet, + scheduler, + safety_checker, + feature_extractor, + image_encoder, + requires_safety_checker, + ) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 
+ ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if isinstance(controlnet, (list, tuple)): + controlnet = MultiControlNetModel(controlnet) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + frescoProc = FRESCOAttnProcessor2_0(2, AttentionControl()) + attnProc = AttnProcessor2_0() + attn_processor_dict = {} + for k in self.unet.attn_processors.keys(): + if k.startswith("up_blocks.2") or k.startswith("up_blocks.3"): + attn_processor_dict[k] = frescoProc + else: + attn_processor_dict[k] = attnProc + self.unet.set_attn_processor(attn_processor_dict) + self.frescoProc = frescoProc + + flow_model = GMFlow( + feature_channels=128, + num_scales=1, + upsample_factor=8, + num_head=1, + attention_type="swin", + ffn_dim_expansion=4, + num_transformer_layers=6, + ).to(self.device) + + checkpoint = torch.utils.model_zoo.load_url( + "https://huggingface.co/Anonymous-sub/Rerender/resolve/main/models/gmflow_sintel-0c07dcb3.pth", + map_location=lambda storage, loc: storage, + ) + weights = checkpoint["model"] if "model" in checkpoint else checkpoint + flow_model.load_state_dict(weights, strict=False) + flow_model.eval() + self.flow_model = flow_model + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # `prompt` needs more sophisticated handling when there are multiple + # conditionings. + if isinstance(self.controlnet, MultiControlNetModel): + if isinstance(prompt, list): + logger.warning( + f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" + " prompts. The conditionings will be fixed across the prompts." + ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + self.check_image(image, prompt, prompt_embeds) + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if not isinstance(image, list): + raise TypeError("For multiple controlnets: `image` must be type `list`") + + # When `image` is a nested list: + # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) + elif any(isinstance(i, list) for i in image): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif len(image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." + ) + + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if isinstance(controlnet_conditioning_scale, list): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): + raise ValueError( + "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" + " the same length as the number of controlnets" + ) + else: + assert False + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. 
Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image + def prepare_control_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents + def prepare_latents( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, repeat_noise, generator=None + ): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." 
+ ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + if repeat_noise: + noise = randn_tensor((1, *shape[1:]), generator=generator, device=device, dtype=dtype) + one_tuple = (1,) * (len(shape) - 1) + noise = noise.repeat(batch_size, *one_tuple) + else: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + frames: Union[List[np.ndarray], torch.FloatTensor] = None, + control_frames: Union[List[np.ndarray], torch.FloatTensor] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 0.8, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + end_opt_step=15, + num_intraattn_steps=1, + step_interattn_end=350, + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + frames (`List[np.ndarray]` or `torch.FloatTensor`): The input images to be used as the starting point for the image generation process. + control_frames (`List[np.ndarray]` or `torch.FloatTensor`): The ControlNet input images condition to provide guidance to the `unet` for generation. 
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + end_opt_step: + The feature optimization is activated from strength * num_inference_step to end_opt_step. + num_intraattn_steps: + Apply num_interattn_steps steps of spatial-guided attention. + step_interattn_end: + Apply temporal-guided attention in [step_interattn_end, 1000] steps + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. 
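# A hypothetical end-to-end example for the empty "Examples:" slot in the docstring
# above (editorial sketch, not from the patch): the model ids, the pre-computed canny
# control frames, and the frame file names are assumptions for illustration only.
import numpy as np
import torch
from diffusers import ControlNetModel
from diffusers.utils import load_image

controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
)
pipe = FrescoV2VPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

# frames: list of H*W*3 uint8 arrays; control_frames: matching canny edge maps
frames = [np.array(load_image(f"frame_{i:03d}.png")) for i in range(8)]        # hypothetical paths
control_frames = [np.array(load_image(f"canny_{i:03d}.png")) for i in range(8)]

output = pipe(
    prompt="a watercolor painting of a dancing fox",
    frames=frames,
    control_frames=control_frames,
    num_inference_steps=50,
    strength=0.75,
    controlnet_conditioning_scale=0.7,
    generator=torch.Generator("cuda").manual_seed(0),
)
video = output.images  # one output image per input frame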
+ """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + control_frames[0], + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + batch_size = len(frames) + + device = self._execution_device + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + prompt_embeds = prompt_embeds.repeat(batch_size, 1, 1) + negative_prompt_embeds = negative_prompt_embeds.repeat(batch_size, 1, 1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. 
Prepare image + imgs_np = [] + for frame in frames: + if isinstance(frame, PIL.Image.Image): + imgs_np.append(np.asarray(frame)) + else: + # np.ndarray + imgs_np.append(frame) + images_pt = self.image_processor.preprocess(frames).to(dtype=torch.float32) + + # 5. Prepare controlnet_conditioning_image + if isinstance(controlnet, ControlNetModel): + control_image = self.prepare_control_image( + image=control_frames, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + elif isinstance(controlnet, MultiControlNetModel): + control_images = [] + + for control_image_ in control_frames: + control_image_ = self.prepare_control_image( + image=control_image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + control_images.append(control_image_) + + control_image = control_images + else: + assert False + + self.flow_model.to(device) + + flows, occs, attn_mask, interattn_paras = get_flow_and_interframe_paras(self.flow_model, imgs_np) + correlation_matrix = get_intraframe_paras(self, images_pt, self.frescoProc, prompt_embeds, generator) + + """ + Flexible settings for attention: + * Turn off FRESCO-guided attention: frescoProc.controller.disable_controller() + Then you can turn on one specific attention submodule + * Turn on Cross-frame attention: frescoProc.controller.enable_cfattn(attn_mask) + * Turn on Spatial-guided attention: frescoProc.controller.enable_intraattn() + * Turn on Temporal-guided attention: frescoProc.controller.enable_interattn(interattn_paras) + + Flexible settings for optimization: + * Turn off Spatial-guided optimization: set optimize_temporal = False in apply_FRESCO_opt() + * Turn off Temporal-guided optimization: set correlation_matrix = [] in apply_FRESCO_opt() + * Turn off FRESCO-guided optimization: disable_FRESCO_opt(pipe) + + Flexible settings for background smoothing: + * Turn off background smoothing: set saliency = None in apply_FRESCO_opt() + """ + + self.frescoProc.controller.enable_controller(interattn_paras=interattn_paras, attn_mask=attn_mask) + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + apply_FRESCO_opt( + self, + steps=timesteps[:end_opt_step], + flows=flows, + occs=occs, + correlation_matrix=correlation_matrix, + saliency=None, + optimize_temporal=True, + ) + + clear_cache() + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + self._num_timesteps = len(timesteps) + + # 6. Prepare latent variables + latents = self.prepare_latents( + images_pt, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator=generator, + repeat_noise=True, + ) + + # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 7.2 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if i >= num_intraattn_steps: + self.frescoProc.controller.disable_intraattn() + if t < step_interattn_end: + self.frescoProc.controller.disable_interattn() + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # controlnet(s) inference + if guess_mode and self.do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=control_image, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + return_dict=False, + ) + + if guess_mode and self.do_classifier_free_guidance: + # Infered ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. 
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # If we do sequential model offloading, let's offload unet and controlnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.controlnet.to("cpu") + torch.cuda.empty_cache() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/examples/community/gluegen.py b/examples/community/gluegen.py index c656dce55a0d..1ad6911905db 100644 --- a/examples/community/gluegen.py +++ b/examples/community/gluegen.py @@ -205,7 +205,7 @@ def __init__( safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, language_adapter: TranslatorNoLN = None, - tensor_norm: torch.FloatTensor = None, + tensor_norm: torch.Tensor = None, requires_safety_checker: bool = True, ): super().__init__() @@ -231,7 +231,7 @@ def load_language_adapter( num_token: int, dim: int, dim_out: int, - tensor_norm: torch.FloatTensor, + tensor_norm: torch.Tensor, mult: int = 2, depth: int = 5, ): @@ -242,7 +242,7 @@ def load_language_adapter( ) self.language_adapter.load_state_dict(torch.load(model_path)) - def _adapt_language(self, prompt_embeds: torch.FloatTensor): + def 
_adapt_language(self, prompt_embeds: torch.Tensor): prompt_embeds = prompt_embeds / 3 prompt_embeds = self.language_adapter(prompt_embeds) * (self.tensor_norm / 2) return prompt_embeds @@ -254,8 +254,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -275,10 +275,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -535,7 +535,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -594,9 +594,9 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -635,14 +635,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. diff --git a/examples/community/hd_painter.py b/examples/community/hd_painter.py index c157e1688941..df41be9ef7b1 100644 --- a/examples/community/hd_painter.py +++ b/examples/community/hd_painter.py @@ -28,10 +28,10 @@ def __init__(self, mask, token_idx, scale_factor): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, ) -> torch.Tensor: # Same as the default AttnProcessor up untill the part where similarity matrix gets saved @@ -111,10 +111,10 @@ def __init__(self, transformer_block, mask, token_idx, do_classifier_free_guidan def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, ) -> torch.Tensor: # Automatically recognize the resolution of the current attention layer and resize the masks accordingly @@ -454,7 +454,7 @@ def __call__( prompt: Union[str, List[str]] = None, image: PipelineImageInput = None, mask_image: PipelineImageInput = None, - masked_image_latents: torch.FloatTensor = None, + masked_image_latents: torch.Tensor = None, height: Optional[int] = None, width: Optional[int] = None, padding_mask_crop: Optional[int] = None, @@ -467,9 +467,9 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.01, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, diff --git a/examples/community/iadb.py b/examples/community/iadb.py index 6089e49fc621..81e9e8d89dc6 100644 --- a/examples/community/iadb.py +++ b/examples/community/iadb.py @@ -17,21 +17,21 @@ class IADBScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - x_alpha: torch.FloatTensor, - ) -> torch.FloatTensor: + x_alpha: torch.Tensor, + ) -> torch.Tensor: """ Predict the sample at the previous timestep by reversing the ODE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. It is the direction from x0 to x1. + model_output (`torch.Tensor`): direct output from learned diffusion model. It is the direction from x0 to x1. timestep (`float`): current timestep in the diffusion chain. 
- x_alpha (`torch.FloatTensor`): x_alpha sample for the current timestep + x_alpha (`torch.Tensor`): x_alpha sample for the current timestep Returns: - `torch.FloatTensor`: the sample at the previous timestep + `torch.Tensor`: the sample at the previous timestep """ if self.num_inference_steps is None: @@ -53,10 +53,10 @@ def set_timesteps(self, num_inference_steps: int): def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - alpha: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + alpha: torch.Tensor, + ) -> torch.Tensor: return original_samples * alpha + noise * (1 - alpha) def __len__(self): diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 69b5440c923a..7656e5a99b07 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -110,7 +110,7 @@ def __init__( def train( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], height: Optional[int] = 512, width: Optional[int] = 512, generator: Optional[torch.Generator] = None, @@ -146,7 +146,7 @@ def train( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py index 71dc3cf712ed..4dfb7a39155f 100644 --- a/examples/community/img2img_inpainting.py +++ b/examples/community/img2img_inpainting.py @@ -133,9 +133,9 @@ def __init__( def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], - inner_image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + inner_image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], height: int = 512, width: int = 512, num_inference_steps: int = 50, @@ -144,10 +144,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -194,7 +194,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
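The `IADBScheduler.add_noise` hunk above keeps the scheduler's alpha-blended forward process and only widens the annotations to `torch.Tensor`. A self-contained sketch of that blend, for reference (tensor shapes are illustrative):

import torch

def iadb_add_noise(original_samples: torch.Tensor, noise: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
    # x_alpha = alpha * x0 + (1 - alpha) * noise, as in IADBScheduler.add_noise
    return original_samples * alpha + noise * (1 - alpha)

x0, noise = torch.randn(1, 3, 8, 8), torch.randn(1, 3, 8, 8)
assert torch.allclose(iadb_add_noise(x0, noise, torch.tensor(1.0)), x0)     # alpha=1 -> clean sample
assert torch.allclose(iadb_add_noise(x0, noise, torch.tensor(0.0)), noise)  # alpha=0 -> pure noise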
@@ -206,7 +206,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/instaflow_one_step.py b/examples/community/instaflow_one_step.py index b0476d3afe38..ab0393c8f76c 100644 --- a/examples/community/instaflow_one_step.py +++ b/examples/community/instaflow_one_step.py @@ -189,8 +189,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." @@ -219,8 +219,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -239,10 +239,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
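The `instaflow_one_step.py` deprecation message above notes that `_encode_prompt()` returned a single concatenated tensor while `encode_prompt()` returns a tuple. A hedged, self-contained sketch of the layout difference; the embedding shapes are illustrative, and the `[uncond; cond]` concatenation order matches the classifier-free-guidance code earlier in this diff:

import torch

# Stand-ins for the (prompt_embeds, negative_prompt_embeds) tuple returned by encode_prompt().
prompt_embeds = torch.randn(1, 77, 768)
negative_prompt_embeds = torch.randn(1, 77, 768)

# Rebuild the legacy _encode_prompt() layout: one batch with [uncond; cond] stacked.
legacy_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
print(legacy_embeds.shape)  # torch.Size([2, 77, 768])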
@@ -501,12 +501,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -538,14 +538,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -555,7 +555,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
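Several hunks in this diff only retype the legacy `callback(step, timestep, latents)` hook. A minimal sketch of a callback matching that documented signature (the commented pipeline call is hypothetical):

import torch

def log_progress(step: int, timestep: int, latents: torch.Tensor) -> None:
    # The pipeline ignores the return value; the hook is for side effects only.
    print(f"step={step} t={int(timestep)} latents={tuple(latents.shape)} dtype={latents.dtype}")

# image = pipe(prompt, callback=log_progress, callback_steps=5).images[0]  # hypothetical usage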
diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py index 9e4c80451d6c..aea9e9073f8e 100644 --- a/examples/community/interpolate_stable_diffusion.py +++ b/examples/community/interpolate_stable_diffusion.py @@ -133,12 +133,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, - text_embeddings: Optional[torch.FloatTensor] = None, + text_embeddings: Optional[torch.Tensor] = None, **kwargs, ): r""" @@ -171,7 +171,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -183,11 +183,11 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. - text_embeddings (`torch.FloatTensor`, *optional*, defaults to `None`): + text_embeddings (`torch.Tensor`, *optional*, defaults to `None`): Pre-generated text embeddings to be used as inputs for image generation. Can be used in place of `prompt` to avoid re-computing the embeddings. If not provided, the embeddings will be generated from the supplied `prompt`. 
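The `ip_adapter_face_id.py` hunks that follow retype a small projection module whose forward pass is visible in the diff: a feed-forward expansion, a reshape into `num_tokens` cross-attention tokens, then LayerNorm. A hedged, self-contained sketch of that pattern; the class name and the internals of the feed-forward block are illustrative, not the file's exact implementation:

import torch
import torch.nn as nn

class FaceIDProjection(nn.Module):
    def __init__(self, image_embed_dim: int = 1024, cross_attention_dim: int = 1024, num_tokens: int = 4):
        super().__init__()
        self.num_tokens = num_tokens
        self.cross_attention_dim = cross_attention_dim
        # Stand-in for diffusers' FeedForward(image_embed_dim, cross_attention_dim * num_tokens, activation_fn="gelu")
        self.ff = nn.Sequential(
            nn.Linear(image_embed_dim, image_embed_dim * 2),
            nn.GELU(),
            nn.Linear(image_embed_dim * 2, cross_attention_dim * num_tokens),
        )
        self.norm = nn.LayerNorm(cross_attention_dim)

    def forward(self, image_embeds: torch.Tensor) -> torch.Tensor:
        x = self.ff(image_embeds)
        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
        return self.norm(x)

print(FaceIDProjection()(torch.randn(2, 1024)).shape)  # torch.Size([2, 4, 1024])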
diff --git a/examples/community/ip_adapter_face_id.py b/examples/community/ip_adapter_face_id.py index bb5a2a4fe5a9..befb48c7391e 100644 --- a/examples/community/ip_adapter_face_id.py +++ b/examples/community/ip_adapter_face_id.py @@ -62,7 +62,7 @@ def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_t self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu") self.norm = nn.LayerNorm(cross_attention_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): x = self.ff(image_embeds) x = x.reshape(-1, self.num_tokens, self.cross_attention_dim) return self.norm(x) @@ -452,8 +452,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -484,8 +484,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -505,10 +505,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
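`get_guidance_scale_embedding`, whose return annotation changes in the next hunk, turns the guidance scale w into a sinusoidal feature vector. A hedged sketch of that embedding: only the `w * 1000.0` scaling is taken from the hunk, and the frequency layout follows the common sinusoidal timestep-embedding recipe rather than the file's exact code:

import torch

def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512, dtype=torch.float32) -> torch.Tensor:
    assert w.ndim == 1
    w = w.to(dtype) * 1000.0
    half_dim = embedding_dim // 2
    # Log-spaced frequencies, then sin/cos features per scale value.
    freqs = torch.exp(-torch.log(torch.tensor(10000.0)) * torch.arange(half_dim, dtype=dtype) / (half_dim - 1))
    emb = w[:, None] * freqs[None, :]
    return torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)  # shape (len(w), embedding_dim)

print(guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=8).shape)  # torch.Size([1, 8])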
@@ -788,7 +788,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -847,10 +847,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -891,17 +891,17 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. diff --git a/examples/community/latent_consistency_img2img.py b/examples/community/latent_consistency_img2img.py index 3c5ffa845699..5fe53ab6b830 100644 --- a/examples/community/latent_consistency_img2img.py +++ b/examples/community/latent_consistency_img2img.py @@ -88,7 +88,7 @@ def _encode_prompt( torch device num_images_per_prompt (`int`): number of images that should be generated per prompt - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
""" @@ -282,10 +282,10 @@ def __call__( width: Optional[int] = 768, guidance_scale: float = 7.5, num_images_per_prompt: Optional[int] = 1, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, num_inference_steps: int = 4, lcm_origin_steps: int = 50, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -395,16 +395,16 @@ class LCMSchedulerOutput(BaseOutput): """ Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - denoised: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + denoised: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -452,10 +452,10 @@ def rescale_zero_terminal_snr(betas): """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -565,7 +565,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: @@ -587,17 +587,17 @@ def __init__( self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample @@ -613,7 +613,7 @@ def _get_variance(self, timestep, prev_timestep): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -685,25 +685,25 @@ def get_scalings_for_boundary_condition_discrete(self, t): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timeindex: int, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[LCMSchedulerOutput, Tuple]: """ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -714,7 +714,7 @@ def step( `use_clipped_model_output` has no effect. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -777,10 +777,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -799,9 +799,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) timesteps = timesteps.to(sample.device) diff --git a/examples/community/latent_consistency_interpolate.py b/examples/community/latent_consistency_interpolate.py index 3d2413c99189..8db70d3b9508 100644 --- a/examples/community/latent_consistency_interpolate.py +++ b/examples/community/latent_consistency_interpolate.py @@ -281,8 +281,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -302,10 +302,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
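The `interpolate_embedding`/`interpolate_latent` helpers retyped below pick between `lerp` and `slerp`. A generic sketch of those two interpolators, not necessarily identical to the `lerp`/`slerp` helpers defined elsewhere in that file:

import torch

def lerp(a: torch.Tensor, b: torch.Tensor, t: float) -> torch.Tensor:
    return (1 - t) * a + t * b

def slerp(a: torch.Tensor, b: torch.Tensor, t: float, eps: float = 1e-7) -> torch.Tensor:
    # Spherical interpolation: move from a toward b along the great circle between them.
    a_n, b_n = a / (a.norm() + eps), b / (b.norm() + eps)
    omega = torch.acos((a_n * b_n).sum().clamp(-1 + eps, 1 - eps))
    return (torch.sin((1 - t) * omega) * a + torch.sin(t * omega) * b) / torch.sin(omega)

a, b = torch.randn(77 * 768), torch.randn(77 * 768)
mid = slerp(a, b, 0.5)  # spherical midpoint between two flattened prompt embeddings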
@@ -506,7 +506,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -546,7 +546,7 @@ def check_inputs( height: int, width: int, callback_steps: int, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -580,11 +580,11 @@ def check_inputs( @torch.no_grad() def interpolate_embedding( self, - start_embedding: torch.FloatTensor, - end_embedding: torch.FloatTensor, + start_embedding: torch.Tensor, + end_embedding: torch.Tensor, num_interpolation_steps: Union[int, List[int]], interpolation_type: str, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if interpolation_type == "lerp": interpolation_fn = lerp elif interpolation_type == "slerp": @@ -611,11 +611,11 @@ def interpolate_embedding( @torch.no_grad() def interpolate_latent( self, - start_latent: torch.FloatTensor, - end_latent: torch.FloatTensor, + start_latent: torch.Tensor, + end_latent: torch.Tensor, num_interpolation_steps: Union[int, List[int]], interpolation_type: str, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if interpolation_type == "lerp": interpolation_fn = lerp elif interpolation_type == "slerp": @@ -663,8 +663,8 @@ def __call__( guidance_scale: float = 8.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -705,11 +705,11 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/examples/community/latent_consistency_txt2img.py b/examples/community/latent_consistency_txt2img.py index c31d6abae368..9f25a6db2722 100755 --- a/examples/community/latent_consistency_txt2img.py +++ b/examples/community/latent_consistency_txt2img.py @@ -86,7 +86,7 @@ def _encode_prompt( torch device num_images_per_prompt (`int`): number of images that should be generated per prompt - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, text embeddings will be generated from `prompt` input argument. """ @@ -208,10 +208,10 @@ def __call__( width: Optional[int] = 768, guidance_scale: float = 7.5, num_images_per_prompt: Optional[int] = 1, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, num_inference_steps: int = 4, lcm_origin_steps: int = 50, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -310,16 +310,16 @@ class LCMSchedulerOutput(BaseOutput): """ Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - denoised: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + denoised: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -367,10 +367,10 @@ def rescale_zero_terminal_snr(betas): """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -477,7 +477,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: @@ -499,17 +499,17 @@ def __init__( self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample @@ -525,7 +525,7 @@ def _get_variance(self, timestep, prev_timestep): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -593,25 +593,25 @@ def get_scalings_for_boundary_condition_discrete(self, t): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timeindex: int, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[LCMSchedulerOutput, Tuple]: """ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -622,7 +622,7 @@ def step( `use_clipped_model_output` has no effect. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -685,10 +685,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -707,9 +707,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) timesteps = timesteps.to(sample.device) diff --git a/examples/community/llm_grounded_diffusion.py b/examples/community/llm_grounded_diffusion.py index c3b07ade7ba7..24b0a9d0e209 100644 --- a/examples/community/llm_grounded_diffusion.py +++ b/examples/community/llm_grounded_diffusion.py @@ -756,13 +756,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -807,14 +807,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. @@ -825,7 +825,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -1194,8 +1194,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -1227,8 +1227,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -1248,10 +1248,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
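One concrete reason for the `torch.FloatTensor` to `torch.Tensor` migration running through these files: `torch.FloatTensor` only matches float32 CPU tensors, so checks like the `preprocess_mask` isinstance test in `lpw_stable_diffusion.py` below silently misfire for fp16 or CUDA inputs. An illustrative demo:

import torch

mask = torch.rand(1, 1, 64, 64)
print(isinstance(mask, torch.FloatTensor))         # True: float32 on CPU
print(isinstance(mask.half(), torch.FloatTensor))  # False: a float16 tensor is not a FloatTensor
print(isinstance(mask.half(), torch.Tensor))       # True: torch.Tensor matches any dtype/device
# On CUDA the same float32 data becomes a torch.cuda.FloatTensor and also fails the old check.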
@@ -1509,7 +1509,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index d1c8b357e16f..3665794db8f7 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -381,7 +381,7 @@ def preprocess_image(image, batch_size): def preprocess_mask(mask, batch_size, scale_factor=8): - if not isinstance(mask, torch.FloatTensor): + if not isinstance(mask, torch.Tensor): mask = mask.convert("L") w, h = mask.size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 @@ -546,8 +546,8 @@ def _encode_prompt( do_classifier_free_guidance, negative_prompt=None, max_embeddings_multiples=3, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -770,8 +770,8 @@ def __call__( self, prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, height: int = 512, width: int = 512, num_inference_steps: int = 50, @@ -781,13 +781,13 @@ def __call__( add_predicted_noise: Optional[bool] = False, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -801,10 +801,10 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + mask_image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. 
If it's a tensor, it should @@ -839,14 +839,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -860,7 +860,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. @@ -1035,13 +1035,13 @@ def text2img( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1075,14 +1075,14 @@ def text2img( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1096,7 +1096,7 @@ def text2img( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. @@ -1140,7 +1140,7 @@ def text2img( def img2img( self, - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, strength: float = 0.8, @@ -1149,12 +1149,12 @@ def img2img( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1162,7 +1162,7 @@ def img2img( r""" Function for image-to-image generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. prompt (`str` or `List[str]`): @@ -1193,10 +1193,10 @@ def img2img( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1210,7 +1210,7 @@ def img2img( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. 
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. @@ -1252,8 +1252,8 @@ def img2img( def inpaint( self, - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, strength: float = 0.8, @@ -1263,12 +1263,12 @@ def inpaint( add_predicted_noise: Optional[bool] = False, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1276,10 +1276,10 @@ def inpaint( r""" Function for inpaint. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. - mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + mask_image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should @@ -1314,10 +1314,10 @@ def inpaint( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1331,7 +1331,7 @@ def inpaint( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. 
is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py index 64b7973e894b..4ea98b1306fb 100644 --- a/examples/community/lpw_stable_diffusion_xl.py +++ b/examples/community/lpw_stable_diffusion_xl.py @@ -694,10 +694,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -722,17 +722,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
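The hunks above, like the remaining ones in this diff, only relax type annotations from `torch.FloatTensor` to `torch.Tensor`; no runtime behavior changes. As a minimal sketch of why the looser hint is the accurate one (not taken from the diff itself; model-free, shapes illustrative): scheduler entry points such as `DDPMScheduler.add_noise` and `get_velocity` are routinely called with half-precision latents, which are `torch.Tensor` but not `torch.FloatTensor` (the legacy float32 CPU tensor class).

```py
# Minimal sketch, not part of the diff: float16 latents are torch.Tensor,
# not torch.FloatTensor, so the relaxed hints match real call sites.
import torch
from diffusers import DDPMScheduler

scheduler = DDPMScheduler(num_train_timesteps=1000)

latents = torch.randn(1, 4, 64, 64, dtype=torch.float16)  # plain torch.Tensor
noise = torch.randn_like(latents)
timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (1,))

# x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps
noisy = scheduler.add_noise(latents, noise, timesteps)
# v_t = sqrt(alpha_bar_t) * eps - sqrt(1 - alpha_bar_t) * x_0
velocity = scheduler.get_velocity(latents, noise, timesteps)

print(noisy.dtype, velocity.shape)  # torch.float16 torch.Size([1, 4, 64, 64])
```

The pipeline-level arguments retyped throughout the rest of this diff (`latents`, `prompt_embeds`, `callback`, and so on) follow the same pattern.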
@@ -1320,7 +1320,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1378,7 +1378,7 @@ def __call__( prompt_2: Optional[str] = None, image: Optional[PipelineImageInput] = None, mask_image: Optional[PipelineImageInput] = None, - masked_image_latents: Optional[torch.FloatTensor] = None, + masked_image_latents: Optional[torch.Tensor] = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 0.8, @@ -1392,12 +1392,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1481,23 +1481,23 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1926,12 +1926,12 @@ def text2img( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -2001,12 +2001,12 @@ def img2img( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -2066,7 +2066,7 @@ def inpaint( prompt_2: Optional[str] = None, image: Optional[PipelineImageInput] = None, mask_image: Optional[PipelineImageInput] = None, - masked_image_latents: Optional[torch.FloatTensor] = None, + masked_image_latents: Optional[torch.Tensor] = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 0.8, @@ -2080,12 +2080,12 @@ def inpaint( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, diff --git a/examples/community/masked_stable_diffusion_img2img.py b/examples/community/masked_stable_diffusion_img2img.py index 0b08086c7da9..a210c167a2f3 100644 --- a/examples/community/masked_stable_diffusion_img2img.py +++ b/examples/community/masked_stable_diffusion_img2img.py @@ -16,10 +16,10 @@ def __call__( self, prompt: Union[str, List[str]] = None, image: Union[ - 
torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -30,18 +30,18 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, mask: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -52,7 +52,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can also accept image latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): @@ -78,10 +78,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -91,14 +91,14 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
- mask (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`, *optional*): + mask (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`, *optional*): A mask with non-zero elements for the area to be inpainted. If not specified, no mask is applied. Examples: diff --git a/examples/community/mixture_canvas.py b/examples/community/mixture_canvas.py index 2083c7acad38..7196ee9587f2 100644 --- a/examples/community/mixture_canvas.py +++ b/examples/community/mixture_canvas.py @@ -154,7 +154,7 @@ def encode_prompt(self, text_encoder, device): class Image2ImageRegion(DiffusionRegion): """Class defining a region where an image guided diffusion process is acting""" - reference_image: torch.FloatTensor = None + reference_image: torch.Tensor = None strength: float = 0.8 # Strength of the image def __post_init__(self): diff --git a/examples/community/multilingual_stable_diffusion.py b/examples/community/multilingual_stable_diffusion.py index f3b0540cf4d3..dc335e0b585e 100644 --- a/examples/community/multilingual_stable_diffusion.py +++ b/examples/community/multilingual_stable_diffusion.py @@ -147,10 +147,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -184,7 +184,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -196,7 +196,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
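The `latents`, `generator`, and `callback` arguments renamed in the docstrings above keep their documented semantics. A hedged sketch of how they fit together, using the stock `StableDiffusionPipeline` as a stand-in for the community pipelines touched here (model id, prompt, and shapes are illustrative, not taken from the diff):

```py
# Hedged sketch, not part of the diff: a step callback matching
# `callback(step: int, timestep: int, latents: torch.Tensor)` plus user-supplied latents.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

def log_step(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Invoked every `callback_steps` steps with the current noisy latents.
    print(f"step={step} t={int(timestep)} latents={tuple(latents.shape)} {latents.dtype}")

generator = torch.Generator("cuda").manual_seed(0)
latents = torch.randn(
    (1, pipe.unet.config.in_channels, 64, 64),  # 64x64 latents -> 512x512 image for SD 1.5
    generator=generator,
    device="cuda",
    dtype=torch.float16,
)

image = pipe(
    "a watercolor painting of a lighthouse",
    latents=latents,        # pre-generated noise, reusable across prompts
    callback=log_step,
    callback_steps=5,
).images[0]
```

Because `latents` is supplied explicitly, rerunning the call with a different prompt reuses the same starting noise, which is the "tweak the same generation with different prompts" use case the docstrings describe.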
diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py index 4b1a64142704..ac0aa38254e5 100644 --- a/examples/community/pipeline_animatediff_controlnet.py +++ b/examples/community/pipeline_animatediff_controlnet.py @@ -198,8 +198,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -219,10 +219,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -752,9 +752,9 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, ip_adapter_image_embeds: Optional[PipelineImageInput] = None, conditioning_frames: Optional[List[PipelineImageInput]] = None, @@ -798,20 +798,20 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. 
- ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. @@ -821,7 +821,7 @@ def __call__( are specified, images must be passed as a list such that each element of the list can be correctly batched for input to a single ControlNet. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py index d9209122262f..7546fbd9bcb0 100644 --- a/examples/community/pipeline_animatediff_img2video.py +++ b/examples/community/pipeline_animatediff_img2video.py @@ -315,8 +315,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -336,10 +336,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
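For the `ip_adapter_image` / `ip_adapter_image_embeds` entries retyped above, a hedged sketch of the usual flow with the standard Stable Diffusion pipeline (the IP-Adapter repository and weight file name are the commonly used ones, not something this diff introduces; the reference image path is hypothetical). `ip_adapter_image_embeds` is simply the precomputed form: a list with one tensor of shape `(batch_size, num_images, emb_dim)` per loaded IP-Adapter, including the negative embedding when classifier-free guidance is enabled.

```py
# Hedged sketch, not part of the diff: condition generation on a reference image via IP-Adapter.
import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils import load_image

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
pipe.set_ip_adapter_scale(0.6)  # how strongly the reference image conditions the result

reference = load_image("reference.png")  # hypothetical local path to any reference image

image = pipe(
    "best quality, high quality",
    ip_adapter_image=reference,   # the image counterpart of ip_adapter_image_embeds
    num_inference_steps=30,
).images[0]
```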
@@ -746,14 +746,14 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, ip_adapter_image_embeds: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -791,33 +791,33 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`AnimateDiffImgToVideoPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. 
callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/examples/community/pipeline_demofusion_sdxl.py b/examples/community/pipeline_demofusion_sdxl.py index f46d635dae2b..b4d47f1856ba 100644 --- a/examples/community/pipeline_demofusion_sdxl.py +++ b/examples/community/pipeline_demofusion_sdxl.py @@ -187,10 +187,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -215,17 +215,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
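The four embedding arguments documented above (`prompt_embeds`, `negative_prompt_embeds`, `pooled_prompt_embeds`, `negative_pooled_prompt_embeds`) mirror the standard SDXL pipeline. A hedged sketch of pre-computing them once and feeding them back, using `StableDiffusionXLPipeline` as the reference implementation (model id and prompts are illustrative, not taken from the diff):

```py
# Hedged sketch, not part of the diff: pre-compute SDXL prompt embeddings and reuse them.
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

(
    prompt_embeds,                   # per-token embeddings from both text encoders
    negative_prompt_embeds,
    pooled_prompt_embeds,            # pooled output of text_encoder_2
    negative_pooled_prompt_embeds,
) = pipe.encode_prompt(
    prompt="a cozy cabin in a snowy forest, golden hour",
    negative_prompt="blurry, low quality",
    do_classifier_free_guidance=True,
)

image = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
).images[0]
```

Skipping re-encoding this way is the "prompt weighting" use case the docstrings refer to: code that builds weighted embeddings can hand the finished tensors straight to the pipeline.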
@@ -642,14 +642,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = False, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -720,21 +720,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -746,7 +746,7 @@ def __call__( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
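Several of these community pipelines (for example `lpw_stable_diffusion_xl.py` above and `pipeline_sdxl_style_aligned.py` below) also retype the return of `get_guidance_scale_embedding` to `torch.Tensor`. For reference, a self-contained sketch of that sinusoidal embedding, written to match the shape contract in the docstring, `(len(w), embedding_dim)`; treat it as an illustration rather than the exact library code:

```py
import math

import torch
import torch.nn.functional as F


def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512, dtype=torch.float32) -> torch.Tensor:
    # Sinusoidal embedding of the guidance scale(s), used when the UNet is
    # guidance-distilled (i.e. config.time_cond_proj_dim is set).
    assert len(w.shape) == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = math.log(10000.0) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero-pad odd dimensions
        emb = F.pad(emb, (0, 1))
    return emb  # a torch.Tensor of shape (len(w), embedding_dim)


w_embedding = guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=256)
print(w_embedding.shape)  # torch.Size([1, 256])
```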
diff --git a/examples/community/pipeline_fabric.py b/examples/community/pipeline_fabric.py index 46692a96840b..f17c8e52f595 100644 --- a/examples/community/pipeline_fabric.py +++ b/examples/community/pipeline_fabric.py @@ -190,8 +190,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -210,10 +210,10 @@ def _encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -512,7 +512,7 @@ def __call__( neg_scale: float = 0.5, pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ): r""" The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The diff --git a/examples/community/pipeline_prompt2prompt.py b/examples/community/pipeline_prompt2prompt.py index db9f215b6bed..8e9bcddfef99 100644 --- a/examples/community/pipeline_prompt2prompt.py +++ b/examples/community/pipeline_prompt2prompt.py @@ -217,8 +217,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -250,8 +250,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -271,10 +271,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -564,12 +564,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -604,7 +604,7 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -616,7 +616,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_sdxl_style_aligned.py b/examples/community/pipeline_sdxl_style_aligned.py index 88edeeb7ee4c..5ad85dc90aff 100644 --- a/examples/community/pipeline_sdxl_style_aligned.py +++ b/examples/community/pipeline_sdxl_style_aligned.py @@ -514,10 +514,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -543,17 +543,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1325,7 +1325,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1387,7 +1387,7 @@ def __call__( prompt_2: Optional[Union[str, List[str]]] = None, image: Optional[PipelineImageInput] = None, mask_image: Optional[PipelineImageInput] = None, - masked_image_latents: Optional[torch.FloatTensor] = None, + masked_image_latents: Optional[torch.Tensor] = None, strength: float = 0.3, height: Optional[int] = None, width: Optional[int] = None, @@ -1401,11 +1401,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -1474,21 +1474,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. diff --git a/examples/community/pipeline_stable_diffusion_boxdiff.py b/examples/community/pipeline_stable_diffusion_boxdiff.py new file mode 100644 index 000000000000..f82533944132 --- /dev/null +++ b/examples/community/pipeline_stable_diffusion_boxdiff.py @@ -0,0 +1,1700 @@ +# Copyright 2024 Jingyang Zhang and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
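The remainder of this diff adds `examples/community/pipeline_stable_diffusion_boxdiff.py` as a new community pipeline. Community pipelines are loaded through the `custom_pipeline` argument rather than imported from the package. A hedged sketch, assuming the file is available under `examples/community/` for the installed revision; otherwise point `custom_pipeline` at a local copy of the `.py` file:

```py
# Hedged sketch, not part of the diff: loading a community pipeline by file name.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    custom_pipeline="pipeline_stable_diffusion_boxdiff",  # resolved from examples/community/
    torch_dtype=torch.float16,
).to("cuda")
# The BoxDiff-specific __call__ arguments (phrases and bounding boxes) are defined
# further down in the added file and are not reproduced here.
```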
+ +import abc +import inspect +import math +import numbers +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from diffusers.configuration_utils import FrozenDict +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from diffusers.models.attention_processor import Attention, FusedAttnProcessor2_0 +from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from diffusers.utils.torch_utils import randn_tensor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionPipeline + + >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" + + +class GaussianSmoothing(nn.Module): + """ + Copied from official repo: https://github.com/showlab/BoxDiff/blob/master/utils/gaussian_smoothing.py + Apply gaussian smoothing on a + 1d, 2d or 3d tensor. Filtering is performed seperately for each channel + in the input using a depthwise convolution. + Arguments: + channels (int, sequence): Number of channels of the input tensors. Output will + have this number of channels as well. + kernel_size (int, sequence): Size of the gaussian kernel. + sigma (float, sequence): Standard deviation of the gaussian kernel. + dim (int, optional): The number of dimensions of the data. + Default value is 2 (spatial). + """ + + def __init__(self, channels, kernel_size, sigma, dim=2): + super(GaussianSmoothing, self).__init__() + if isinstance(kernel_size, numbers.Number): + kernel_size = [kernel_size] * dim + if isinstance(sigma, numbers.Number): + sigma = [sigma] * dim + + # The gaussian kernel is the product of the + # gaussian function of each dimension. + kernel = 1 + meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size]) + for size, std, mgrid in zip(kernel_size, sigma, meshgrids): + mean = (size - 1) / 2 + kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2)) + + # Make sure sum of values in gaussian kernel equals 1. 
+ kernel = kernel / torch.sum(kernel) + + # Reshape to depthwise convolutional weight + kernel = kernel.view(1, 1, *kernel.size()) + kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1)) + + self.register_buffer("weight", kernel) + self.groups = channels + + if dim == 1: + self.conv = F.conv1d + elif dim == 2: + self.conv = F.conv2d + elif dim == 3: + self.conv = F.conv3d + else: + raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim)) + + def forward(self, input): + """ + Apply gaussian filter to input. + Arguments: + input (torch.Tensor): Input to apply gaussian filter on. + Returns: + filtered (torch.Tensor): Filtered output. + """ + return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups) + + +class AttendExciteCrossAttnProcessor: + def __init__(self, attnstore, place_in_unet): + super().__init__() + self.attnstore = attnstore + self.place_in_unet = place_in_unet + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size=1) + query = attn.to_q(hidden_states) + + is_cross = encoder_hidden_states is not None + encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + self.attnstore(attention_probs, is_cross, self.place_in_unet) + + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +class AttentionControl(abc.ABC): + def step_callback(self, x_t): + return x_t + + def between_steps(self): + return + + # @property + # def num_uncond_att_layers(self): + # return 0 + + @abc.abstractmethod + def forward(self, attn, is_cross: bool, place_in_unet: str): + raise NotImplementedError + + def __call__(self, attn, is_cross: bool, place_in_unet: str): + if self.cur_att_layer >= self.num_uncond_att_layers: + self.forward(attn, is_cross, place_in_unet) + self.cur_att_layer += 1 + if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers: + self.cur_att_layer = 0 + self.cur_step += 1 + self.between_steps() + + def reset(self): + self.cur_step = 0 + self.cur_att_layer = 0 + + def __init__(self): + self.cur_step = 0 + self.num_att_layers = -1 + self.cur_att_layer = 0 + + +class AttentionStore(AttentionControl): + @staticmethod + def get_empty_store(): + return {"down_cross": [], "mid_cross": [], "up_cross": [], "down_self": [], "mid_self": [], "up_self": []} + + def forward(self, attn, is_cross: bool, place_in_unet: str): + key = f"{place_in_unet}_{'cross' if is_cross else 'self'}" + if attn.shape[1] <= 32**2: # avoid memory overhead + self.step_store[key].append(attn) + return attn + + def between_steps(self): + self.attention_store = self.step_store + if self.save_global_store: + with torch.no_grad(): + if len(self.global_store) == 0: + self.global_store = self.step_store + else: + for key in 
self.global_store: + for i in range(len(self.global_store[key])): + self.global_store[key][i] += self.step_store[key][i].detach() + self.step_store = self.get_empty_store() + self.step_store = self.get_empty_store() + + def get_average_attention(self): + average_attention = self.attention_store + return average_attention + + def get_average_global_attention(self): + average_attention = { + key: [item / self.cur_step for item in self.global_store[key]] for key in self.attention_store + } + return average_attention + + def reset(self): + super(AttentionStore, self).reset() + self.step_store = self.get_empty_store() + self.attention_store = {} + self.global_store = {} + + def __init__(self, save_global_store=False): + """ + Initialize an empty AttentionStore + :param step_index: used to visualize only a specific step in the diffusion process + """ + super(AttentionStore, self).__init__() + self.save_global_store = save_global_store + self.step_store = self.get_empty_store() + self.attention_store = {} + self.global_store = {} + self.curr_step_index = 0 + self.num_uncond_att_layers = 0 + + +def aggregate_attention( + attention_store: AttentionStore, res: int, from_where: List[str], is_cross: bool, select: int +) -> torch.Tensor: + """Aggregates the attention across the different layers and heads at the specified resolution.""" + out = [] + attention_maps = attention_store.get_average_attention() + + # for k, v in attention_maps.items(): + # for vv in v: + # print(vv.shape) + # exit() + + num_pixels = res**2 + for location in from_where: + for item in attention_maps[f"{location}_{'cross' if is_cross else 'self'}"]: + if item.shape[1] == num_pixels: + cross_maps = item.reshape(1, -1, res, res, item.shape[-1])[select] + out.append(cross_maps) + out = torch.cat(out, dim=0) + out = out.sum(0) / out.shape[0] + return out + + +def register_attention_control(model, controller): + attn_procs = {} + cross_att_count = 0 + for name in model.unet.attn_processors.keys(): + # cross_attention_dim = None if name.endswith("attn1.processor") else model.unet.config.cross_attention_dim + if name.startswith("mid_block"): + # hidden_size = model.unet.config.block_out_channels[-1] + place_in_unet = "mid" + elif name.startswith("up_blocks"): + # block_id = int(name[len("up_blocks.")]) + # hidden_size = list(reversed(model.unet.config.block_out_channels))[block_id] + place_in_unet = "up" + elif name.startswith("down_blocks"): + # block_id = int(name[len("down_blocks.")]) + # hidden_size = model.unet.config.block_out_channels[block_id] + place_in_unet = "down" + else: + continue + + cross_att_count += 1 + attn_procs[name] = AttendExciteCrossAttnProcessor(attnstore=controller, place_in_unet=place_in_unet) + model.unet.set_attn_processor(attn_procs) + controller.num_att_layers = cross_att_count + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionBoxDiffPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin +): + r""" + Pipeline for text-to-image generation using Stable Diffusion with BoxDiff. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. 
+ text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. 
Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. 
+ """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return text_inputs, prompt_embeds, negative_prompt_embeds + + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = 
self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + boxdiff_phrases, + boxdiff_boxes, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if boxdiff_phrases is not None or boxdiff_boxes is not None: + if not (boxdiff_phrases is not None and boxdiff_boxes is not None): + raise ValueError("Either both `boxdiff_phrases` and `boxdiff_boxes` must be passed or none of them.") + + if not isinstance(boxdiff_phrases, list) or not isinstance(boxdiff_boxes, list): + raise ValueError("`boxdiff_phrases` and `boxdiff_boxes` must be lists.") + + if len(boxdiff_phrases) != len(boxdiff_boxes): + raise ValueError( + "`boxdiff_phrases` and `boxdiff_boxes` must have the same length," + f" got: `boxdiff_phrases` {len(boxdiff_phrases)} != `boxdiff_boxes`" + f" {len(boxdiff_boxes)}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stages where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values + that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. 
+ + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. + """ + if not hasattr(self, "unet"): + raise ValueError("The pipeline must have `unet` for using FreeU.") + self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) + + def disable_freeu(self): + """Disables the FreeU mechanism if enabled.""" + self.unet.disable_freeu() + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. 
Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + def _compute_max_attention_per_index( + self, + attention_maps: torch.Tensor, + indices_to_alter: List[int], + smooth_attentions: bool = False, + sigma: float = 0.5, + kernel_size: int = 3, + normalize_eot: bool = False, + bboxes: List[int] = None, + L: int = 1, + P: float = 0.2, + ) -> List[torch.Tensor]: + """Computes the maximum attention value for each of the tokens we wish to alter.""" + last_idx = -1 + if normalize_eot: + prompt = self.prompt + if isinstance(self.prompt, list): + prompt = self.prompt[0] + last_idx = len(self.tokenizer(prompt)["input_ids"]) - 1 + attention_for_text = attention_maps[:, :, 1:last_idx] + attention_for_text *= 100 + attention_for_text = torch.nn.functional.softmax(attention_for_text, dim=-1) + + # Shift indices since we removed the first token "1:last_idx" + indices_to_alter = [index - 1 for index in indices_to_alter] + + # Extract the maximum values + max_indices_list_fg = [] + max_indices_list_bg = [] + dist_x = [] + dist_y = [] + + cnt = 0 + for i in indices_to_alter: + image = attention_for_text[:, :, i] + + # TODO + # box = [max(round(b / (512 / image.shape[0])), 0) for b in bboxes[cnt]] + # x1, y1, x2, y2 = box + H, W = image.shape + x1 = min(max(round(bboxes[cnt][0] * W), 0), W) + y1 = min(max(round(bboxes[cnt][1] * H), 0), H) + x2 = min(max(round(bboxes[cnt][2] * W), 0), W) + y2 = min(max(round(bboxes[cnt][3] * H), 0), H) + box = [x1, y1, x2, y2] + cnt += 1 + + # coordinates to masks + obj_mask = torch.zeros_like(image) + ones_mask = torch.ones([y2 - y1, 
x2 - x1], dtype=obj_mask.dtype).to(obj_mask.device) + obj_mask[y1:y2, x1:x2] = ones_mask + bg_mask = 1 - obj_mask + + if smooth_attentions: + smoothing = GaussianSmoothing(channels=1, kernel_size=kernel_size, sigma=sigma, dim=2).to(image.device) + input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode="reflect") + image = smoothing(input).squeeze(0).squeeze(0) + + # Inner-Box constraint + k = (obj_mask.sum() * P).long() + max_indices_list_fg.append((image * obj_mask).reshape(-1).topk(k)[0].mean()) + + # Outer-Box constraint + k = (bg_mask.sum() * P).long() + max_indices_list_bg.append((image * bg_mask).reshape(-1).topk(k)[0].mean()) + + # Corner Constraint + gt_proj_x = torch.max(obj_mask, dim=0)[0] + gt_proj_y = torch.max(obj_mask, dim=1)[0] + corner_mask_x = torch.zeros_like(gt_proj_x) + corner_mask_y = torch.zeros_like(gt_proj_y) + + # create gt according to the number config.L + N = gt_proj_x.shape[0] + corner_mask_x[max(box[0] - L, 0) : min(box[0] + L + 1, N)] = 1.0 + corner_mask_x[max(box[2] - L, 0) : min(box[2] + L + 1, N)] = 1.0 + corner_mask_y[max(box[1] - L, 0) : min(box[1] + L + 1, N)] = 1.0 + corner_mask_y[max(box[3] - L, 0) : min(box[3] + L + 1, N)] = 1.0 + dist_x.append((F.l1_loss(image.max(dim=0)[0], gt_proj_x, reduction="none") * corner_mask_x).mean()) + dist_y.append((F.l1_loss(image.max(dim=1)[0], gt_proj_y, reduction="none") * corner_mask_y).mean()) + + return max_indices_list_fg, max_indices_list_bg, dist_x, dist_y + + def _aggregate_and_get_max_attention_per_token( + self, + attention_store: AttentionStore, + indices_to_alter: List[int], + attention_res: int = 16, + smooth_attentions: bool = False, + sigma: float = 0.5, + kernel_size: int = 3, + normalize_eot: bool = False, + bboxes: List[int] = None, + L: int = 1, + P: float = 0.2, + ): + """Aggregates the attention for each token and computes the max activation value for each token to alter.""" + attention_maps = aggregate_attention( + attention_store=attention_store, + res=attention_res, + from_where=("up", "down", "mid"), + is_cross=True, + select=0, + ) + max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y = self._compute_max_attention_per_index( + attention_maps=attention_maps, + indices_to_alter=indices_to_alter, + smooth_attentions=smooth_attentions, + sigma=sigma, + kernel_size=kernel_size, + normalize_eot=normalize_eot, + bboxes=bboxes, + L=L, + P=P, + ) + return max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y + + @staticmethod + def _compute_loss( + max_attention_per_index_fg: List[torch.Tensor], + max_attention_per_index_bg: List[torch.Tensor], + dist_x: List[torch.Tensor], + dist_y: List[torch.Tensor], + return_losses: bool = False, + ) -> torch.Tensor: + """Computes the attend-and-excite loss using the maximum attention value for each token.""" + losses_fg = [max(0, 1.0 - curr_max) for curr_max in max_attention_per_index_fg] + losses_bg = [max(0, curr_max) for curr_max in max_attention_per_index_bg] + loss = sum(losses_fg) + sum(losses_bg) + sum(dist_x) + sum(dist_y) + if return_losses: + return max(losses_fg), losses_fg + else: + return max(losses_fg), loss + + @staticmethod + def _update_latent(latents: torch.Tensor, loss: torch.Tensor, step_size: float) -> torch.Tensor: + """Update the latent according to the computed loss.""" + grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents], retain_graph=True)[0] + latents = latents - step_size * grad_cond + return latents + + def _perform_iterative_refinement_step( + self, + latents: 
torch.Tensor, + indices_to_alter: List[int], + loss_fg: torch.Tensor, + threshold: float, + text_embeddings: torch.Tensor, + text_input, + attention_store: AttentionStore, + step_size: float, + t: int, + attention_res: int = 16, + smooth_attentions: bool = True, + sigma: float = 0.5, + kernel_size: int = 3, + max_refinement_steps: int = 20, + normalize_eot: bool = False, + bboxes: List[int] = None, + L: int = 1, + P: float = 0.2, + ): + """ + Performs the iterative latent refinement introduced in the paper. Here, we continuously update the latent + code according to our loss objective until the given threshold is reached for all tokens. + """ + iteration = 0 + target_loss = max(0, 1.0 - threshold) + + while loss_fg > target_loss: + iteration += 1 + + latents = latents.clone().detach().requires_grad_(True) + # noise_pred_text = self.unet(latents, t, encoder_hidden_states=text_embeddings[1].unsqueeze(0)).sample + self.unet.zero_grad() + + # Get max activation value for each subject token + ( + max_attention_per_index_fg, + max_attention_per_index_bg, + dist_x, + dist_y, + ) = self._aggregate_and_get_max_attention_per_token( + attention_store=attention_store, + indices_to_alter=indices_to_alter, + attention_res=attention_res, + smooth_attentions=smooth_attentions, + sigma=sigma, + kernel_size=kernel_size, + normalize_eot=normalize_eot, + bboxes=bboxes, + L=L, + P=P, + ) + + loss_fg, losses_fg = self._compute_loss( + max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y, return_losses=True + ) + + if loss_fg != 0: + latents = self._update_latent(latents, loss_fg, step_size) + + # with torch.no_grad(): + # noise_pred_uncond = self.unet(latents, t, encoder_hidden_states=text_embeddings[0].unsqueeze(0)).sample + # noise_pred_text = self.unet(latents, t, encoder_hidden_states=text_embeddings[1].unsqueeze(0)).sample + + # try: + # low_token = np.argmax([l.item() if not isinstance(l, int) else l for l in losses_fg]) + # except Exception as e: + # print(e) # catch edge case :) + # low_token = np.argmax(losses_fg) + + # low_word = self.tokenizer.decode(text_input.input_ids[0][indices_to_alter[low_token]]) + # print(f'\t Try {iteration}. {low_word} has a max attention of {max_attention_per_index_fg[low_token]}') + + if iteration >= max_refinement_steps: + # print(f'\t Exceeded max number of iterations ({max_refinement_steps})! ' + # f'Finished with a max attention of {max_attention_per_index_fg[low_token]}') + break + + # Run one more time but don't compute gradients and update the latents. 
+ # We just need to compute the new loss - the grad update will occur below + latents = latents.clone().detach().requires_grad_(True) + # noise_pred_text = self.unet(latents, t, encoder_hidden_states=text_embeddings[1].unsqueeze(0)).sample + self.unet.zero_grad() + + # Get max activation value for each subject token + ( + max_attention_per_index_fg, + max_attention_per_index_bg, + dist_x, + dist_y, + ) = self._aggregate_and_get_max_attention_per_token( + attention_store=attention_store, + indices_to_alter=indices_to_alter, + attention_res=attention_res, + smooth_attentions=smooth_attentions, + sigma=sigma, + kernel_size=kernel_size, + normalize_eot=normalize_eot, + bboxes=bboxes, + L=L, + P=P, + ) + loss_fg, losses_fg = self._compute_loss( + max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y, return_losses=True + ) + # print(f"\t Finished with loss of: {loss_fg}") + return loss_fg, latents, max_attention_per_index_fg + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + boxdiff_phrases: List[str] = None, + boxdiff_boxes: List[List[float]] = None, # TODO + boxdiff_kwargs: Optional[Dict[str, Any]] = { + "attention_res": 16, + "P": 0.2, + "L": 1, + "max_iter_to_alter": 25, + "loss_thresholds": {0: 0.05, 10: 0.5, 20: 0.8}, + "scale_factor": 20, + "scale_range": (1.0, 0.5), + "smooth_attentions": True, + "sigma": 0.5, + "kernel_size": 3, + "refine": False, + "normalize_eot": True, + }, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + + boxdiff_phrases (`List[str]`, *optional*): + The phrases whose cross-attention maps are constrained to the given boxes; each phrase should + correspond to a single token of the prompt. + boxdiff_boxes (`List[List[float]]`, *optional*): + One bounding box per phrase, given as `[x_min, y_min, x_max, y_max]` fractions of the image width + and height. + boxdiff_attention_res (`int`, *optional*, defaults to 16): + The resolution of the attention maps used for computing the BoxDiff loss. + boxdiff_P (`float`, *optional*, defaults to 0.2): + The fraction of pixels inside (and outside) each box whose largest attention values are averaged + when computing the inner-box and outer-box constraints. + boxdiff_L (`int`, *optional*, defaults to 1): + The number of pixels around the corner to be selected in BoxDiff loss. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference.
+ timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. 
The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + # -1. Register attention control (for BoxDiff) + attention_store = AttentionStore() + register_attention_control(self, attention_store) + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + # to deal with lora scaling and other possible forward hooks + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + boxdiff_phrases, + boxdiff_boxes, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + callback_on_step_end_tensor_inputs, + ) + self.prompt = prompt + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + text_inputs, prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ if self.do_classifier_free_guidance:
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+ if ip_adapter_image is not None:
+ output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+ image_embeds, negative_image_embeds = self.encode_image(
+ ip_adapter_image, device, num_images_per_prompt, output_hidden_state
+ )
+ if self.do_classifier_free_guidance:
+ image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+ # 4. Prepare timesteps
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+ # 5. Prepare latent variables
+ num_channels_latents = self.unet.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ device,
+ generator,
+ latents,
+ )
+
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 6.1 Add image embeds for IP-Adapter
+ added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+
+ # 6.2 Optionally get Guidance Scale Embedding
+ timestep_cond = None
+ if self.unet.config.time_cond_proj_dim is not None:
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+ timestep_cond = self.get_guidance_scale_embedding(
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+ ).to(device=device, dtype=latents.dtype)
+
+ # 6.3 Prepare BoxDiff inputs
+ # a) Indices to alter
+ input_ids = self.tokenizer(prompt)["input_ids"]
+ decoded = [self.tokenizer.decode([t]) for t in input_ids]
+ indices_to_alter = []
+ bboxes = []
+ for phrase, box in zip(boxdiff_phrases, boxdiff_boxes):
+ # a phrase may not correspond to a single decoded token; skip it in that case
+ if phrase not in decoded:
+ continue
+ indices_to_alter.append(decoded.index(phrase))
+ bboxes.append(box)
+
+ # b) BoxDiff hyperparameters
+ attention_res = boxdiff_kwargs.get("attention_res", 16)
+ smooth_attentions = boxdiff_kwargs.get("smooth_attentions", True)
+ sigma = boxdiff_kwargs.get("sigma", 0.5)
+ kernel_size = boxdiff_kwargs.get("kernel_size", 3)
+ L = boxdiff_kwargs.get("L", 1)
+ P = boxdiff_kwargs.get("P", 0.2)
+ thresholds = boxdiff_kwargs.get("loss_thresholds", {0: 0.05, 10: 0.5, 20: 0.8})
+ max_iter_to_alter = boxdiff_kwargs.get("max_iter_to_alter", len(self.scheduler.timesteps) + 1)
+ scale_factor = boxdiff_kwargs.get("scale_factor", 20)
+ refine = boxdiff_kwargs.get("refine", False)
+ normalize_eot = boxdiff_kwargs.get("normalize_eot", True)
+
+ scale_range = boxdiff_kwargs.get("scale_range", (1.0, 0.5))
+ scale_range = np.linspace(scale_range[0], scale_range[1], len(self.scheduler.timesteps))
+
+ # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # BoxDiff optimization + with torch.enable_grad(): + latents = latents.clone().detach().requires_grad_(True) + + # Forward pass of denoising with text conditioning + noise_pred_text = self.unet( + latents, + t, + encoder_hidden_states=prompt_embeds[1].unsqueeze(0), + cross_attention_kwargs=cross_attention_kwargs, + ).sample + self.unet.zero_grad() + + # Get max activation value for each subject token + ( + max_attention_per_index_fg, + max_attention_per_index_bg, + dist_x, + dist_y, + ) = self._aggregate_and_get_max_attention_per_token( + attention_store=attention_store, + indices_to_alter=indices_to_alter, + attention_res=attention_res, + smooth_attentions=smooth_attentions, + sigma=sigma, + kernel_size=kernel_size, + normalize_eot=normalize_eot, + bboxes=bboxes, + L=L, + P=P, + ) + + loss_fg, loss = self._compute_loss( + max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y + ) + + # Refinement from attend-and-excite (not necessary) + if refine and i in thresholds.keys() and loss_fg > 1.0 - thresholds[i]: + del noise_pred_text + torch.cuda.empty_cache() + loss_fg, latents, max_attention_per_index_fg = self._perform_iterative_refinement_step( + latents=latents, + indices_to_alter=indices_to_alter, + loss_fg=loss_fg, + threshold=thresholds[i], + text_embeddings=prompt_embeds, + text_input=text_inputs, + attention_store=attention_store, + step_size=scale_factor * np.sqrt(scale_range[i]), + t=t, + attention_res=attention_res, + smooth_attentions=smooth_attentions, + sigma=sigma, + kernel_size=kernel_size, + normalize_eot=normalize_eot, + bboxes=bboxes, + L=L, + P=P, + ) + + # Perform gradient update + if i < max_iter_to_alter: + _, loss = self._compute_loss( + max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y + ) + if loss != 0: + latents = self._update_latent( + latents=latents, loss=loss, step_size=scale_factor * np.sqrt(scale_range[i]) + ) + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/examples/community/pipeline_stable_diffusion_pag.py b/examples/community/pipeline_stable_diffusion_pag.py index cdb7bd99cb29..5c588adc4f0d 100644 --- a/examples/community/pipeline_stable_diffusion_pag.py +++ b/examples/community/pipeline_stable_diffusion_pag.py @@ -57,13 +57,13 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
deprecate("scale", "1.0.0", deprecation_message) @@ -171,13 +171,13 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -493,8 +493,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -525,8 +525,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -545,10 +545,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -966,7 +966,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 dtype: data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1078,11 +1078,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1122,18 +1122,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the `ip_adapter_image` input argument. 
output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py index 61c4fbc13e07..a873e7b2956e 100644 --- a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py +++ b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py @@ -164,8 +164,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -197,8 +197,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -218,10 +218,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -460,7 +460,7 @@ def check_inputs( ) # verify batch size of prompt and image are same if image is a list or tensor or numpy array - if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray): + if isinstance(image, (list, np.ndarray, torch.Tensor)): if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -535,12 +535,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, target_res: Optional[List[int]] = [1024, 1024], @@ -551,7 +551,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
- image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be upscaled. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -570,14 +570,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -587,7 +587,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py index 82c522b4489a..6cb4928f7673 100644 --- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py +++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py @@ -248,10 +248,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -277,17 +277,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -887,14 +887,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -922,14 +922,14 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - adapter_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + adapter_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. 
- control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -973,21 +973,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -999,7 +999,7 @@ def __call__( instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
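The docstring above still describes the legacy `callback`/`callback_steps` pair, while the deprecation notices earlier in this diff point users to `callback_on_step_end`: a hook that receives the pipeline, the step index, the timestep, and a dict of the tensors named in `callback_on_step_end_tensor_inputs`, and returns a dict whose entries overwrite those tensors. A usage sketch under that assumption; the checkpoint name and prompt are placeholders, not values from this patch.

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

def on_step_end(pipeline, step, timestep, callback_kwargs):
    # Log latent statistics each step; returned entries overwrite the tensors
    # named in `callback_on_step_end_tensor_inputs`.
    latents = callback_kwargs["latents"]
    print(f"step {step}: latent std {latents.std().item():.3f}")
    return {"latents": latents}

image = pipe(
    "a photo of an astronaut riding a horse",
    callback_on_step_end=on_step_end,
    callback_on_step_end_tensor_inputs=["latents"],
).images[0]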
diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py index a85f1c3da6fb..c5495f756c9f 100644 --- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py +++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py @@ -396,10 +396,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -425,17 +425,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
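The `prompt_embeds`/`pooled_prompt_embeds` arguments documented in the hunk above exist so a prompt can be encoded once and reused across several calls. A sketch against the stock SDXL pipeline, assuming its `encode_prompt` returns `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds)` as in current diffusers releases; the checkpoint and prompts are placeholders.

import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Encode once ...
(
    prompt_embeds,
    negative_prompt_embeds,
    pooled_prompt_embeds,
    negative_pooled_prompt_embeds,
) = pipe.encode_prompt(
    prompt="a watercolor painting of a lighthouse",
    negative_prompt="blurry, low quality",
    device=pipe.device,
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
)

# ... then reuse the embeddings across seeds without re-running the text encoders.
for seed in (0, 1, 2):
    image = pipe(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        generator=torch.Generator("cuda").manual_seed(seed),
    ).images[0]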
@@ -1229,14 +1229,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[Union[torch.FloatTensor]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[Union[torch.Tensor]] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -1270,14 +1270,14 @@ def __call__( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - adapter_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + adapter_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -1330,21 +1330,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1356,7 +1356,7 @@ def __call__( instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py index 49fed61254c6..ae9708a6faf6 100644 --- a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py +++ b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py @@ -280,10 +280,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -309,17 +309,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -868,7 +868,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -881,7 +881,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -942,10 +942,10 @@ def __call__( prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -960,13 +960,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -982,12 +982,12 @@ def __call__( clip_skip: Optional[int] = None, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], - map: torch.FloatTensor = None, + map: torch.Tensor = None, original_image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + 
List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -1003,7 +1003,7 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): The image(s) to modify with the pipeline. strength (`float`, *optional*, defaults to 0.3): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -1051,26 +1051,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. @@ -1083,7 +1083,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. 
callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py b/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py index f1b8dfa96f4e..fb46ff3f3897 100644 --- a/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py +++ b/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py @@ -561,12 +561,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -596,10 +596,10 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -632,24 +632,24 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. diff --git a/examples/community/pipeline_stable_diffusion_xl_instantid.py b/examples/community/pipeline_stable_diffusion_xl_instantid.py index 147ba46a07ae..6e77261f512d 100644 --- a/examples/community/pipeline_stable_diffusion_xl_instantid.py +++ b/examples/community/pipeline_stable_diffusion_xl_instantid.py @@ -559,12 +559,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -592,10 +592,10 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. 
The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -628,24 +628,24 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. diff --git a/examples/community/pipeline_stable_diffusion_xl_ipex.py b/examples/community/pipeline_stable_diffusion_xl_ipex.py index a44ccf89eadd..791b077a84ca 100644 --- a/examples/community/pipeline_stable_diffusion_xl_ipex.py +++ b/examples/community/pipeline_stable_diffusion_xl_ipex.py @@ -276,10 +276,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -305,17 +305,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -687,7 +687,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -750,11 +750,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -826,21 +826,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1190,11 +1190,11 @@ def prepare_for_ipex( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, diff --git a/examples/community/pipeline_zero1to3.py b/examples/community/pipeline_zero1to3.py index 5e02ba286679..af1c82dfad95 100644 --- a/examples/community/pipeline_zero1to3.py +++ b/examples/community/pipeline_zero1to3.py @@ -192,8 +192,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -211,10 +211,10 @@ def _encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -481,7 +481,7 @@ def check_inputs(self, image, height, width, callback_steps): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -595,8 +595,8 @@ def prepare_img_latents(self, image, batch_size, dtype, device, generator=None, @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - input_imgs: Union[torch.FloatTensor, PIL.Image.Image] = None, - prompt_imgs: Union[torch.FloatTensor, PIL.Image.Image] = None, + input_imgs: Union[torch.Tensor, PIL.Image.Image] = None, + prompt_imgs: Union[torch.Tensor, PIL.Image.Image] = None, poses: Union[List[float], List[List[float]]] = None, torch_dtype=torch.float32, height: Optional[int] = None, @@ -607,12 +607,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: float = 1.0, @@ -650,14 +650,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -669,7 +669,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
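The hunks above retype the step callback as `callback(step: int, timestep: int, latents: torch.Tensor)` and the `latents`/`prompt_embeds` inputs as plain tensors. A rough sketch of user code written against that annotation, assuming a stock Stable Diffusion checkpoint rather than this community pipeline:

import torch
from diffusers import StableDiffusionPipeline

def on_step(step: int, timestep: int, latents: torch.Tensor) -> None:
    # `latents` is the current noisy latent batch, e.g. shape (1, 4, 64, 64).
    print(f"step {step:03d}  t={timestep}  latents shape {tuple(latents.shape)}")

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
image = pipe("a photo of an astronaut", callback=on_step, callback_steps=1).images[0]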
diff --git a/examples/community/regional_prompting_stable_diffusion.py b/examples/community/regional_prompting_stable_diffusion.py index 71f24a81bd15..19715a4fb6c1 100644 --- a/examples/community/regional_prompting_stable_diffusion.py +++ b/examples/community/regional_prompting_stable_diffusion.py @@ -104,7 +104,7 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, rp_args: Dict[str, str] = None, @@ -168,7 +168,7 @@ def getcompelembs(prps): orig_hw = (height, width) revers = True - def pcallback(s_self, step: int, timestep: int, latents: torch.FloatTensor, selfs=None): + def pcallback(s_self, step: int, timestep: int, latents: torch.Tensor, selfs=None): if "PRO" in mode: # in Prompt mode, make masks from sum of attension maps self.step = step @@ -198,10 +198,10 @@ def pcallback(s_self, step: int, timestep: int, latents: torch.FloatTensor, self def hook_forward(module): # diffusers==0.23.2 def forward( - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, ) -> torch.Tensor: attn = module diff --git a/examples/community/rerender_a_video.py b/examples/community/rerender_a_video.py index ad414390c9b6..6e25b92603d5 100644 --- a/examples/community/rerender_a_video.py +++ b/examples/community/rerender_a_video.py @@ -142,12 +142,12 @@ class TextToVideoSDPipelineOutput(BaseOutput): Output class for text-to-video pipelines. Args: - frames (`List[np.ndarray]` or `torch.FloatTensor`) + frames (`List[np.ndarray]` or `torch.Tensor`) List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as a `torch` tensor. The length of the list denotes the video length (the number of frames). 
""" - frames: Union[List[np.ndarray], torch.FloatTensor] + frames: Union[List[np.ndarray], torch.Tensor] @torch.no_grad() @@ -589,20 +589,20 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - frames: Union[List[np.ndarray], torch.FloatTensor] = None, - control_frames: Union[List[np.ndarray], torch.FloatTensor] = None, + frames: Union[List[np.ndarray], torch.Tensor] = None, + control_frames: Union[List[np.ndarray], torch.Tensor] = None, strength: float = 0.8, num_inference_steps: int = 50, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.8, @@ -624,8 +624,8 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - frames (`List[np.ndarray]` or `torch.FloatTensor`): The input images to be used as the starting point for the image generation process. - control_frames (`List[np.ndarray]` or `torch.FloatTensor`): The ControlNet input images condition to provide guidance to the `unet` for generation. + frames (`List[np.ndarray]` or `torch.Tensor`): The input images to be used as the starting point for the image generation process. + control_frames (`List[np.ndarray]` or `torch.Tensor`): The ControlNet input images condition to provide guidance to the `unet` for generation. strength ('float'): SDEdit strength. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -646,14 +646,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -665,7 +665,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/run_onnx_controlnet.py b/examples/community/run_onnx_controlnet.py index ed9b23318414..af2672c17e59 100644 --- a/examples/community/run_onnx_controlnet.py +++ b/examples/community/run_onnx_controlnet.py @@ -507,18 +507,18 @@ def __call__( fp16: bool = True, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, control_image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -531,12 +531,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.8, @@ -551,14 +551,14 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image will be used as the starting point for the image generation process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. 
ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in init, images must be passed as a list such that each element of the list can be correctly @@ -588,14 +588,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -607,7 +607,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
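The `image`/`control_image` docstrings above note that a `torch.Tensor` condition is forwarded to the ControlNet unchanged, while a `PIL.Image.Image` is preprocessed first. A hedged sketch of that path using the mainline ControlNet pipeline rather than the ONNX community script; the checkpoint ids are illustrative, not taken from this diff:

import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet
)

# (batch, channels, height, width) in [0, 1]; passed to the ControlNet as is.
canny_map = torch.zeros(1, 3, 512, 512)
image = pipe("a futuristic city at night", image=canny_map, num_inference_steps=20).images[0]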
diff --git a/examples/community/run_tensorrt_controlnet.py b/examples/community/run_tensorrt_controlnet.py index aece5484e304..873195fa31ba 100644 --- a/examples/community/run_tensorrt_controlnet.py +++ b/examples/community/run_tensorrt_controlnet.py @@ -611,18 +611,18 @@ def __call__( fp16: bool = True, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, control_image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -635,12 +635,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.8, @@ -655,14 +655,14 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image will be used as the starting point for the image generation process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized according to them. 
If multiple ControlNets are specified in init, images must be passed as a list such that each element of the list can be correctly @@ -692,14 +692,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -711,7 +711,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/scheduling_ufogen.py b/examples/community/scheduling_ufogen.py index 3c03b3e8dfda..4b1b92ff183a 100644 --- a/examples/community/scheduling_ufogen.py +++ b/examples/community/scheduling_ufogen.py @@ -34,16 +34,16 @@ class UFOGenSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -98,11 +98,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -218,7 +218,7 @@ def __init__( betas = torch.linspace(-6, 6, num_train_timesteps) self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: @@ -240,19 +240,19 @@ def __init__( self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -340,7 +340,7 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -375,9 +375,9 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[UFOGenSchedulerOutput, Tuple]: @@ -386,11 +386,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. 
@@ -461,10 +461,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -483,9 +483,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) timesteps = timesteps.to(sample.device) diff --git a/examples/community/sd_text2img_k_diffusion.py b/examples/community/sd_text2img_k_diffusion.py index 3299a7605257..9f83973aba91 100755 --- a/examples/community/sd_text2img_k_diffusion.py +++ b/examples/community/sd_text2img_k_diffusion.py @@ -286,10 +286,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -323,7 +323,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -335,7 +335,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
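The `latents` argument documented above is an ordinary `torch.Tensor`, so the same initial noise can be reused to "tweak the same generation with different prompts". A small sketch, assuming a standard 512x512 Stable Diffusion checkpoint:

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# 512 // vae_scale_factor == 64 for the standard SD autoencoder.
generator = torch.Generator("cpu").manual_seed(0)
latents = torch.randn((1, pipe.unet.config.in_channels, 64, 64), generator=generator)

img_a = pipe("a watercolor fox", latents=latents).images[0]
img_b = pipe("an oil painting of a fox", latents=latents).images[0]  # same noise, new prompt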
diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py index e9eba994b526..ae2d8a53b298 100644 --- a/examples/community/seed_resize_stable_diffusion.py +++ b/examples/community/seed_resize_stable_diffusion.py @@ -81,12 +81,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, - text_embeddings: Optional[torch.FloatTensor] = None, + text_embeddings: Optional[torch.Tensor] = None, **kwargs, ): r""" @@ -119,7 +119,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -131,7 +131,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
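`prompt_embeds` and `negative_prompt_embeds` are likewise plain `torch.Tensor` values, so they can be computed once and reused across calls; the seed-resize pipeline above exposes a similar `text_embeddings` argument. A sketch against the public `StableDiffusionPipeline.encode_prompt` API, with an illustrative checkpoint id:

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

prompt_embeds, negative_embeds = pipe.encode_prompt(
    prompt="a lighthouse at dusk",
    device=pipe.device,
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
    negative_prompt="blurry, low quality",
)
image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds).images[0]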
diff --git a/examples/community/speech_to_image_diffusion.py b/examples/community/speech_to_image_diffusion.py index 3537ef89e1a1..9cb5a2a8c7f9 100644 --- a/examples/community/speech_to_image_diffusion.py +++ b/examples/community/speech_to_image_diffusion.py @@ -76,10 +76,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): diff --git a/examples/community/stable_diffusion_comparison.py b/examples/community/stable_diffusion_comparison.py index dab5705b3370..2b510a64f854 100644 --- a/examples/community/stable_diffusion_comparison.py +++ b/examples/community/stable_diffusion_comparison.py @@ -96,10 +96,10 @@ def text2img_sd1_1( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -133,10 +133,10 @@ def text2img_sd1_2( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -170,10 +170,10 @@ def text2img_sd1_3( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -207,10 +207,10 @@ def text2img_sd1_4( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -244,10 +244,10 @@ def _call_( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -276,7 +276,7 @@ def _call_( generator (`torch.Generator`, optional): A [torch 
generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, optional): + latents (`torch.Tensor`, optional): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/examples/community/stable_diffusion_controlnet_img2img.py b/examples/community/stable_diffusion_controlnet_img2img.py index 74674e65f0ef..c7c88d6fdcc7 100644 --- a/examples/community/stable_diffusion_controlnet_img2img.py +++ b/examples/community/stable_diffusion_controlnet_img2img.py @@ -189,8 +189,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -207,10 +207,10 @@ def _encode_prompt( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -600,7 +600,7 @@ def __call__( prompt: Union[str, List[str]] = None, image: Union[torch.Tensor, PIL.Image.Image] = None, controlnet_conditioning_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image] ] = None, strength: float = 0.8, height: Optional[int] = None, @@ -611,12 +611,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, @@ -633,9 +633,9 @@ def __call__( image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will be masked out with `mask_image` and repainted according to `prompt`. 
- controlnet_conditioning_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): + controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. strength (`float`, *optional*): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -667,14 +667,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -686,7 +686,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_controlnet_inpaint.py b/examples/community/stable_diffusion_controlnet_inpaint.py index a13497dddcaf..b473ffe79933 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint.py +++ b/examples/community/stable_diffusion_controlnet_inpaint.py @@ -288,8 +288,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -306,10 +306,10 @@ def _encode_prompt( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -744,7 +744,7 @@ def __call__( image: Union[torch.Tensor, PIL.Image.Image] = None, mask_image: Union[torch.Tensor, PIL.Image.Image] = None, controlnet_conditioning_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image] ] = None, height: Optional[int] = None, width: Optional[int] = None, @@ -754,12 +754,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, @@ -779,9 +779,9 @@ def __call__( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - controlnet_conditioning_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): + controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -807,14 +807,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -826,7 +826,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py index 14c4e4aa6d4e..8928f34239e3 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py +++ b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py @@ -273,8 +273,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -291,10 +291,10 @@ def _encode_prompt( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -731,7 +731,7 @@ def __call__( image: Union[torch.Tensor, PIL.Image.Image] = None, mask_image: Union[torch.Tensor, PIL.Image.Image] = None, controlnet_conditioning_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image] ] = None, strength: float = 0.8, height: Optional[int] = None, @@ -742,12 +742,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: float = 1.0, @@ -767,9 +767,9 @@ def __call__( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - controlnet_conditioning_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): + controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. strength (`float`, *optional*): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -801,14 +801,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -820,7 +820,7 @@ def __call__( plain tuple. 
callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py index 16f7f589b70b..e4d8e12f85a9 100644 --- a/examples/community/stable_diffusion_controlnet_reference.py +++ b/examples/community/stable_diffusion_controlnet_reference.py @@ -100,14 +100,14 @@ def __call__( self, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, - ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + ref_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -116,12 +116,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, @@ -139,17 +139,17 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in init, images must be passed as a list such that each element of the list can be correctly batched for input to a single controlnet. 
- ref_image (`torch.FloatTensor`, `PIL.Image.Image`): + ref_image (`torch.Tensor`, `PIL.Image.Image`): The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to Reference Control as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to Reference Control as is. `PIL.Image.Image` can also be accepted as an image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -176,14 +176,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -195,7 +195,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
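Per the updated `ref_image` docstring above, a tensor condition is handed to Reference Control as is. A rough sketch of loading this community pipeline by its file name via `custom_pipeline`; the checkpoint ids and the synthetic reference image are assumptions for illustration only:

import torch
from PIL import Image
from diffusers import ControlNetModel, DiffusionPipeline

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    custom_pipeline="stable_diffusion_controlnet_reference",
)

condition = torch.zeros(1, 3, 512, 512)                     # ControlNet condition, (B, C, H, W)
reference = Image.new("RGB", (512, 512), (127, 127, 127))   # placeholder reference image
image = pipe("a portrait photo", image=condition, ref_image=reference).images[0]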
@@ -374,10 +374,10 @@ def __call__( def hacked_basic_transformer_inner_forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, @@ -492,12 +492,12 @@ def hacked_mid_forward(self, *args, **kwargs): def hack_CrossAttnDownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 @@ -588,14 +588,14 @@ def hacked_DownBlock2D_forward(self, hidden_states, temb=None, *args, **kwargs): def hacked_CrossAttnUpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 # TODO(Patrick, William) - attention mask is not used diff --git a/examples/community/stable_diffusion_ipex.py b/examples/community/stable_diffusion_ipex.py index dd648fd8c708..92588ba8a2e8 100644 --- a/examples/community/stable_diffusion_ipex.py +++ b/examples/community/stable_diffusion_ipex.py @@ -311,8 +311,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -330,10 +330,10 @@ def _encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -567,12 +567,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): @@ -608,14 +608,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -627,7 +627,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
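The hunks above retype the legacy step-callback annotation from `torch.FloatTensor` to `torch.Tensor` in the pipeline docstrings. As a minimal usage sketch against the updated signature (the checkpoint id and prompt below are illustrative assumptions, not part of this patch):

```python
import torch
from diffusers import StableDiffusionPipeline

def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Invoked every `callback_steps` steps with the current latents tensor.
    print(f"step={step} timestep={timestep} latent_norm={latents.norm().item():.3f}")

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
image = pipe(
    "an astronaut riding a horse",
    callback=log_latents,
    callback_steps=5,
).images[0]
```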
diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py index 826d9662e303..12926e9a10b7 100644 --- a/examples/community/stable_diffusion_mega.py +++ b/examples/community/stable_diffusion_mega.py @@ -103,8 +103,8 @@ def components(self) -> Dict[str, Any]: def inpaint( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -114,7 +114,7 @@ def inpaint( generator: Optional[torch.Generator] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline @@ -138,7 +138,7 @@ def inpaint( def img2img( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -148,7 +148,7 @@ def img2img( generator: Optional[torch.Generator] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -181,10 +181,10 @@ def text2img( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): # For more information on how this function https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py index c2dd184c2fa4..15c3f8845f3b 100644 --- a/examples/community/stable_diffusion_reference.py +++ b/examples/community/stable_diffusion_reference.py @@ -78,7 +78,7 @@ def torch_dfs(model: torch.nn.Module): class StableDiffusionReferencePipeline( DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin ): - r""" " + r""" Pipeline for Stable Diffusion Reference. This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods @@ -268,10 +268,10 @@ def check_inputs( width: int, callback_steps: Optional[int], negative_prompt: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[torch.Tensor] = None, - ip_adapter_image_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image_embeds: Optional[torch.Tensor] = None, callback_on_step_end_tensor_inputs: Optional[List[str]] = None, ) -> None: """ @@ -283,10 +283,10 @@ def check_inputs( width (int): The width of the input image. callback_steps (Optional[int]): The number of steps to perform the callback on. negative_prompt (Optional[str]): The negative prompt text. - prompt_embeds (Optional[torch.FloatTensor]): The prompt embeddings. - negative_prompt_embeds (Optional[torch.FloatTensor]): The negative prompt embeddings. + prompt_embeds (Optional[torch.Tensor]): The prompt embeddings. + negative_prompt_embeds (Optional[torch.Tensor]): The negative prompt embeddings. ip_adapter_image (Optional[torch.Tensor]): The input adapter image. - ip_adapter_image_embeds (Optional[torch.FloatTensor]): The input adapter image embeddings. + ip_adapter_image_embeds (Optional[torch.Tensor]): The input adapter image embeddings. callback_on_step_end_tensor_inputs (Optional[List[str]]): The list of tensor inputs to perform the callback on. Raises: @@ -357,11 +357,11 @@ def _encode_prompt( num_images_per_prompt: int, do_classifier_free_guidance: bool, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Encodes the prompt into embeddings. @@ -371,13 +371,13 @@ def _encode_prompt( num_images_per_prompt (int): The number of images per prompt. do_classifier_free_guidance (bool): Whether to use classifier-free guidance. negative_prompt (Optional[Union[str, List[str]]], optional): The negative prompt text or a list of negative prompt texts. Defaults to None. - prompt_embeds (Optional[torch.FloatTensor], optional): The prompt embeddings. Defaults to None. - negative_prompt_embeds (Optional[torch.FloatTensor], optional): The negative prompt embeddings. Defaults to None. + prompt_embeds (Optional[torch.Tensor], optional): The prompt embeddings. Defaults to None. + negative_prompt_embeds (Optional[torch.Tensor], optional): The negative prompt embeddings. Defaults to None. lora_scale (Optional[float], optional): The LoRA scale. Defaults to None. **kwargs: Additional keyword arguments. Returns: - torch.FloatTensor: The encoded prompt embeddings. + torch.Tensor: The encoded prompt embeddings. """ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) @@ -407,11 +407,11 @@ def encode_prompt( num_images_per_prompt: int, do_classifier_free_guidance: bool, negative_prompt: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Encodes the prompt into text encoder hidden states. @@ -428,10 +428,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -813,7 +813,7 @@ def run_safety_checker( def __call__( self, prompt: Union[str, List[str]] = None, - ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + ref_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -822,12 +822,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -844,9 +844,9 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - ref_image (`torch.FloatTensor`, `PIL.Image.Image`): + ref_image (`torch.Tensor`, `PIL.Image.Image`): The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to Reference Control as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to Reference Control as is. `PIL.Image.Image` can also be accepted as an image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. 
@@ -873,14 +873,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -892,7 +892,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -1017,10 +1017,10 @@ def __call__( def hacked_basic_transformer_inner_forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, @@ -1135,12 +1135,12 @@ def hacked_mid_forward(self, *args, **kwargs): def hack_CrossAttnDownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 @@ -1191,10 +1191,10 @@ def hack_CrossAttnDownBlock2D_forward( def hacked_DownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, **kwargs: Any, - ) -> Tuple[torch.FloatTensor, ...]: + ) -> Tuple[torch.Tensor, ...]: eps = 1e-6 output_states = () @@ -1236,15 +1236,15 @@ def hacked_DownBlock2D_forward( def hacked_CrossAttnUpBlock2D_forward( self, - hidden_states: 
torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: eps = 1e-6 # TODO(Patrick, William) - attention mask is not used for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)): @@ -1292,12 +1292,12 @@ def hacked_CrossAttnUpBlock2D_forward( def hacked_UpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, **kwargs: Any, - ) -> torch.FloatTensor: + ) -> torch.Tensor: eps = 1e-6 for i, resnet in enumerate(self.resnets): # pop res hidden states diff --git a/examples/community/stable_diffusion_repaint.py b/examples/community/stable_diffusion_repaint.py index 02bef293bba8..2addc5a62dbb 100644 --- a/examples/community/stable_diffusion_repaint.py +++ b/examples/community/stable_diffusion_repaint.py @@ -285,8 +285,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -303,10 +303,10 @@ def _encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -606,8 +606,8 @@ def prepare_mask_latents( def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -618,12 +618,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -671,14 +671,14 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -690,7 +690,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
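Several of the docstring hunks above describe the `prompt_embeds` / `negative_prompt_embeds` arguments now annotated as `torch.Tensor`. A hedged sketch of how such embeddings are typically pre-computed and reused; it assumes the mainline `StableDiffusionPipeline.encode_prompt` helper rather than the deprecated `_encode_prompt` shown in this patch:

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# Pre-compute the text embeddings once; encode_prompt returns
# (prompt_embeds, negative_prompt_embeds) as plain torch.Tensor objects.
prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
    prompt="a watercolor landscape",
    device=pipe.device,
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
    negative_prompt="low quality",
)

# Reuse the cached embeddings instead of re-encoding the prompt on every call.
image = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
).images[0]
```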
diff --git a/examples/community/stable_diffusion_tensorrt_img2img.py b/examples/community/stable_diffusion_tensorrt_img2img.py index 90ec138398cf..7264a60506fe 100755 --- a/examples/community/stable_diffusion_tensorrt_img2img.py +++ b/examples/community/stable_diffusion_tensorrt_img2img.py @@ -962,7 +962,7 @@ def __loadResources(self, image_height, image_width, batch_size): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 0.8, num_inference_steps: int = 50, guidance_scale: float = 7.5, diff --git a/examples/community/stable_diffusion_tensorrt_inpaint.py b/examples/community/stable_diffusion_tensorrt_inpaint.py index 9ace697cc7a6..b2d61a3dab93 100755 --- a/examples/community/stable_diffusion_tensorrt_inpaint.py +++ b/examples/community/stable_diffusion_tensorrt_inpaint.py @@ -962,8 +962,8 @@ def __loadResources(self, image_height, image_width, batch_size): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 1.0, num_inference_steps: int = 50, guidance_scale: float = 7.5, diff --git a/examples/community/stable_diffusion_xl_reference.py b/examples/community/stable_diffusion_xl_reference.py index 7282582baf91..107afc1f8b7a 100644 --- a/examples/community/stable_diffusion_xl_reference.py +++ b/examples/community/stable_diffusion_xl_reference.py @@ -190,7 +190,7 @@ def __call__( self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + ref_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -201,14 +201,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -335,10 +335,10 @@ def __call__( def hacked_basic_transformer_inner_forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: 
Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, @@ -453,12 +453,12 @@ def hacked_mid_forward(self, *args, **kwargs): def hack_CrossAttnDownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 @@ -549,14 +549,14 @@ def hacked_DownBlock2D_forward(self, hidden_states, temb=None, *args, **kwargs): def hacked_CrossAttnUpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 # TODO(Patrick, William) - attention mask is not used diff --git a/examples/community/stable_unclip.py b/examples/community/stable_unclip.py index 6acca20d6a78..f13c4e0a490b 100644 --- a/examples/community/stable_unclip.py +++ b/examples/community/stable_unclip.py @@ -191,7 +191,7 @@ def __call__( num_images_per_prompt: int = 1, prior_num_inference_steps: int = 25, generator: Optional[torch.Generator] = None, - prior_latents: Optional[torch.FloatTensor] = None, + prior_latents: Optional[torch.Tensor] = None, text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, text_attention_mask: Optional[torch.Tensor] = None, prior_guidance_scale: float = 4.0, diff --git a/examples/community/text_inpainting.py b/examples/community/text_inpainting.py index ea4da966bb71..c4378ab96f28 100644 --- a/examples/community/text_inpainting.py +++ b/examples/community/text_inpainting.py @@ -125,7 +125,7 @@ def __init__( def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], text: str, height: int = 512, width: int = 512, @@ -135,10 +135,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -177,7 +177,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -189,7 +189,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/tiled_upscaling.py b/examples/community/tiled_upscaling.py index bf290042e822..313b5fc6f76a 100644 --- a/examples/community/tiled_upscaling.py +++ b/examples/community/tiled_upscaling.py @@ -193,8 +193,8 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + latents: Optional[torch.Tensor] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, tile_size: int = 128, tile_border: int = 32, @@ -206,7 +206,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`): + image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.Tensor`): `Image`, or tensor representing an image batch which will be upscaled. * num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -228,7 +228,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
diff --git a/examples/community/unclip_image_interpolation.py b/examples/community/unclip_image_interpolation.py index e3bb44e5030b..210bd61ecd1d 100644 --- a/examples/community/unclip_image_interpolation.py +++ b/examples/community/unclip_image_interpolation.py @@ -207,14 +207,14 @@ def _encode_image(self, image, device, num_images_per_prompt, image_embeddings: @torch.no_grad() def __call__( self, - image: Optional[Union[List[PIL.Image.Image], torch.FloatTensor]] = None, + image: Optional[Union[List[PIL.Image.Image], torch.Tensor]] = None, steps: int = 5, decoder_num_inference_steps: int = 25, super_res_num_inference_steps: int = 7, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, image_embeddings: Optional[torch.Tensor] = None, - decoder_latents: Optional[torch.FloatTensor] = None, - super_res_latents: Optional[torch.FloatTensor] = None, + decoder_latents: Optional[torch.Tensor] = None, + super_res_latents: Optional[torch.Tensor] = None, decoder_guidance_scale: float = 8.0, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -223,7 +223,7 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image (`List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`List[PIL.Image.Image]` or `torch.Tensor`): The images to use for the image interpolation. Only accepts a list of two PIL Images or If you provide a tensor, it needs to comply with the configuration of [this](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json) @@ -242,9 +242,9 @@ def __call__( image_embeddings (`torch.Tensor`, *optional*): Pre-defined image embeddings that can be derived from the image encoder. Pre-defined image embeddings can be passed for tasks like image interpolations. `image` can the be left to `None`. - decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. - super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. decoder_guidance_scale (`float`, *optional*, defaults to 4.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). 
@@ -272,19 +272,19 @@ def __call__( raise AssertionError( f"Expected 'image' List to contain PIL.Image.Image, but passed 'image' contents are {type(image[0])} and {type(image[1])}" ) - elif isinstance(image, torch.FloatTensor): + elif isinstance(image, torch.Tensor): if image.shape[0] != 2: raise AssertionError( - f"Expected 'image' to be torch.FloatTensor of shape 2 in 0th dimension, but passed 'image' size is {image.shape[0]}" + f"Expected 'image' to be torch.Tensor of shape 2 in 0th dimension, but passed 'image' size is {image.shape[0]}" ) elif isinstance(image_embeddings, torch.Tensor): if image_embeddings.shape[0] != 2: raise AssertionError( - f"Expected 'image_embeddings' to be torch.FloatTensor of shape 2 in 0th dimension, but passed 'image_embeddings' shape is {image_embeddings.shape[0]}" + f"Expected 'image_embeddings' to be torch.Tensor of shape 2 in 0th dimension, but passed 'image_embeddings' shape is {image_embeddings.shape[0]}" ) else: raise AssertionError( - f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or Torch.FloatTensor respectively. Received {type(image)} and {type(image_embeddings)} repsectively" + f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} repsectively" ) original_image_embeddings = self._encode_image( diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py index 241e661536d3..c866ce2ae904 100644 --- a/examples/community/wildcard_stable_diffusion.py +++ b/examples/community/wildcard_stable_diffusion.py @@ -166,10 +166,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = [], @@ -206,7 +206,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -218,7 +218,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
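The `isinstance` changes in `unclip_image_interpolation.py` above show why this migration matters at runtime, not just in docstrings: `torch.FloatTensor` only matches float32 CPU tensors, so half-precision or CUDA inputs fail the old check even though they are valid images. A small standalone sketch of the behavioral difference (not part of the patch itself):

```python
import torch

x = torch.randn(2, 4, dtype=torch.float16)  # e.g. an fp16 image batch

# Legacy type check: true only for float32 CPU tensors, so fp16/CUDA inputs are rejected.
print(isinstance(x, torch.FloatTensor))  # False
# Broadened check used after this patch: any dtype or device passes.
print(isinstance(x, torch.Tensor))       # True
```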
diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py index 1e88cb67ee71..e7d934dd07a8 100644 --- a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py @@ -336,7 +336,7 @@ def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py b/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py index 08d6b23d6deb..ce3e7f624843 100644 --- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py @@ -1358,7 +1358,7 @@ def compute_embeddings( # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE # solver timestep. with torch.no_grad(): - if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path: + if torch.backends.mps.is_available() or "playground" in args.pretrained_teacher_model: autocast_ctx = nullcontext() else: autocast_ctx = torch.autocast(accelerator.device.type) diff --git a/examples/consistency_distillation/train_lcm_distill_sd_wds.py b/examples/consistency_distillation/train_lcm_distill_sd_wds.py index 5dcad9f6cc39..7fbcb5d6fb91 100644 --- a/examples/consistency_distillation/train_lcm_distill_sd_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_sd_wds.py @@ -314,7 +314,7 @@ def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py index a7deca72a86f..c8b91e7abdd6 100644 --- a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py @@ -406,7 +406,7 @@ def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py index 6858fed8b994..fbd11fc01be2 100644 --- a/examples/custom_diffusion/train_custom_diffusion.py +++ b/examples/custom_diffusion/train_custom_diffusion.py @@ -152,7 +152,7 @@ def collate_fn(examples, with_prior_preservation): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
+ """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index a18c443e7d4d..796aba87e846 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -742,7 +742,7 @@ def collate_fn(examples, with_prior_preservation=False): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt @@ -759,7 +759,7 @@ def __getitem__(self, index): def model_has_vae(args): - config_file_name = os.path.join("vae", AutoencoderKL.config_name) + config_file_name = Path("vae", AutoencoderKL.config_name).as_posix() if os.path.isdir(args.pretrained_model_name_or_path): config_file_name = os.path.join(args.pretrained_model_name_or_path, config_file_name) return os.path.isfile(config_file_name) diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py index d89e46e4a690..23238c84b643 100644 --- a/examples/dreambooth/train_dreambooth_flax.py +++ b/examples/dreambooth/train_dreambooth_flax.py @@ -301,7 +301,7 @@ def __getitem__(self, index): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 0d33a0558989..f2cac3b1494d 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -680,7 +680,7 @@ def collate_fn(examples, with_prior_preservation=False): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index f3e347cd6ac9..9e3d82acdbfe 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -903,7 +903,7 @@ def collate_fn(examples, with_prior_preservation=False): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
+ """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py index 78f9b7f18b87..409978cb5373 100644 --- a/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py +++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py @@ -896,7 +896,6 @@ def collate_fn(examples): images = [] if args.validation_prompts is not None: logger.info("Running inference for collecting generated images...") - pipeline = pipeline.to(accelerator.device) pipeline.torch_dtype = weight_dtype pipeline.set_progress_bar_config(disable=True) pipeline.enable_model_cpu_offload() diff --git a/examples/research_projects/colossalai/train_dreambooth_colossalai.py b/examples/research_projects/colossalai/train_dreambooth_colossalai.py index 5cebd2b81175..10c8e095a696 100644 --- a/examples/research_projects/colossalai/train_dreambooth_colossalai.py +++ b/examples/research_projects/colossalai/train_dreambooth_colossalai.py @@ -327,7 +327,7 @@ def __getitem__(self, index): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/research_projects/consistency_training/train_cm_ct_unconditional.py b/examples/research_projects/consistency_training/train_cm_ct_unconditional.py index 947de230b6b2..b7a1e2a545f8 100644 --- a/examples/research_projects/consistency_training/train_cm_ct_unconditional.py +++ b/examples/research_projects/consistency_training/train_cm_ct_unconditional.py @@ -126,7 +126,7 @@ def get_karras_sigmas( return sigmas -def get_discretized_lognormal_weights(noise_levels: torch.FloatTensor, p_mean: float = -1.1, p_std: float = 2.0): +def get_discretized_lognormal_weights(noise_levels: torch.Tensor, p_mean: float = -1.1, p_std: float = 2.0): """ Calculates the unnormalized weights for a 1D array of noise level sigma_i based on the discretized lognormal" " distribution used in the iCT paper (given in Equation 10). @@ -137,14 +137,14 @@ def get_discretized_lognormal_weights(noise_levels: torch.FloatTensor, p_mean: f return weights -def get_loss_weighting_schedule(noise_levels: torch.FloatTensor): +def get_loss_weighting_schedule(noise_levels: torch.Tensor): """ Calculates the loss weighting schedule lambda given a set of noise levels. 
""" return 1.0 / (noise_levels[1:] - noise_levels[:-1]) -def add_noise(original_samples: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.FloatTensor): +def add_noise(original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor): # Make sure timesteps (Karras sigmas) have the same device and dtype as original_samples sigmas = timesteps.to(device=original_samples.device, dtype=original_samples.dtype) while len(sigmas.shape) < len(original_samples.shape): diff --git a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py index 0e82a45c024f..9b484c89fdbd 100644 --- a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py +++ b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py @@ -385,7 +385,7 @@ def __getitem__(self, index): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py index 3d79b2ceadaf..720a69b4e791 100644 --- a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py +++ b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py @@ -384,7 +384,7 @@ def __getitem__(self, index): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb b/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb index 6997c74bcad1..bde093802a5d 100644 --- a/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb +++ b/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb @@ -737,11 +737,11 @@ "class MoleculeGNNOutput(BaseOutput):\n", " \"\"\"\n", " Args:\n", - " sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):\n", + " sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):\n", " Hidden states output. 
Output of last layer of model.\n", " \"\"\"\n", "\n", - " sample: torch.FloatTensor\n", + " sample: torch.Tensor\n", "\n", "\n", "class MultiLayerPerceptron(nn.Module):\n", @@ -1354,7 +1354,7 @@ " r\"\"\"\n", " Args:\n", " sample: packed torch geometric object\n", - " timestep (`torch.FloatTensor` or `float` or `int): TODO verify type and shape (batch) timesteps\n", + " timestep (`torch.Tensor` or `float` or `int): TODO verify type and shape (batch) timesteps\n", " return_dict (`bool`, *optional*, defaults to `True`):\n", " Whether or not to return a [`~models.molecule_gnn.MoleculeGNNOutput`] instead of a plain tuple.\n", " Returns:\n", @@ -1404,7 +1404,7 @@ " if not return_dict:\n", " return (-eps_pos,)\n", "\n", - " return MoleculeGNNOutput(sample=torch.FloatTensor(-eps_pos).to(pos.device))" + " return MoleculeGNNOutput(sample=torch.Tensor(-eps_pos).to(pos.device))" ], "metadata": { "id": "MCeZA1qQXzoK" diff --git a/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py b/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py index dcbc2704b833..61b1cbef195e 100644 --- a/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py +++ b/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py @@ -279,8 +279,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -312,8 +312,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -333,10 +333,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -852,7 +852,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -906,9 +906,9 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -928,10 +928,10 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -963,14 +963,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. @@ -981,7 +981,7 @@ def __call__( plain tuple. 
callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py b/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py index 4e9015960157..46cabd863dfa 100644 --- a/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py +++ b/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py @@ -181,11 +181,11 @@ def __init__( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, - controlnet_cond: torch.FloatTensor, - controlnet_query_cond: torch.FloatTensor, + controlnet_cond: torch.Tensor, + controlnet_query_cond: torch.Tensor, conditioning_scale: float = 1.0, class_labels: Optional[torch.Tensor] = None, timestep_cond: Optional[torch.Tensor] = None, @@ -194,20 +194,20 @@ def forward( cross_attention_kwargs: Optional[Dict[str, Any]] = None, guess_mode: bool = False, return_dict: bool = True, - ) -> Union[ControlNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]: + ) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]: """ The [`~PromptDiffusionControlNetModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor. timestep (`Union[torch.Tensor, float, int]`): The number of timesteps to denoise an input. encoder_hidden_states (`torch.Tensor`): The encoder hidden states. - controlnet_cond (`torch.FloatTensor`): + controlnet_cond (`torch.Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. - controlnet_query_cond (`torch.FloatTensor`): + controlnet_query_cond (`torch.Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. conditioning_scale (`float`, defaults to `1.0`): The scale factor for ControlNet outputs. diff --git a/examples/research_projects/rdm/pipeline_rdm.py b/examples/research_projects/rdm/pipeline_rdm.py index e0c4847c7e39..201acb95aabd 100644 --- a/examples/research_projects/rdm/pipeline_rdm.py +++ b/examples/research_projects/rdm/pipeline_rdm.py @@ -163,11 +163,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, knn: Optional[int] = 10, **kwargs, @@ -199,11 +199,11 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -213,7 +213,7 @@ def __call__( Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/research_projects/rdm/retriever.py b/examples/research_projects/rdm/retriever.py index 16518ed1bc42..6be9785a21f3 100644 --- a/examples/research_projects/rdm/retriever.py +++ b/examples/research_projects/rdm/retriever.py @@ -20,7 +20,7 @@ def normalize_images(images: List[Image.Image]): return images -def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtractor) -> torch.FloatTensor: +def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtractor) -> torch.Tensor: """ Preprocesses a list of images into a batch of tensors. @@ -29,7 +29,7 @@ def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtr A list of images to preprocess. Returns: - :obj:`torch.FloatTensor`: A batch of tensors. + :obj:`torch.Tensor`: A batch of tensors. """ images = [np.array(image) for image in images] images = [(image + 1.0) / 2.0 for image in images] diff --git a/examples/research_projects/realfill/requirements.txt b/examples/research_projects/realfill/requirements.txt index f6abdc6e7e20..c9e4e7e4ae72 100644 --- a/examples/research_projects/realfill/requirements.txt +++ b/examples/research_projects/realfill/requirements.txt @@ -1,6 +1,6 @@ diffusers==0.20.1 accelerate==0.23.0 -transformers==4.36.0 +transformers==4.38.0 peft==0.5.0 torch==2.0.1 torchvision>=0.16 diff --git a/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth.py b/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth.py index 779da7328d97..5f7ca2262dcc 100644 --- a/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth.py +++ b/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth.py @@ -762,7 +762,7 @@ def collate_fn(examples, with_prior_preservation=False): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
+ """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py b/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py index 73c67dd5ddf7..250f8702d996 100644 --- a/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py +++ b/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora.py @@ -700,7 +700,7 @@ def collate_fn(examples, with_prior_preservation=False): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py b/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py index 0004e6d6e87a..e8c9cb796ab1 100644 --- a/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/research_projects/scheduled_huber_loss_training/dreambooth/train_dreambooth_lora_sdxl.py @@ -922,7 +922,7 @@ def collate_fn(examples, with_prior_preservation=False): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt diff --git a/examples/text_to_image/requirements.txt b/examples/text_to_image/requirements.txt index 0dd164fc2035..c3ffa42f0edc 100644 --- a/examples/text_to_image/requirements.txt +++ b/examples/text_to_image/requirements.txt @@ -1,8 +1,8 @@ accelerate>=0.16.0 torchvision transformers>=4.25.1 -datasets +datasets>=2.19.1 ftfy tensorboard Jinja2 -peft==0.7.0 \ No newline at end of file +peft==0.7.0 diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index 90602ad597a9..74864da20d82 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -35,7 +35,7 @@ import transformers from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import ProjectConfiguration, set_seed +from accelerate.utils import DistributedType, ProjectConfiguration, set_seed from datasets import concatenate_datasets, load_dataset from huggingface_hub import create_repo, upload_folder from packaging import version @@ -50,7 +50,7 @@ from diffusers.training_utils import EMAModel, compute_snr from diffusers.utils import check_min_version, is_wandb_available from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card -from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available from diffusers.utils.torch_utils import is_compiled_module @@ -58,7 +58,8 @@ check_min_version("0.28.0.dev0") logger = get_logger(__name__) - +if is_torch_npu_available(): + torch.npu.config.allow_internal_format = False DATASET_NAME_MAPPING = { "lambdalabs/naruto-blip-captions": ("image", "text"), @@ -460,6 +461,9 @@ def parse_args(input_args=None): ), ) 
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention." + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) @@ -716,7 +720,12 @@ def main(args): args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant ) ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config) - + if args.enable_npu_flash_attention: + if is_torch_npu_available(): + logger.info("npu flash attention enabled.") + unet.enable_npu_flash_attention() + else: + raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.") if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): import xformers @@ -742,7 +751,8 @@ def save_model_hook(models, weights, output_dir): model.save_pretrained(os.path.join(output_dir, "unet")) # make sure to pop weight so that corresponding model is not saved again - weights.pop() + if weights: + weights.pop() def load_model_hook(models, input_dir): if args.use_ema: @@ -914,7 +924,7 @@ def preprocess_train(examples): train_dataset_with_vae = train_dataset.map( compute_vae_encodings_fn, batched=True, - batch_size=args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps, + batch_size=args.train_batch_size, new_fingerprint=new_fingerprint_for_vae, ) precomputed_dataset = concatenate_datasets( @@ -1160,7 +1170,8 @@ def compute_time_ids(original_size, crops_coords_top_left): accelerator.log({"train_loss": train_loss}, step=global_step) train_loss = 0.0 - if accelerator.is_main_process: + # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues. + if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process: if global_step % args.checkpointing_steps == 0: # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` if args.checkpoints_total_limit is not None: diff --git a/examples/vqgan/README.md b/examples/vqgan/README.md new file mode 100644 index 000000000000..0b0f3589baf5 --- /dev/null +++ b/examples/vqgan/README.md @@ -0,0 +1,127 @@ +## Training an VQGAN VAE +VQVAEs were first introduced in [Neural Discrete Representation Learning](https://arxiv.org/abs/1711.00937) and was combined with a GAN in the paper [Taming Transformers for High-Resolution Image Synthesis](https://arxiv.org/abs/2012.09841). The basic idea of a VQVAE is it's a type of a variational auto encoder with tokens as the latent space similar to tokens for LLMs. This script was adapted from a [pr to huggingface's open-muse project](https://github.com/huggingface/open-muse/pull/52) with general code following [lucidrian's implementation of the vqgan training script](https://github.com/lucidrains/muse-maskgit-pytorch/blob/main/muse_maskgit_pytorch/trainers.py) but both of these implementation follow from the [taming transformer repo](https://github.com/CompVis/taming-transformers?tab=readme-ov-file). + + +Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets). 
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date, as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd into the example folder and run
+```bash
+pip install -r requirements.txt
+```
+
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+### Training on CIFAR10
+
+The following command trains a VQGAN model on the CIFAR-10 dataset:
+
+```bash
+accelerate launch train_vqgan.py \
+  --dataset_name=cifar10 \
+  --image_column=img \
+  --validation_images images/bird.jpg images/car.jpg images/dog.jpg images/frog.jpg \
+  --resolution=128 \
+  --train_batch_size=2 \
+  --gradient_accumulation_steps=8 \
+  --report_to=wandb
+```
+
+An example training run is available [here](https://wandb.ai/sayakpaul/vqgan-training/runs/0m5kzdfp), courtesy of @sayakpaul, and a lower-scale one [here](https://wandb.ai/dsbuddy27/vqgan-training/runs/eqd6xi4n?nw=nwuserisamu). The validation images can be obtained from [here](https://huggingface.co/datasets/diffusers/docs-images/tree/main/vqgan_validation_images).
+The simplest way to improve the quality of a VQGAN model is to maximize the amount of information present in the bottleneck. The easiest way to do this is to increase the image resolution. Other ways include, but are not limited to, lowering compression by downsampling fewer times or increasing the vocabulary size, which can be at most around 16384. How to do this is shown below.
+
+### Modifying the architecture
+
+To modify the architecture of the VQGAN model, save the config from [here](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder/blob/main/movq/config.json) and provide it to the script with the --model_config_name_or_path option.
This config is shown below:
+```json
+{
+  "_class_name": "VQModel",
+  "_diffusers_version": "0.17.0.dev0",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    256,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "AttnDownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "norm_type": "spatial",
+  "num_vq_embeddings": 16384,
+  "out_channels": 3,
+  "sample_size": 32,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "AttnUpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "vq_embed_dim": 4
+}
+```
+To reduce the number of layers in a VQGAN, you can remove layers by modifying block_out_channels, down_block_types, and up_block_types as shown below:
+```json
+{
+  "_class_name": "VQModel",
+  "_diffusers_version": "0.17.0.dev0",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    256
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "norm_type": "spatial",
+  "num_vq_embeddings": 16384,
+  "out_channels": 3,
+  "sample_size": 32,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "vq_embed_dim": 4
+}
+```
+To increase the vocabulary size, increase num_vq_embeddings. However, [some research](https://magvit.cs.cmu.edu/v2/) shows that VQGAN representations start degrading beyond 2^14 = 16384 VQ embeddings, so it is not recommended to go past that.
+
+### Extra training tips/ideas
+During logging, take care to keep data_time low. data_time is the time spent loading the data, during which the GPU is not active, so it is essentially wasted time. The easiest way to lower data_time is to increase --dataloader_num_workers to a higher number such as 4. Due to a bug in PyTorch, this only works on Linux-based systems. For more details, check [here](https://github.com/huggingface/diffusers/issues/7646).
+Secondly, training can be considered done when both the discriminator and the generator losses converge.
+Thirdly, another low-hanging fruit is enabling EMA with the --use_ema parameter. This tends to make the output images smoother. The downside is that you have to lower your batch size by 1, but it may be worth it.
+Another, more experimental, low-hanging fruit is switching from vgg19 to a different model for the LPIPS loss via --timm_model_backend. If you do this, it is recommended to also change the timm_model_layers parameter to the layer in your model that you think best represents the features. However, be careful with the feature map norms, since they can easily dominate the loss.
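+
+As a rough illustration of the tips above (the values and the config file name are placeholders; the flags themselves are the ones defined in train_vqgan.py), a run that combines them might look like:
+
+```bash
+# Illustrative values; my_vqmodel_config.json is a placeholder for your saved config.
+accelerate launch train_vqgan.py \
+  --dataset_name=cifar10 \
+  --image_column=img \
+  --resolution=128 \
+  --train_batch_size=2 \
+  --gradient_accumulation_steps=8 \
+  --model_config_name_or_path=my_vqmodel_config.json \
+  --dataloader_num_workers=4 \
+  --use_ema \
+  --timm_model_backend=vgg19 \
+  --timm_model_layers=head \
+  --report_to=wandb
+```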
\ No newline at end of file diff --git a/examples/vqgan/discriminator.py b/examples/vqgan/discriminator.py new file mode 100644 index 000000000000..eb31c3cb0165 --- /dev/null +++ b/examples/vqgan/discriminator.py @@ -0,0 +1,48 @@ +""" +Ported from Paella +""" + +import torch +from torch import nn + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.modeling_utils import ModelMixin + + +# Discriminator model ported from Paella https://github.com/dome272/Paella/blob/main/src_distributed/vqgan.py +class Discriminator(ModelMixin, ConfigMixin): + @register_to_config + def __init__(self, in_channels=3, cond_channels=0, hidden_channels=512, depth=6): + super().__init__() + d = max(depth - 3, 3) + layers = [ + nn.utils.spectral_norm( + nn.Conv2d(in_channels, hidden_channels // (2**d), kernel_size=3, stride=2, padding=1) + ), + nn.LeakyReLU(0.2), + ] + for i in range(depth - 1): + c_in = hidden_channels // (2 ** max((d - i), 0)) + c_out = hidden_channels // (2 ** max((d - 1 - i), 0)) + layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1))) + layers.append(nn.InstanceNorm2d(c_out)) + layers.append(nn.LeakyReLU(0.2)) + self.encoder = nn.Sequential(*layers) + self.shuffle = nn.Conv2d( + (hidden_channels + cond_channels) if cond_channels > 0 else hidden_channels, 1, kernel_size=1 + ) + self.logits = nn.Sigmoid() + + def forward(self, x, cond=None): + x = self.encoder(x) + if cond is not None: + cond = cond.view( + cond.size(0), + cond.size(1), + 1, + 1, + ).expand(-1, -1, x.size(-2), x.size(-1)) + x = torch.cat([x, cond], dim=1) + x = self.shuffle(x) + x = self.logits(x) + return x diff --git a/examples/vqgan/requirements.txt b/examples/vqgan/requirements.txt new file mode 100644 index 000000000000..f204a70f1e0e --- /dev/null +++ b/examples/vqgan/requirements.txt @@ -0,0 +1,8 @@ +accelerate>=0.16.0 +torchvision +transformers>=4.25.1 +datasets +timm +numpy +tqdm +tensorboard \ No newline at end of file diff --git a/examples/vqgan/test_vqgan.py b/examples/vqgan/test_vqgan.py new file mode 100644 index 000000000000..664a7f7365b0 --- /dev/null +++ b/examples/vqgan/test_vqgan.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import logging +import os +import shutil +import sys +import tempfile + +import torch + +from diffusers import VQModel +from diffusers.utils.testing_utils import require_timm + + +sys.path.append("..") +from test_examples_utils import ExamplesTestsAccelerate, run_command # noqa: E402 + + +logging.basicConfig(level=logging.DEBUG) + +logger = logging.getLogger() +stream_handler = logging.StreamHandler(sys.stdout) +logger.addHandler(stream_handler) + + +@require_timm +class TextToImage(ExamplesTestsAccelerate): + @property + def test_vqmodel_config(self): + return { + "_class_name": "VQModel", + "_diffusers_version": "0.17.0.dev0", + "act_fn": "silu", + "block_out_channels": [ + 32, + ], + "down_block_types": [ + "DownEncoderBlock2D", + ], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 2, + "norm_num_groups": 32, + "norm_type": "spatial", + "num_vq_embeddings": 32, + "out_channels": 3, + "sample_size": 32, + "scaling_factor": 0.18215, + "up_block_types": [ + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def test_discriminator_config(self): + return { + "_class_name": "Discriminator", + "_diffusers_version": "0.27.0.dev0", + "in_channels": 3, + "cond_channels": 0, + "hidden_channels": 8, + "depth": 4, + } + + def get_vq_and_discriminator_configs(self, tmpdir): + vqmodel_config_path = os.path.join(tmpdir, "vqmodel.json") + discriminator_config_path = os.path.join(tmpdir, "discriminator.json") + with open(vqmodel_config_path, "w") as fp: + json.dump(self.test_vqmodel_config, fp) + with open(discriminator_config_path, "w") as fp: + json.dump(self.test_discriminator_config, fp) + return vqmodel_config_path, discriminator_config_path + + def test_vqmodel(self): + with tempfile.TemporaryDirectory() as tmpdir: + vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir) + test_args = f""" + examples/vqgan/train_vqgan.py + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 32 + --image_column image + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --model_config_name_or_path {vqmodel_config_path} + --discriminator_config_name_or_path {discriminator_config_path} + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue( + os.path.isfile(os.path.join(tmpdir, "discriminator", "diffusion_pytorch_model.safetensors")) + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "vqmodel", "diffusion_pytorch_model.safetensors"))) + + def test_vqmodel_checkpointing(self): + with tempfile.TemporaryDirectory() as tmpdir: + vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir) + # Run training script with checkpointing + # max_train_steps == 4, checkpointing_steps == 2 + # Should create checkpoints at steps 2, 4 + + initial_run_args = f""" + examples/vqgan/train_vqgan.py + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 32 + --image_column image + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 4 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --model_config_name_or_path {vqmodel_config_path} + --discriminator_config_name_or_path {discriminator_config_path} + --checkpointing_steps=2 + --output_dir {tmpdir} + --seed=0 + """.split() + + run_command(self._launch_args + initial_run_args) 
+ + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4"}, + ) + + # check can run an intermediate checkpoint + model = VQModel.from_pretrained(tmpdir, subfolder="checkpoint-2/vqmodel") + image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) + _ = model(image) + + # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming + shutil.rmtree(os.path.join(tmpdir, "checkpoint-2")) + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-4"}, + ) + + # Run training script for 2 total steps resuming from checkpoint 4 + + resume_run_args = f""" + examples/vqgan/train_vqgan.py + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 32 + --image_column image + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 6 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --model_config_name_or_path {vqmodel_config_path} + --discriminator_config_name_or_path {discriminator_config_path} + --checkpointing_steps=1 + --resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')} + --output_dir {tmpdir} + --seed=0 + """.split() + + run_command(self._launch_args + resume_run_args) + + # check can run new fully trained pipeline + model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel") + image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) + _ = model(image) + + # no checkpoint-2 -> check old checkpoints do not exist + # check new checkpoints exist + # In the current script, checkpointing_steps 1 is equivalent to checkpointing_steps 2 as after the generator gets trained for one step, + # the discriminator gets trained and loss and saving happens after that. 
Thus we do not expect to get a checkpoint-5 + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-4", "checkpoint-6"}, + ) + + def test_vqmodel_checkpointing_use_ema(self): + with tempfile.TemporaryDirectory() as tmpdir: + vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir) + # Run training script with checkpointing + # max_train_steps == 4, checkpointing_steps == 2 + # Should create checkpoints at steps 2, 4 + + initial_run_args = f""" + examples/vqgan/train_vqgan.py + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 32 + --image_column image + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 4 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --model_config_name_or_path {vqmodel_config_path} + --discriminator_config_name_or_path {discriminator_config_path} + --checkpointing_steps=2 + --output_dir {tmpdir} + --use_ema + --seed=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel") + image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) + _ = model(image) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4"}, + ) + + # check can run an intermediate checkpoint + model = VQModel.from_pretrained(tmpdir, subfolder="checkpoint-2/vqmodel") + image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) + _ = model(image) + + # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming + shutil.rmtree(os.path.join(tmpdir, "checkpoint-2")) + + # Run training script for 2 total steps resuming from checkpoint 4 + + resume_run_args = f""" + examples/vqgan/train_vqgan.py + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 32 + --image_column image + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 6 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --model_config_name_or_path {vqmodel_config_path} + --discriminator_config_name_or_path {discriminator_config_path} + --checkpointing_steps=1 + --resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')} + --output_dir {tmpdir} + --use_ema + --seed=0 + """.split() + + run_command(self._launch_args + resume_run_args) + + # check can run new fully trained pipeline + model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel") + image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) + _ = model(image) + + # no checkpoint-2 -> check old checkpoints do not exist + # check new checkpoints exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-4", "checkpoint-6"}, + ) + + def test_vqmodel_checkpointing_checkpoints_total_limit(self): + with tempfile.TemporaryDirectory() as tmpdir: + vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir) + # Run training script with checkpointing + # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2 + # Should create checkpoints at steps 2, 4, 6 + # with checkpoint at step 2 deleted + + initial_run_args = f""" + examples/vqgan/train_vqgan.py + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 32 + --image_column image + 
--train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 6 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --model_config_name_or_path {vqmodel_config_path} + --discriminator_config_name_or_path {discriminator_config_path} + --output_dir {tmpdir} + --checkpointing_steps=2 + --checkpoints_total_limit=2 + --seed=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel") + image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) + _ = model(image) + + # check checkpoint directories exist + # checkpoint-2 should have been deleted + self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"}) + + def test_vqmodel_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + with tempfile.TemporaryDirectory() as tmpdir: + vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir) + # Run training script with checkpointing + # max_train_steps == 4, checkpointing_steps == 2 + # Should create checkpoints at steps 2, 4 + + initial_run_args = f""" + examples/vqgan/train_vqgan.py + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 32 + --image_column image + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 4 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --model_config_name_or_path {vqmodel_config_path} + --discriminator_config_name_or_path {discriminator_config_path} + --checkpointing_steps=2 + --output_dir {tmpdir} + --seed=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel") + image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) + _ = model(image) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4"}, + ) + + # resume and we should try to checkpoint at 6, where we'll have to remove + # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint + + resume_run_args = f""" + examples/vqgan/train_vqgan.py + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 32 + --image_column image + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 8 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --model_config_name_or_path {vqmodel_config_path} + --discriminator_config_name_or_path {discriminator_config_path} + --output_dir {tmpdir} + --checkpointing_steps=2 + --resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')} + --checkpoints_total_limit=2 + --seed=0 + """.split() + + run_command(self._launch_args + resume_run_args) + + model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel") + image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) + _ = model(image) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-6", "checkpoint-8"}, + ) diff --git a/examples/vqgan/train_vqgan.py b/examples/vqgan/train_vqgan.py new file mode 100644 index 000000000000..b7beee1f3b26 --- /dev/null +++ b/examples/vqgan/train_vqgan.py @@ -0,0 +1,1067 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import math +import os +import shutil +import time +from pathlib import Path + +import accelerate +import numpy as np +import PIL +import PIL.Image +import timm +import torch +import torch.nn.functional as F +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import DistributedType, ProjectConfiguration, set_seed +from datasets import load_dataset +from discriminator import Discriminator +from huggingface_hub import create_repo +from packaging import version +from PIL import Image +from timm.data import resolve_data_config +from timm.data.transforms_factory import create_transform +from torchvision import transforms +from tqdm import tqdm + +from diffusers import VQModel +from diffusers.optimization import get_scheduler +from diffusers.training_utils import EMAModel +from diffusers.utils import check_min_version, is_wandb_available + + +if is_wandb_available(): + import wandb + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.27.0.dev0") + +logger = get_logger(__name__, log_level="INFO") + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def _map_layer_to_idx(backbone, layers, offset=0): + """Maps set of layer names to indices of model. Ported from anomalib + + Returns: + Feature map extracted from the CNN + """ + idx = [] + features = timm.create_model( + backbone, + pretrained=False, + features_only=False, + exportable=True, + ) + for i in layers: + try: + idx.append(list(dict(features.named_children()).keys()).index(i) - offset) + except ValueError: + raise ValueError( + f"Layer {i} not found in model {backbone}. Select layer from {list(dict(features.named_children()).keys())}. 
The network architecture is {features}" + ) + return idx + + +def get_perceptual_loss(pixel_values, fmap, timm_model, timm_model_resolution, timm_model_normalization): + img_timm_model_input = timm_model_normalization(F.interpolate(pixel_values, timm_model_resolution)) + fmap_timm_model_input = timm_model_normalization(F.interpolate(fmap, timm_model_resolution)) + + if pixel_values.shape[1] == 1: + # handle grayscale for timm_model + img_timm_model_input, fmap_timm_model_input = ( + t.repeat(1, 3, 1, 1) for t in (img_timm_model_input, fmap_timm_model_input) + ) + + img_timm_model_feats = timm_model(img_timm_model_input) + recon_timm_model_feats = timm_model(fmap_timm_model_input) + perceptual_loss = F.mse_loss(img_timm_model_feats[0], recon_timm_model_feats[0]) + for i in range(1, len(img_timm_model_feats)): + perceptual_loss += F.mse_loss(img_timm_model_feats[i], recon_timm_model_feats[i]) + perceptual_loss /= len(img_timm_model_feats) + return perceptual_loss + + +def grad_layer_wrt_loss(loss, layer): + return torch.autograd.grad( + outputs=loss, + inputs=layer, + grad_outputs=torch.ones_like(loss), + retain_graph=True, + )[0].detach() + + +def gradient_penalty(images, output, weight=10): + gradients = torch.autograd.grad( + outputs=output, + inputs=images, + grad_outputs=torch.ones(output.size(), device=images.device), + create_graph=True, + retain_graph=True, + only_inputs=True, + )[0] + bsz = gradients.shape[0] + gradients = torch.reshape(gradients, (bsz, -1)) + return weight * ((gradients.norm(2, dim=1) - 1) ** 2).mean() + + +@torch.no_grad() +def log_validation(model, args, validation_transform, accelerator, global_step): + logger.info("Generating images...") + dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + dtype = torch.bfloat16 + original_images = [] + for image_path in args.validation_images: + image = PIL.Image.open(image_path) + if not image.mode == "RGB": + image = image.convert("RGB") + image = validation_transform(image).to(accelerator.device, dtype=dtype) + original_images.append(image[None]) + # Generate images + model.eval() + images = [] + for original_image in original_images: + image = accelerator.unwrap_model(model)(original_image).sample + images.append(image) + model.train() + original_images = torch.cat(original_images, dim=0) + images = torch.cat(images, dim=0) + + # Convert to PIL images + images = torch.clamp(images, 0.0, 1.0) + original_images = torch.clamp(original_images, 0.0, 1.0) + images *= 255.0 + original_images *= 255.0 + images = images.permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8) + original_images = original_images.permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8) + images = np.concatenate([original_images, images], axis=2) + images = [Image.fromarray(image) for image in images] + + # Log images + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, global_step, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: Original, Generated") for i, image in enumerate(images) + ] + }, + step=global_step, + ) + torch.cuda.empty_cache() + return images + + +def log_grad_norm(model, accelerator, global_step): + for name, param in model.named_parameters(): + if param.grad is not None: + grads = param.grad.detach().data + grad_norm = (grads.norm(p=2) / 
grads.numel()).item() + accelerator.log({"grad_norm/" + name: grad_norm}, step=global_step) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--log_grad_norm_steps", + type=int, + default=500, + help=("Print logs of gradient norms every X steps."), + ) + parser.add_argument( + "--log_steps", + type=int, + default=50, + help=("Print logs every X steps."), + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run validation every X steps. Validation consists of running reconstruction on images in" + " `args.validation_images` and logging the reconstructed images." + ), + ) + parser.add_argument( + "--vae_loss", + type=str, + default="l2", + help="The loss function for vae reconstruction loss.", + ) + parser.add_argument( + "--timm_model_offset", + type=int, + default=0, + help="Offset of timm layers to indices.", + ) + parser.add_argument( + "--timm_model_layers", + type=str, + default="head", + help="The layers to get output from in the timm model.", + ) + parser.add_argument( + "--timm_model_backend", + type=str, + default="vgg19", + help="Timm model used to get the lpips loss", + ) + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--model_config_name_or_path", + type=str, + default=None, + help="The config of the Vq model to train, leave as None to use standard Vq model configuration.", + ) + parser.add_argument( + "--discriminator_config_name_or_path", + type=str, + default=None, + help="The config of the discriminator model to train, leave as None to use standard Vq model configuration.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help=( + "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," + " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," + " or to a folder containing files that 🤗 Datasets can understand." + ), + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The config of the Dataset, leave as None if there's only one config.", + ) + parser.add_argument( + "--train_data_dir", + type=str, + default=None, + help=( + "A folder containing the training data. Folder contents must follow the structure described in" + " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" + " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." + ), + ) + parser.add_argument( + "--image_column", type=str, default="image", help="The column of the dataset containing an image." + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." 
+ ), + ) + parser.add_argument( + "--validation_images", + type=str, + default=None, + nargs="+", + help=("A set of validation images evaluated every `--validation_steps` and logged to `--report_to`."), + ) + parser.add_argument( + "--output_dir", + type=str, + default="vqgan-output", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=( + "Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. The images will be resized to the resolution first before cropping." + ), + ) + parser.add_argument( + "--random_flip", + action="store_true", + help="whether to randomly flip images horizontally", + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument("--num_train_epochs", type=int, default=100) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--discr_learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--discr_lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. 
For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") + parser.add_argument( + "--non_ema_revision", + type=str, + default=None, + required=False, + help=( + "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or" + " remote repository specified with --pretrained_model_name_or_path." + ), + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--prediction_type", + type=str, + default=None, + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", + ) + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=("Max number of checkpoints to store."), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. 
Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." + ) + parser.add_argument( + "--tracker_project_name", + type=str, + default="vqgan-training", + help=( + "The `project_name` argument passed to Accelerator.init_trackers for" + " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator" + ), + ) + + args = parser.parse_args() + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + # Sanity checks + if args.dataset_name is None and args.train_data_dir is None: + raise ValueError("Need either a dataset name or a training folder.") + + # default to using the same revision for the non-ema model if not specified + if args.non_ema_revision is None: + args.non_ema_revision = args.revision + + return args + + +def main(): + ######################### + # SETUP Accelerator # + ######################### + args = parse_args() + + # Enable TF32 on Ampere GPUs + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.benchmark = True + torch.backends.cudnn.deterministic = False + + logging_dir = os.path.join(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + ) + + if accelerator.distributed_type == DistributedType.DEEPSPEED: + accelerator.state.deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = args.train_batch_size + + ##################################### + # SETUP LOGGING, SEED and CONFIG # + ##################################### + + if accelerator.is_main_process: + tracker_config = dict(vars(args)) + tracker_config.pop("validation_images") + accelerator.init_trackers(args.tracker_project_name, tracker_config) + + # If passed along, set the training seed now. 
+ if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if args.push_to_hub: + create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token + ).repo_id + + ######################### + # MODELS and OPTIMIZER # + ######################### + logger.info("Loading models and optimizer") + + if args.model_config_name_or_path is None and args.pretrained_model_name_or_path is None: + # Taken from config of movq at kandinsky-community/kandinsky-2-2-decoder but without the attention layers + model = VQModel( + act_fn="silu", + block_out_channels=[ + 128, + 256, + 512, + ], + down_block_types=[ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + ], + in_channels=3, + latent_channels=4, + layers_per_block=2, + norm_num_groups=32, + norm_type="spatial", + num_vq_embeddings=16384, + out_channels=3, + sample_size=32, + scaling_factor=0.18215, + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], + vq_embed_dim=4, + ) + elif args.pretrained_model_name_or_path is not None: + model = VQModel.from_pretrained(args.pretrained_model_name_or_path) + else: + config = VQModel.load_config(args.model_config_name_or_path) + model = VQModel.from_config(config) + if args.use_ema: + ema_model = EMAModel(model.parameters(), model_cls=VQModel, model_config=model.config) + if args.discriminator_config_name_or_path is None: + discriminator = Discriminator() + else: + config = Discriminator.load_config(args.discriminator_config_name_or_path) + discriminator = Discriminator.from_config(config) + + idx = _map_layer_to_idx(args.timm_model_backend, args.timm_model_layers.split("|"), args.timm_model_offset) + + timm_model = timm.create_model( + args.timm_model_backend, + pretrained=True, + features_only=True, + exportable=True, + out_indices=idx, + ) + timm_model = timm_model.to(accelerator.device) + timm_model.requires_grad = False + timm_model.eval() + timm_transform = create_transform(**resolve_data_config(timm_model.pretrained_cfg, model=timm_model)) + try: + # Gets the resolution of the timm transformation after centercrop + timm_centercrop_transform = timm_transform.transforms[1] + assert isinstance( + timm_centercrop_transform, transforms.CenterCrop + ), f"Timm model {timm_model} is currently incompatible with this script. Try vgg19." + timm_model_resolution = timm_centercrop_transform.size[0] + # Gets final normalization + timm_model_normalization = timm_transform.transforms[-1] + assert isinstance( + timm_model_normalization, transforms.Normalize + ), f"Timm model {timm_model} is currently incompatible with this script. Try vgg19." 
+ except AssertionError as e: + raise NotImplementedError(e) + # Enable flash attention if asked + if args.enable_xformers_memory_efficient_attention: + model.enable_xformers_memory_efficient_attention() + + # `accelerate` 0.16.0 will have better support for customized saving + if version.parse(accelerate.__version__) >= version.parse("0.16.0"): + # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format + def save_model_hook(models, weights, output_dir): + if accelerator.is_main_process: + if args.use_ema: + ema_model.save_pretrained(os.path.join(output_dir, "vqmodel_ema")) + vqmodel = models[0] + discriminator = models[1] + vqmodel.save_pretrained(os.path.join(output_dir, "vqmodel")) + discriminator.save_pretrained(os.path.join(output_dir, "discriminator")) + weights.pop() + weights.pop() + + def load_model_hook(models, input_dir): + if args.use_ema: + load_model = EMAModel.from_pretrained(os.path.join(input_dir, "vqmodel_ema"), VQModel) + ema_model.load_state_dict(load_model.state_dict()) + ema_model.to(accelerator.device) + del load_model + discriminator = models.pop() + load_model = Discriminator.from_pretrained(input_dir, subfolder="discriminator") + discriminator.register_to_config(**load_model.config) + discriminator.load_state_dict(load_model.state_dict()) + del load_model + vqmodel = models.pop() + load_model = VQModel.from_pretrained(input_dir, subfolder="vqmodel") + vqmodel.register_to_config(**load_model.config) + vqmodel.load_state_dict(load_model.state_dict()) + del load_model + + accelerator.register_save_state_pre_hook(save_model_hook) + accelerator.register_load_state_pre_hook(load_model_hook) + + learning_rate = args.learning_rate + if args.scale_lr: + learning_rate = ( + learning_rate * args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + ) + + # Initialize the optimizer + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`" + ) + + optimizer_cls = bnb.optim.AdamW8bit + else: + optimizer_cls = torch.optim.AdamW + + optimizer = optimizer_cls( + list(model.parameters()), + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + discr_optimizer = optimizer_cls( + list(discriminator.parameters()), + lr=args.discr_learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + ################################## + # DATLOADER and LR-SCHEDULER # + ################################# + logger.info("Creating dataloaders and lr_scheduler") + + args.train_batch_size * accelerator.num_processes + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + # DataLoaders creation: + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            data_dir=args.train_data_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We only need to preprocess the input images; there is no text to tokenize.
+    column_names = dataset["train"].column_names
+
+    # Get the name of the image column.
+    assert args.image_column is not None
+    image_column = args.image_column
+    if image_column not in column_names:
+        raise ValueError(f"'--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}")
+    # Preprocessing the datasets.
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+        ]
+    )
+    validation_transform = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.ToTensor(),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        return examples
+
+    with accelerator.main_process_first():
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        return {"pixel_values": pixel_values}
+
+    # DataLoaders creation:
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        shuffle=True,
+        collate_fn=collate_fn,
+        batch_size=args.train_batch_size,
+        num_workers=args.dataloader_num_workers,
+    )
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.lr_warmup_steps,
+    )
+    discr_lr_scheduler = get_scheduler(
+        args.discr_lr_scheduler,
+        optimizer=discr_optimizer,
+        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.lr_warmup_steps,
+    )
+
+    # Prepare everything with accelerator
+    logger.info("Preparing model, optimizer and dataloaders")
+    # The dataloaders are already aware of distributed training, so we don't need to prepare them.
+    model, discriminator, optimizer, discr_optimizer, lr_scheduler, discr_lr_scheduler = accelerator.prepare(
+        model, discriminator, optimizer, discr_optimizer, lr_scheduler, discr_lr_scheduler
+    )
+    if args.use_ema:
+        ema_model.to(accelerator.device)
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # Potentially load in the weights and states from a previous save
+    resume_from_checkpoint = args.resume_from_checkpoint
+    if resume_from_checkpoint:
+        if resume_from_checkpoint != "latest":
+            path = resume_from_checkpoint
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+            if path is not None:
+                path = os.path.join(args.output_dir, path)
+
+        if path is None:
+            accelerator.print(f"Checkpoint '{resume_from_checkpoint}' does not exist. Starting a new training run.")
+            resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(path)
+            accelerator.wait_for_everyone()
+            global_step = int(os.path.basename(path).split("-")[1])
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    batch_time_m = AverageMeter()
+    data_time_m = AverageMeter()
+    end = time.time()
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+    # As stated above, we are not doing epoch-based training here, but just use epochs for bookkeeping so we can
+    # reuse the same training loop with other datasets/loaders.
+    avg_gen_loss, avg_discr_loss = None, None
+    for epoch in range(first_epoch, args.num_train_epochs):
+        model.train()
+        discriminator.train()
+        for i, batch in enumerate(train_dataloader):
+            pixel_values = batch["pixel_values"]
+            pixel_values = pixel_values.to(accelerator.device, non_blocking=True)
+            data_time_m.update(time.time() - end)
+            generator_step = ((i // args.gradient_accumulation_steps) % 2) == 0
+            # Train Step
+            # The behavior of accelerator.accumulate is to
+            # 1. Check if gradients should be synced (i.e. gradient_accumulation_steps has been reached)
+            # 2. If so, sync the gradients across processes; otherwise skip the synchronization
+            if generator_step:
+                optimizer.zero_grad(set_to_none=True)
+            else:
+                discr_optimizer.zero_grad(set_to_none=True)
+            # Encode images to the latent space and return the commitment loss from VQ tokenization
+            fmap, commit_loss = model(pixel_values, return_dict=False)
+
+            if generator_step:
+                with accelerator.accumulate(model):
+                    # Reconstruction loss: pixel-level differences between input and output
+                    if args.vae_loss == "l2":
+                        loss = F.mse_loss(pixel_values, fmap)
+                    else:
+                        loss = F.l1_loss(pixel_values, fmap)
+                    # perceptual loss.
The high level feature mean squared error loss + perceptual_loss = get_perceptual_loss( + pixel_values, + fmap, + timm_model, + timm_model_resolution=timm_model_resolution, + timm_model_normalization=timm_model_normalization, + ) + # generator loss + gen_loss = -discriminator(fmap).mean() + last_dec_layer = accelerator.unwrap_model(model).decoder.conv_out.weight + norm_grad_wrt_perceptual_loss = grad_layer_wrt_loss(perceptual_loss, last_dec_layer).norm(p=2) + norm_grad_wrt_gen_loss = grad_layer_wrt_loss(gen_loss, last_dec_layer).norm(p=2) + + adaptive_weight = norm_grad_wrt_perceptual_loss / norm_grad_wrt_gen_loss.clamp(min=1e-8) + adaptive_weight = adaptive_weight.clamp(max=1e4) + loss += commit_loss + loss += perceptual_loss + loss += adaptive_weight * gen_loss + # Gather the losses across all processes for logging (if we use distributed training). + avg_gen_loss = accelerator.gather(loss.repeat(args.train_batch_size)).float().mean() + accelerator.backward(loss) + + if args.max_grad_norm is not None and accelerator.sync_gradients: + accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + # log gradient norm before zeroing it + if ( + accelerator.sync_gradients + and global_step % args.log_grad_norm_steps == 0 + and accelerator.is_main_process + ): + log_grad_norm(model, accelerator, global_step) + else: + # Return discriminator loss + with accelerator.accumulate(discriminator): + fmap.detach_() + pixel_values.requires_grad_() + real = discriminator(pixel_values) + fake = discriminator(fmap) + loss = (F.relu(1 + fake) + F.relu(1 - real)).mean() + gp = gradient_penalty(pixel_values, real) + loss += gp + avg_discr_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() + accelerator.backward(loss) + + if args.max_grad_norm is not None and accelerator.sync_gradients: + accelerator.clip_grad_norm_(discriminator.parameters(), args.max_grad_norm) + + discr_optimizer.step() + discr_lr_scheduler.step() + if ( + accelerator.sync_gradients + and global_step % args.log_grad_norm_steps == 0 + and accelerator.is_main_process + ): + log_grad_norm(discriminator, accelerator, global_step) + batch_time_m.update(time.time() - end) + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + global_step += 1 + progress_bar.update(1) + if args.use_ema: + ema_model.step(model.parameters()) + if accelerator.sync_gradients and not generator_step and accelerator.is_main_process: + # wait for both generator and discriminator to settle + # Log metrics + if global_step % args.log_steps == 0: + samples_per_second_per_gpu = ( + args.gradient_accumulation_steps * args.train_batch_size / batch_time_m.val + ) + logs = { + "step_discr_loss": avg_discr_loss.item(), + "lr": lr_scheduler.get_last_lr()[0], + "samples/sec/gpu": samples_per_second_per_gpu, + "data_time": data_time_m.val, + "batch_time": batch_time_m.val, + } + if avg_gen_loss is not None: + logs["step_gen_loss"] = avg_gen_loss.item() + accelerator.log(logs, step=global_step) + + # resetting batch / data time meters per log window + batch_time_m.reset() + data_time_m.reset() + # Save model checkpoint + if global_step % args.checkpointing_steps == 0: + if accelerator.is_main_process: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + 
checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + accelerator.save_state(save_path) + logger.info(f"Saved state to {save_path}") + + # Generate images + if global_step % args.validation_steps == 0: + if args.use_ema: + # Store the VQGAN parameters temporarily and load the EMA parameters to perform inference. + ema_model.store(model.parameters()) + ema_model.copy_to(model.parameters()) + log_validation(model, args, validation_transform, accelerator, global_step) + if args.use_ema: + # Switch back to the original VQGAN parameters. + ema_model.restore(model.parameters()) + end = time.time() + # Stop training if max steps is reached + if global_step >= args.max_train_steps: + break + # End for + + accelerator.wait_for_everyone() + + # Save the final trained checkpoint + if accelerator.is_main_process: + model = accelerator.unwrap_model(model) + discriminator = accelerator.unwrap_model(discriminator) + if args.use_ema: + ema_model.copy_to(model.parameters()) + model.save_pretrained(os.path.join(args.output_dir, "vqmodel")) + discriminator.save_pretrained(os.path.join(args.output_dir, "discriminator")) + + accelerator.end_training() + + +if __name__ == "__main__": + main() diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 41ee8b914774..1ac9ceddb57e 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -17,109 +17,99 @@ def load_notes_encoder(weights, model): - model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) - model.position_encoding.weight = nn.Parameter( - torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False - ) + model.token_embedder.weight = nn.Parameter(torch.Tensor(weights["token_embedder"]["embedding"])) + model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False) for lyr_num, lyr in enumerate(model.encoders): ly_weight = weights[f"layers_{lyr_num}"] - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) + lyr.layer[0].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_attention_layer_norm"]["scale"])) attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].SelfAttention.q.weight = 
nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T)) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + model.layer_norm.weight = nn.Parameter(torch.Tensor(weights["encoder_norm"]["scale"])) return model def load_continuous_encoder(weights, model): - model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) + model.input_proj.weight = nn.Parameter(torch.Tensor(weights["input_proj"]["kernel"].T)) - model.position_encoding.weight = nn.Parameter( - torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False - ) + model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False) for lyr_num, lyr in enumerate(model.encoders): ly_weight = weights[f"layers_{lyr_num}"] attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_attention_layer_norm"]["scale"])) - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = 
nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + model.layer_norm.weight = nn.Parameter(torch.Tensor(weights["encoder_norm"]["scale"])) return model def load_decoder(weights, model): - model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) - model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) + model.conditioning_emb[0].weight = nn.Parameter(torch.Tensor(weights["time_emb_dense0"]["kernel"].T)) + model.conditioning_emb[2].weight = nn.Parameter(torch.Tensor(weights["time_emb_dense1"]["kernel"].T)) - model.position_encoding.weight = nn.Parameter( - torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False - ) + model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False) model.continuous_inputs_projection.weight = nn.Parameter( - torch.FloatTensor(weights["continuous_inputs_projection"]["kernel"].T) + torch.Tensor(weights["continuous_inputs_projection"]["kernel"].T) ) for lyr_num, lyr in enumerate(model.decoders): ly_weight = weights[f"layers_{lyr_num}"] lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) + torch.Tensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) ) lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( - torch.FloatTensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) + torch.Tensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) ) attention_weights = ly_weight["self_attention"] - lyr.layer[0].attention.to_q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[0].attention.to_k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].attention.to_v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[0].attention.to_out[0].weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].attention.to_q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].attention.to_k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].attention.to_v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].attention.to_out[0].weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) attention_weights = ly_weight["MultiHeadDotProductAttention_0"] - lyr.layer[1].attention.to_q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[1].attention.to_k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[1].attention.to_v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[1].attention.to_out[0].weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[1].attention.to_q.weight = 
nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[1].attention.to_k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[1].attention.to_v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[1].attention.to_out[0].weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) + torch.Tensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) ) - lyr.layer[2].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[2].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"])) lyr.layer[2].film.scale_bias.weight = nn.Parameter( - torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) + torch.Tensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) ) - lyr.layer[2].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[2].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[2].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T)) - model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) + model.decoder_norm.weight = nn.Parameter(torch.Tensor(weights["decoder_norm"]["scale"])) - model.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) + model.spec_out.weight = nn.Parameter(torch.Tensor(weights["spec_out_dense"]["kernel"].T)) return model diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index f7fa82157c52..66c98804eadc 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -27,6 +27,7 @@ _import_structure = { "configuration_utils": ["ConfigMixin"], + "loaders": ["FromOriginalModelMixin"], "models": [], "pipelines": [], "schedulers": [], diff --git a/src/diffusers/callbacks.py b/src/diffusers/callbacks.py new file mode 100644 index 000000000000..38542407e31f --- /dev/null +++ b/src/diffusers/callbacks.py @@ -0,0 +1,156 @@ +from typing import Any, Dict, List + +from .configuration_utils import ConfigMixin, register_to_config +from .utils import CONFIG_NAME + + +class PipelineCallback(ConfigMixin): + """ + Base class for all the official callbacks used in a pipeline. This class provides a structure for implementing + custom callbacks and ensures that all callbacks have a consistent interface. + + Please implement the following: + `tensor_inputs`: This should return a list of tensor inputs specific to your callback. You will only be able to + include + variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. + `callback_fn`: This method defines the core functionality of your callback. 
+ """ + + config_name = CONFIG_NAME + + @register_to_config + def __init__(self, cutoff_step_ratio=1.0, cutoff_step_index=None): + super().__init__() + + if (cutoff_step_ratio is None and cutoff_step_index is None) or ( + cutoff_step_ratio is not None and cutoff_step_index is not None + ): + raise ValueError("Either cutoff_step_ratio or cutoff_step_index should be provided, not both or none.") + + if cutoff_step_ratio is not None and ( + not isinstance(cutoff_step_ratio, float) or not (0.0 <= cutoff_step_ratio <= 1.0) + ): + raise ValueError("cutoff_step_ratio must be a float between 0.0 and 1.0.") + + @property + def tensor_inputs(self) -> List[str]: + raise NotImplementedError(f"You need to set the attribute `tensor_inputs` for {self.__class__}") + + def callback_fn(self, pipeline, step_index, timesteps, callback_kwargs) -> Dict[str, Any]: + raise NotImplementedError(f"You need to implement the method `callback_fn` for {self.__class__}") + + def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]: + return self.callback_fn(pipeline, step_index, timestep, callback_kwargs) + + +class MultiPipelineCallbacks: + """ + This class is designed to handle multiple pipeline callbacks. It accepts a list of PipelineCallback objects and + provides a unified interface for calling all of them. + """ + + def __init__(self, callbacks: List[PipelineCallback]): + self.callbacks = callbacks + + @property + def tensor_inputs(self) -> List[str]: + return [input for callback in self.callbacks for input in callback.tensor_inputs] + + def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]: + """ + Calls all the callbacks in order with the given arguments and returns the final callback_kwargs. + """ + for callback in self.callbacks: + callback_kwargs = callback(pipeline, step_index, timestep, callback_kwargs) + + return callback_kwargs + + +class SDCFGCutoffCallback(PipelineCallback): + """ + Callback function for Stable Diffusion Pipelines. After certain number of steps (set by `cutoff_step_ratio` or + `cutoff_step_index`), this callback will disable the CFG. + + Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 after the cutoff step. + """ + + tensor_inputs = ["prompt_embeds"] + + def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]: + cutoff_step_ratio = self.config.cutoff_step_ratio + cutoff_step_index = self.config.cutoff_step_index + + # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio + cutoff_step = ( + cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio) + ) + + if step_index == cutoff_step: + prompt_embeds = callback_kwargs[self.tensor_inputs[0]] + prompt_embeds = prompt_embeds[-1:] # "-1" denotes the embeddings for conditional text tokens. + + pipeline._guidance_scale = 0.0 + + callback_kwargs[self.tensor_inputs[0]] = prompt_embeds + return callback_kwargs + + +class SDXLCFGCutoffCallback(PipelineCallback): + """ + Callback function for Stable Diffusion XL Pipelines. After certain number of steps (set by `cutoff_step_ratio` or + `cutoff_step_index`), this callback will disable the CFG. + + Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 after the cutoff step. 
+ """ + + tensor_inputs = ["prompt_embeds", "add_text_embeds", "add_time_ids"] + + def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]: + cutoff_step_ratio = self.config.cutoff_step_ratio + cutoff_step_index = self.config.cutoff_step_index + + # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio + cutoff_step = ( + cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio) + ) + + if step_index == cutoff_step: + prompt_embeds = callback_kwargs[self.tensor_inputs[0]] + prompt_embeds = prompt_embeds[-1:] # "-1" denotes the embeddings for conditional text tokens. + + add_text_embeds = callback_kwargs[self.tensor_inputs[1]] + add_text_embeds = add_text_embeds[-1:] # "-1" denotes the embeddings for conditional pooled text tokens + + add_time_ids = callback_kwargs[self.tensor_inputs[2]] + add_time_ids = add_time_ids[-1:] # "-1" denotes the embeddings for conditional added time vector + + pipeline._guidance_scale = 0.0 + + callback_kwargs[self.tensor_inputs[0]] = prompt_embeds + callback_kwargs[self.tensor_inputs[1]] = add_text_embeds + callback_kwargs[self.tensor_inputs[2]] = add_time_ids + return callback_kwargs + + +class IPAdapterScaleCutoffCallback(PipelineCallback): + """ + Callback function for any pipeline that inherits `IPAdapterMixin`. After certain number of steps (set by + `cutoff_step_ratio` or `cutoff_step_index`), this callback will set the IP Adapter scale to `0.0`. + + Note: This callback mutates the IP Adapter attention processors by setting the scale to 0.0 after the cutoff step. + """ + + tensor_inputs = [] + + def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]: + cutoff_step_ratio = self.config.cutoff_step_ratio + cutoff_step_index = self.config.cutoff_step_index + + # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio + cutoff_step = ( + cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio) + ) + + if step_index == cutoff_step: + pipeline.set_ip_adapter_scale(0.0) + return callback_kwargs diff --git a/src/diffusers/commands/env.py b/src/diffusers/commands/env.py index baa69b361f5d..a828216670ce 100644 --- a/src/diffusers/commands/env.py +++ b/src/diffusers/commands/env.py @@ -13,12 +13,25 @@ # limitations under the License. import platform +import subprocess from argparse import ArgumentParser import huggingface_hub from .. import __version__ as version -from ..utils import is_accelerate_available, is_torch_available, is_transformers_available, is_xformers_available +from ..utils import ( + is_accelerate_available, + is_bitsandbytes_available, + is_flax_available, + is_google_colab, + is_notebook, + is_peft_available, + is_safetensors_available, + is_torch_available, + is_transformers_available, + is_xformers_available, +) +from ..utils.testing_utils import get_python_version from . 
import BaseDiffusersCLICommand @@ -28,13 +41,19 @@ def info_command_factory(_): class EnvironmentCommand(BaseDiffusersCLICommand): @staticmethod - def register_subcommand(parser: ArgumentParser): + def register_subcommand(parser: ArgumentParser) -> None: download_parser = parser.add_parser("env") download_parser.set_defaults(func=info_command_factory) - def run(self): + def run(self) -> dict: hub_version = huggingface_hub.__version__ + safetensors_version = "not installed" + if is_safetensors_available(): + import safetensors + + safetensors_version = safetensors.__version__ + pt_version = "not installed" pt_cuda_available = "NA" if is_torch_available(): @@ -43,6 +62,20 @@ def run(self): pt_version = torch.__version__ pt_cuda_available = torch.cuda.is_available() + flax_version = "not installed" + jax_version = "not installed" + jaxlib_version = "not installed" + jax_backend = "NA" + if is_flax_available(): + import flax + import jax + import jaxlib + + flax_version = flax.__version__ + jax_version = jax.__version__ + jaxlib_version = jaxlib.__version__ + jax_backend = jax.lib.xla_bridge.get_backend().platform + transformers_version = "not installed" if is_transformers_available(): import transformers @@ -55,21 +88,92 @@ def run(self): accelerate_version = accelerate.__version__ + peft_version = "not installed" + if is_peft_available(): + import peft + + peft_version = peft.__version__ + + bitsandbytes_version = "not installed" + if is_bitsandbytes_available(): + import bitsandbytes + + bitsandbytes_version = bitsandbytes.__version__ + xformers_version = "not installed" if is_xformers_available(): import xformers xformers_version = xformers.__version__ + if get_python_version() >= (3, 10): + platform_info = f"{platform.freedesktop_os_release().get('PRETTY_NAME', None)} - {platform.platform()}" + else: + platform_info = platform.platform() + + is_notebook_str = "Yes" if is_notebook() else "No" + + is_google_colab_str = "Yes" if is_google_colab() else "No" + + accelerator = "NA" + if platform.system() in {"Linux", "Windows"}: + try: + sp = subprocess.Popen( + ["nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv,noheader"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out_str, _ = sp.communicate() + out_str = out_str.decode("utf-8") + + if len(out_str) > 0: + accelerator = out_str.strip() + " VRAM" + except FileNotFoundError: + pass + elif platform.system() == "Darwin": # Mac OS + try: + sp = subprocess.Popen( + ["system_profiler", "SPDisplaysDataType"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out_str, _ = sp.communicate() + out_str = out_str.decode("utf-8") + + start = out_str.find("Chipset Model:") + if start != -1: + start += len("Chipset Model:") + end = out_str.find("\n", start) + accelerator = out_str[start:end].strip() + + start = out_str.find("VRAM (Total):") + if start != -1: + start += len("VRAM (Total):") + end = out_str.find("\n", start) + accelerator += " VRAM: " + out_str[start:end].strip() + except FileNotFoundError: + pass + else: + print("It seems you are running an unusual OS. 
Could you fill in the accelerator manually?") + info = { - "`diffusers` version": version, - "Platform": platform.platform(), + "🤗 Diffusers version": version, + "Platform": platform_info, + "Running on a notebook?": is_notebook_str, + "Running on Google Colab?": is_google_colab_str, "Python version": platform.python_version(), "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})", + "Flax version (CPU?/GPU?/TPU?)": f"{flax_version} ({jax_backend})", + "Jax version": jax_version, + "JaxLib version": jaxlib_version, "Huggingface_hub version": hub_version, "Transformers version": transformers_version, "Accelerate version": accelerate_version, + "PEFT version": peft_version, + "Bitsandbytes version": bitsandbytes_version, + "Safetensors version": safetensors_version, "xFormers version": xformers_version, + "Accelerator": accelerator, "Using GPU in script?": "", "Using distributed or parallel set-up in script?": "", } @@ -80,5 +184,5 @@ def run(self): return info @staticmethod - def format_dict(d): + def format_dict(d: dict) -> str: return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n" diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 7d76687a3d1e..d601ce2356b1 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -340,6 +340,8 @@ def load_config( """ cache_dir = kwargs.pop("cache_dir", None) + local_dir = kwargs.pop("local_dir", None) + local_dir_use_symlinks = kwargs.pop("local_dir_use_symlinks", "auto") force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) @@ -364,13 +366,13 @@ def load_config( if os.path.isfile(pretrained_model_name_or_path): config_file = pretrained_model_name_or_path elif os.path.isdir(pretrained_model_name_or_path): - if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)): - # Load from a PyTorch checkpoint - config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) - elif subfolder is not None and os.path.isfile( + if subfolder is not None and os.path.isfile( os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) ): config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) + elif os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)): + # Load from a PyTorch checkpoint + config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) else: raise EnvironmentError( f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}." 
@@ -390,6 +392,8 @@ def load_config( user_agent=user_agent, subfolder=subfolder, revision=revision, + local_dir=local_dir, + local_dir_use_symlinks=local_dir_use_symlinks, ) except RepositoryNotFoundError: raise EnvironmentError( diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 0f4481570829..4d8d73c863ae 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -29,15 +29,34 @@ PipelineImageInput = Union[ PIL.Image.Image, np.ndarray, - torch.FloatTensor, + torch.Tensor, List[PIL.Image.Image], List[np.ndarray], - List[torch.FloatTensor], + List[torch.Tensor], ] PipelineDepthInput = PipelineImageInput +def is_valid_image(image): + return isinstance(image, PIL.Image.Image) or isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim in (2, 3) + + +def is_valid_image_imagelist(images): + # check if the image input is one of the supported formats for image and image list: + # it can be either one of below 3 + # (1) a 4d pytorch tensor or numpy array, + # (2) a valid image: PIL.Image.Image, 2-d np.ndarray or torch.Tensor (grayscale image), 3-d np.ndarray or torch.Tensor + # (3) a list of valid image + if isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4: + return True + elif is_valid_image(images): + return True + elif isinstance(images, list): + return all(is_valid_image(image) for image in images) + return False + + class VaeImageProcessor(ConfigMixin): """ Image processor for VAE. @@ -110,7 +129,7 @@ def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.nd return images @staticmethod - def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: + def numpy_to_pt(images: np.ndarray) -> torch.Tensor: """ Convert a NumPy image to a PyTorch tensor. """ @@ -121,7 +140,7 @@ def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: return images @staticmethod - def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: + def pt_to_numpy(images: torch.Tensor) -> np.ndarray: """ Convert a PyTorch tensor to a NumPy image. """ @@ -497,12 +516,27 @@ def preprocess( else: image = np.expand_dims(image, axis=-1) - if isinstance(image, supported_formats): - image = [image] - elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): + if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4: + warnings.warn( + "Passing `image` as a list of 4d np.ndarray is deprecated." + "Please concatenate the list along the batch dimension and pass it as a single 4d np.ndarray", + FutureWarning, + ) + image = np.concatenate(image, axis=0) + if isinstance(image, list) and isinstance(image[0], torch.Tensor) and image[0].ndim == 4: + warnings.warn( + "Passing `image` as a list of 4d torch.Tensor is deprecated." + "Please concatenate the list along the batch dimension and pass it as a single 4d torch.Tensor", + FutureWarning, + ) + image = torch.cat(image, axis=0) + + if not is_valid_image_imagelist(image): raise ValueError( - f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support {', '.join(supported_formats)}" + f"Input is in incorrect format. 
Currently, we only support {', '.join(supported_formats)}" ) + if not isinstance(image, list): + image = [image] if isinstance(image[0], PIL.Image.Image): if crops_coords is not None: @@ -561,15 +595,15 @@ def preprocess( def postprocess( self, - image: torch.FloatTensor, + image: torch.Tensor, output_type: str = "pil", do_denormalize: Optional[List[bool]] = None, - ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]: + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: """ Postprocess the image output from tensor to `output_type`. Args: - image (`torch.FloatTensor`): + image (`torch.Tensor`): The image input, should be a pytorch tensor with shape `B x C x H x W`. output_type (`str`, *optional*, defaults to `pil`): The output type of the image, can be one of `pil`, `np`, `pt`, `latent`. @@ -578,7 +612,7 @@ def postprocess( `VaeImageProcessor` config. Returns: - `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`: + `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`: The postprocessed image. """ if not isinstance(image, torch.Tensor): @@ -738,15 +772,15 @@ def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]: def postprocess( self, - image: torch.FloatTensor, + image: torch.Tensor, output_type: str = "pil", do_denormalize: Optional[List[bool]] = None, - ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]: + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: """ Postprocess the image output from tensor to `output_type`. Args: - image (`torch.FloatTensor`): + image (`torch.Tensor`): The image input, should be a pytorch tensor with shape `B x C x H x W`. output_type (`str`, *optional*, defaults to `pil`): The output type of the image, can be one of `pil`, `np`, `pt`, `latent`. @@ -755,7 +789,7 @@ def postprocess( `VaeImageProcessor` config. Returns: - `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`: + `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`: The postprocessed image. """ if not isinstance(image, torch.Tensor): @@ -793,8 +827,8 @@ def postprocess( def preprocess( self, - rgb: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - depth: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + rgb: Union[torch.Tensor, PIL.Image.Image, np.ndarray], + depth: Union[torch.Tensor, PIL.Image.Image, np.ndarray], height: Optional[int] = None, width: Optional[int] = None, target_res: Optional[int] = None, @@ -933,13 +967,13 @@ def __init__( ) @staticmethod - def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value_embed_dim: int): + def downsample(mask: torch.Tensor, batch_size: int, num_queries: int, value_embed_dim: int): """ Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued. Args: - mask (`torch.FloatTensor`): + mask (`torch.Tensor`): The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`. batch_size (`int`): The batch size. @@ -949,7 +983,7 @@ def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value The dimensionality of the value embeddings. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The downsampled mask tensor. 
""" diff --git a/src/diffusers/loaders/__init__.py b/src/diffusers/loaders/__init__.py index 4da047435d8e..75c8b0eb820b 100644 --- a/src/diffusers/loaders/__init__.py +++ b/src/diffusers/loaders/__init__.py @@ -54,9 +54,7 @@ def text_encoder_attn_modules(text_encoder): _import_structure = {} if is_torch_available(): - _import_structure["autoencoder"] = ["FromOriginalVAEMixin"] - - _import_structure["controlnet"] = ["FromOriginalControlNetMixin"] + _import_structure["single_file_model"] = ["FromOriginalModelMixin"] _import_structure["unet"] = ["UNet2DConditionLoadersMixin"] _import_structure["utils"] = ["AttnProcsLayers"] if is_transformers_available(): @@ -70,8 +68,7 @@ def text_encoder_attn_modules(text_encoder): if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: if is_torch_available(): - from .autoencoder import FromOriginalVAEMixin - from .controlnet import FromOriginalControlNetMixin + from .single_file_model import FromOriginalModelMixin from .unet import UNet2DConditionLoadersMixin from .utils import AttnProcsLayers diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index 24be4999de27..e089d202ee75 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -363,7 +363,7 @@ def _optionally_disable_offloading(cls, _pipeline): is_model_cpu_offload = False is_sequential_cpu_offload = False - if _pipeline is not None: + if _pipeline is not None and _pipeline.hf_device_map is None: for _, component in _pipeline.components.items(): if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"): if not is_model_cpu_offload: @@ -1294,7 +1294,7 @@ def set_lora_device(self, adapter_names: List[str], device: Union[torch.device, text_encoder_module.lora_B[adapter_name].to(device) # this is a param, not a module, so device placement is not in-place -> re-assign if ( - hasattr(text_encoder, "lora_magnitude_vector") + hasattr(text_encoder_module, "lora_magnitude_vector") and text_encoder_module.lora_magnitude_vector is not None ): text_encoder_module.lora_magnitude_vector[ diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py index d8ff92d0b0ff..32b458ddd591 100644 --- a/src/diffusers/loaders/single_file.py +++ b/src/diffusers/loaders/single_file.py @@ -11,144 +11,243 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import importlib +import inspect +import os -from huggingface_hub.utils import validate_hf_hub_args +import torch +from huggingface_hub import snapshot_download +from huggingface_hub.utils import LocalEntryNotFoundError, validate_hf_hub_args +from packaging import version -from ..utils import is_transformers_available, logging +from ..utils import deprecate, is_transformers_available, logging from .single_file_utils import ( - create_diffusers_unet_model_from_ldm, - create_diffusers_vae_model_from_ldm, - create_scheduler_from_ldm, - create_text_encoders_and_tokenizers_from_ldm, - fetch_ldm_config_and_checkpoint, - infer_model_type, + SingleFileComponentError, + _is_model_weights_in_cached_folder, + _legacy_load_clip_tokenizer, + _legacy_load_safety_checker, + _legacy_load_scheduler, + create_diffusers_clip_model_from_ldm, + fetch_diffusers_config, + fetch_original_config, + is_clip_model_in_single_file, + load_single_file_checkpoint, ) logger = logging.get_logger(__name__) -# Pipelines that support the SDXL Refiner checkpoint -REFINER_PIPELINES = [ - "StableDiffusionXLImg2ImgPipeline", - "StableDiffusionXLInpaintPipeline", - "StableDiffusionXLControlNetImg2ImgPipeline", -] +# Legacy behaviour. `from_single_file` does not load the safety checker unless explicitly provided +SINGLE_FILE_OPTIONAL_COMPONENTS = ["safety_checker"] + if is_transformers_available(): - from transformers import AutoFeatureExtractor + import transformers + from transformers import PreTrainedModel, PreTrainedTokenizer -def build_sub_model_components( - pipeline_components, - pipeline_class_name, - component_name, - original_config, +def load_single_file_sub_model( + library_name, + class_name, + name, checkpoint, + pipelines, + is_pipeline_module, + cached_model_config_path, + original_config=None, local_files_only=False, - load_safety_checker=False, - model_type=None, - image_size=None, torch_dtype=None, + is_legacy_loading=False, **kwargs, ): - if component_name in pipeline_components: - return {} - - if component_name == "unet": - num_in_channels = kwargs.pop("num_in_channels", None) - upcast_attention = kwargs.pop("upcast_attention", None) - - unet_components = create_diffusers_unet_model_from_ldm( - pipeline_class_name, - original_config, - checkpoint, - num_in_channels=num_in_channels, - image_size=image_size, + if is_pipeline_module: + pipeline_module = getattr(pipelines, library_name) + class_obj = getattr(pipeline_module, class_name) + else: + # else we just import it from the library. 
+ library = importlib.import_module(library_name) + class_obj = getattr(library, class_name) + + if is_transformers_available(): + transformers_version = version.parse(version.parse(transformers.__version__).base_version) + else: + transformers_version = "N/A" + + is_transformers_model = ( + is_transformers_available() + and issubclass(class_obj, PreTrainedModel) + and transformers_version >= version.parse("4.20.0") + ) + is_tokenizer = ( + is_transformers_available() + and issubclass(class_obj, PreTrainedTokenizer) + and transformers_version >= version.parse("4.20.0") + ) + + diffusers_module = importlib.import_module(__name__.split(".")[0]) + is_diffusers_single_file_model = issubclass(class_obj, diffusers_module.FromOriginalModelMixin) + is_diffusers_model = issubclass(class_obj, diffusers_module.ModelMixin) + is_diffusers_scheduler = issubclass(class_obj, diffusers_module.SchedulerMixin) + + if is_diffusers_single_file_model: + load_method = getattr(class_obj, "from_single_file") + + # We cannot provide two different config options to the `from_single_file` method + # Here we have to ignore loading the config from `cached_model_config_path` if `original_config` is provided + if original_config: + cached_model_config_path = None + + loaded_sub_model = load_method( + pretrained_model_link_or_path_or_dict=checkpoint, + original_config=original_config, + config=cached_model_config_path, + subfolder=name, torch_dtype=torch_dtype, - model_type=model_type, - upcast_attention=upcast_attention, + local_files_only=local_files_only, + **kwargs, ) - return unet_components - if component_name == "vae": - scaling_factor = kwargs.get("scaling_factor", None) - vae_components = create_diffusers_vae_model_from_ldm( - pipeline_class_name, - original_config, - checkpoint, - image_size, - scaling_factor, - torch_dtype, - model_type=model_type, + elif is_transformers_model and is_clip_model_in_single_file(class_obj, checkpoint): + loaded_sub_model = create_diffusers_clip_model_from_ldm( + class_obj, + checkpoint=checkpoint, + config=cached_model_config_path, + subfolder=name, + torch_dtype=torch_dtype, + local_files_only=local_files_only, + is_legacy_loading=is_legacy_loading, ) - return vae_components - - if component_name == "scheduler": - scheduler_type = kwargs.get("scheduler_type", "ddim") - prediction_type = kwargs.get("prediction_type", None) - - scheduler_components = create_scheduler_from_ldm( - pipeline_class_name, - original_config, - checkpoint, - scheduler_type=scheduler_type, - prediction_type=prediction_type, - model_type=model_type, + + elif is_tokenizer and is_legacy_loading: + loaded_sub_model = _legacy_load_clip_tokenizer( + class_obj, checkpoint=checkpoint, config=cached_model_config_path, local_files_only=local_files_only ) - return scheduler_components + elif is_diffusers_scheduler and is_legacy_loading: + loaded_sub_model = _legacy_load_scheduler( + class_obj, checkpoint=checkpoint, component_name=name, original_config=original_config, **kwargs + ) - if component_name in ["text_encoder", "text_encoder_2", "tokenizer", "tokenizer_2"]: - text_encoder_components = create_text_encoders_and_tokenizers_from_ldm( - original_config, - checkpoint, - model_type=model_type, - local_files_only=local_files_only, - torch_dtype=torch_dtype, + else: + if not hasattr(class_obj, "from_pretrained"): + raise ValueError( + ( + f"The component {class_obj.__name__} cannot be loaded as it does not seem to have" + " a supported loading method." 
+ ) + ) + + loading_kwargs = {} + loading_kwargs.update( + { + "pretrained_model_name_or_path": cached_model_config_path, + "subfolder": name, + "local_files_only": local_files_only, + } ) - return text_encoder_components - if component_name == "safety_checker": - if load_safety_checker: - from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker + # Schedulers and Tokenizers don't make use of torch_dtype + # Skip passing it to those objects + if issubclass(class_obj, torch.nn.Module): + loading_kwargs.update({"torch_dtype": torch_dtype}) - safety_checker = StableDiffusionSafetyChecker.from_pretrained( - "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only, torch_dtype=torch_dtype - ) - else: - safety_checker = None - return {"safety_checker": safety_checker} + if is_diffusers_model or is_transformers_model: + if not _is_model_weights_in_cached_folder(cached_model_config_path, name): + raise SingleFileComponentError( + f"Failed to load {class_name}. Weights for this component appear to be missing in the checkpoint." + ) - if component_name == "feature_extractor": - if load_safety_checker: - feature_extractor = AutoFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only - ) - else: - feature_extractor = None - return {"feature_extractor": feature_extractor} + load_method = getattr(class_obj, "from_pretrained") + loaded_sub_model = load_method(**loading_kwargs) - return + return loaded_sub_model -def set_additional_components( - pipeline_class_name, - original_config, - checkpoint=None, - model_type=None, -): - components = {} - if pipeline_class_name in REFINER_PIPELINES: - model_type = infer_model_type(original_config, checkpoint=checkpoint, model_type=model_type) - is_refiner = model_type == "SDXL-Refiner" - components.update( - { - "requires_aesthetics_score": is_refiner, - "force_zeros_for_empty_prompt": False if is_refiner else True, - } +def _map_component_types_to_config_dict(component_types): + diffusers_module = importlib.import_module(__name__.split(".")[0]) + config_dict = {} + component_types.pop("self", None) + + if is_transformers_available(): + transformers_version = version.parse(version.parse(transformers.__version__).base_version) + else: + transformers_version = "N/A" + + for component_name, component_value in component_types.items(): + is_diffusers_model = issubclass(component_value[0], diffusers_module.ModelMixin) + is_scheduler_enum = component_value[0].__name__ == "KarrasDiffusionSchedulers" + is_scheduler = issubclass(component_value[0], diffusers_module.SchedulerMixin) + + is_transformers_model = ( + is_transformers_available() + and issubclass(component_value[0], PreTrainedModel) + and transformers_version >= version.parse("4.20.0") ) + is_transformers_tokenizer = ( + is_transformers_available() + and issubclass(component_value[0], PreTrainedTokenizer) + and transformers_version >= version.parse("4.20.0") + ) + + if is_diffusers_model and component_name not in SINGLE_FILE_OPTIONAL_COMPONENTS: + config_dict[component_name] = ["diffusers", component_value[0].__name__] + + elif is_scheduler_enum or is_scheduler: + if is_scheduler_enum: + # Since we cannot fetch a scheduler config from the hub, we default to DDIMScheduler + # if the type hint is a KarrassDiffusionSchedulers enum + config_dict[component_name] = ["diffusers", "DDIMScheduler"] + + elif is_scheduler: + config_dict[component_name] = ["diffusers", component_value[0].__name__] + + elif ( + 
is_transformers_model or is_transformers_tokenizer + ) and component_name not in SINGLE_FILE_OPTIONAL_COMPONENTS: + config_dict[component_name] = ["transformers", component_value[0].__name__] - return components + else: + config_dict[component_name] = [None, None] + + return config_dict + + +def _infer_pipeline_config_dict(pipeline_class): + parameters = inspect.signature(pipeline_class.__init__).parameters + required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} + component_types = pipeline_class._get_signature_types() + + # Ignore parameters that are not required for the pipeline + component_types = {k: v for k, v in component_types.items() if k in required_parameters} + config_dict = _map_component_types_to_config_dict(component_types) + + return config_dict + + +def _download_diffusers_model_config_from_hub( + pretrained_model_name_or_path, + cache_dir, + revision, + proxies, + force_download=None, + resume_download=None, + local_files_only=None, + token=None, +): + allow_patterns = ["**/*.json", "*.json", "*.txt", "**/*.txt"] + cached_model_path = snapshot_download( + pretrained_model_name_or_path, + cache_dir=cache_dir, + revision=revision, + proxies=proxies, + force_download=force_download, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + allow_patterns=allow_patterns, + ) + + return cached_model_path class FromSingleFileMixin: @@ -195,27 +294,12 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): original_config_file (`str`, *optional*): The path to the original config file that was used to train the model. If not provided, the config file will be inferred from the checkpoint file. - model_type (`str`, *optional*): - The type of model to load. If not provided, the model type will be inferred from the checkpoint file. - image_size (`int`, *optional*): - The size of the image output. It's used to configure the `sample_size` parameter of the UNet and VAE - model. - load_safety_checker (`bool`, *optional*, defaults to `False`): - Whether to load the safety checker model or not. By default, the safety checker is not loaded unless a - `safety_checker` component is passed to the `kwargs`. - num_in_channels (`int`, *optional*): - Specify the number of input channels for the UNet model. Read more about how to configure UNet model - with this parameter - [here](https://huggingface.co/docs/diffusers/training/adapt_a_model#configure-unet2dconditionmodel-parameters). - scaling_factor (`float`, *optional*): - The scaling factor to use for the VAE model. If not provided, it is inferred from the config file - first. If the scaling factor is not found in the config file, the default value 0.18215 is used. - scheduler_type (`str`, *optional*): - The type of scheduler to load. If not provided, the scheduler type will be inferred from the checkpoint - file. - prediction_type (`str`, *optional*): - The type of prediction to load. If not provided, the prediction type will be inferred from the - checkpoint file. + config (`str`, *optional*): + Can be either: + - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline + hosted on the Hub. + - A path to a *directory* (for example `./my_pipeline_directory/`) containing the pipeline + component configs in Diffusers format. kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline class). 
The overwritten components are passed directly to the pipelines `__init__` method. See example @@ -233,7 +317,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): >>> # Download pipeline from local file >>> # file is downloaded under ./v1-5-pruned-emaonly.ckpt - >>> pipeline = StableDiffusionPipeline.from_single_file("./v1-5-pruned-emaonly") + >>> pipeline = StableDiffusionPipeline.from_single_file("./v1-5-pruned-emaonly.ckpt") >>> # Enable float16 and move to GPU >>> pipeline = StableDiffusionPipeline.from_single_file( @@ -242,8 +326,20 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): ... ) >>> pipeline.to("cuda") ``` + """ original_config_file = kwargs.pop("original_config_file", None) + config = kwargs.pop("config", None) + original_config = kwargs.pop("original_config", None) + + if original_config_file is not None: + deprecation_message = ( + "`original_config_file` argument is deprecated and will be removed in future versions." + "please use the `original_config` argument instead." + ) + deprecate("original_config_file", "1.0.0", deprecation_message) + original_config = original_config_file + resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) @@ -253,68 +349,198 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): revision = kwargs.pop("revision", None) torch_dtype = kwargs.pop("torch_dtype", None) - class_name = cls.__name__ + is_legacy_loading = False - original_config, checkpoint = fetch_ldm_config_and_checkpoint( - pretrained_model_link_or_path=pretrained_model_link_or_path, - class_name=class_name, - original_config_file=original_config_file, + # We shouldn't allow configuring individual models components through a Pipeline creation method + # These model kwargs should be deprecated + scaling_factor = kwargs.get("scaling_factor", None) + if scaling_factor is not None: + deprecation_message = ( + "Passing the `scaling_factor` argument to `from_single_file is deprecated " + "and will be ignored in future versions." + ) + deprecate("scaling_factor", "1.0.0", deprecation_message) + + if original_config is not None: + original_config = fetch_original_config(original_config, local_files_only=local_files_only) + + from ..pipelines.pipeline_utils import _get_pipeline_class + + pipeline_class = _get_pipeline_class(cls, config=None) + + checkpoint = load_single_file_checkpoint( + pretrained_model_link_or_path, resume_download=resume_download, force_download=force_download, proxies=proxies, token=token, - revision=revision, - local_files_only=local_files_only, cache_dir=cache_dir, + local_files_only=local_files_only, + revision=revision, ) - from ..pipelines.pipeline_utils import _get_pipeline_class + if config is None: + config = fetch_diffusers_config(checkpoint) + default_pretrained_model_config_name = config["pretrained_model_name_or_path"] + else: + default_pretrained_model_config_name = config + + if not os.path.isdir(default_pretrained_model_config_name): + # Provided config is a repo_id + if default_pretrained_model_config_name.count("/") > 1: + raise ValueError( + f'The provided config "{config}"' + " is neither a valid local path nor a valid repo id. Please check the parameter." 
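A short usage sketch of the two pipeline-level arguments introduced here; the checkpoint path mirrors the docstring example, while the yaml path is a hypothetical placeholder:

```py
from diffusers import StableDiffusionPipeline

ckpt_path = "./v1-5-pruned-emaonly.ckpt"

# Preferred: point `config` at a Diffusers-format repo id (or local directory),
# so the pipeline components do not have to be inferred from the checkpoint.
pipe = StableDiffusionPipeline.from_single_file(ckpt_path, config="runwayml/stable-diffusion-v1-5")

# Legacy-style loading: `original_config` (a local yaml file or URL) replaces the
# deprecated `original_config_file` argument.
pipe = StableDiffusionPipeline.from_single_file(ckpt_path, original_config="./v1-inference.yaml")
```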
+ ) + try: + # Attempt to download the config files for the pipeline + cached_model_config_path = _download_diffusers_model_config_from_hub( + default_pretrained_model_config_name, + cache_dir=cache_dir, + revision=revision, + proxies=proxies, + force_download=force_download, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + ) + config_dict = pipeline_class.load_config(cached_model_config_path) + + except LocalEntryNotFoundError: + # `local_files_only=True` but a local diffusers format model config is not available in the cache + # If `original_config` is not provided, we need override `local_files_only` to False + # to fetch the config files from the hub so that we have a way + # to configure the pipeline components. + + if original_config is None: + logger.warning( + "`local_files_only` is True but no local configs were found for this checkpoint.\n" + "Attempting to download the necessary config files for this pipeline.\n" + ) + cached_model_config_path = _download_diffusers_model_config_from_hub( + default_pretrained_model_config_name, + cache_dir=cache_dir, + revision=revision, + proxies=proxies, + force_download=force_download, + resume_download=resume_download, + local_files_only=False, + token=token, + ) + config_dict = pipeline_class.load_config(cached_model_config_path) + + else: + # For backwards compatibility + # If `original_config` is provided, then we need to assume we are using legacy loading for pipeline components + logger.warning( + "Detected legacy `from_single_file` loading behavior. Attempting to create the pipeline based on inferred components.\n" + "This may lead to errors if the model components are not correctly inferred. \n" + "To avoid this warning, please explicity pass the `config` argument to `from_single_file` with a path to a local diffusers model repo \n" + "e.g. `from_single_file(, config=) \n" + "or run `from_single_file` with `local_files_only=False` first to update the local cache directory with " + "the necessary config files.\n" + ) + is_legacy_loading = True + cached_model_config_path = None + + config_dict = _infer_pipeline_config_dict(pipeline_class) + config_dict["_class_name"] = pipeline_class.__name__ - pipeline_class = _get_pipeline_class( - cls, - config=None, - cache_dir=cache_dir, - ) + else: + # Provided config is a path to a local directory attempt to load directly. 
+ cached_model_config_path = default_pretrained_model_config_name + config_dict = pipeline_class.load_config(cached_model_config_path) - expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class) + # pop out "_ignore_files" as it is only needed for download + config_dict.pop("_ignore_files", None) + + expected_modules, optional_kwargs = pipeline_class._get_signature_keys(cls) passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} - model_type = kwargs.pop("model_type", None) - image_size = kwargs.pop("image_size", None) - load_safety_checker = (kwargs.pop("load_safety_checker", False)) or ( - passed_class_obj.get("safety_checker", None) is not None - ) + init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs) + init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict} + init_kwargs = {**init_kwargs, **passed_pipe_kwargs} + + from diffusers import pipelines + + # remove `null` components + def load_module(name, value): + if value[0] is None: + return False + if name in passed_class_obj and passed_class_obj[name] is None: + return False + if name in SINGLE_FILE_OPTIONAL_COMPONENTS: + return False + + return True + + init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)} + + for name, (library_name, class_name) in logging.tqdm( + sorted(init_dict.items()), desc="Loading pipeline components..." + ): + loaded_sub_model = None + is_pipeline_module = hasattr(pipelines, library_name) - init_kwargs = {} - for name in expected_modules: if name in passed_class_obj: - init_kwargs[name] = passed_class_obj[name] + loaded_sub_model = passed_class_obj[name] + else: - components = build_sub_model_components( - init_kwargs, - class_name, - name, - original_config, - checkpoint, - model_type=model_type, - image_size=image_size, - load_safety_checker=load_safety_checker, - local_files_only=local_files_only, - torch_dtype=torch_dtype, - **kwargs, - ) - if not components: - continue - init_kwargs.update(components) + try: + loaded_sub_model = load_single_file_sub_model( + library_name=library_name, + class_name=class_name, + name=name, + checkpoint=checkpoint, + is_pipeline_module=is_pipeline_module, + cached_model_config_path=cached_model_config_path, + pipelines=pipelines, + torch_dtype=torch_dtype, + original_config=original_config, + local_files_only=local_files_only, + is_legacy_loading=is_legacy_loading, + **kwargs, + ) + except SingleFileComponentError as e: + raise SingleFileComponentError( + ( + f"{e.message}\n" + f"Please load the component before passing it in as an argument to `from_single_file`.\n" + f"\n" + f"{name} = {class_name}.from_pretrained('...')\n" + f"pipe = {pipeline_class.__name__}.from_single_file(, {name}={name})\n" + f"\n" + ) + ) + + init_kwargs[name] = loaded_sub_model + + missing_modules = set(expected_modules) - set(init_kwargs.keys()) + passed_modules = list(passed_class_obj.keys()) + optional_modules = pipeline_class._optional_components + + if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules): + for module in missing_modules: + init_kwargs[module] = passed_class_obj.get(module, None) + elif len(missing_modules) > 0: + passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs + raise ValueError( + f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed." 
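The `SingleFileComponentError` message above asks the caller to pre-load any component whose weights are missing from the checkpoint and pass it in explicitly. A minimal sketch of that workflow, assuming a hypothetical local checkpoint path (the CLIP repo id is the one referenced in the legacy code path):

```py
from transformers import CLIPTextModel

from diffusers import StableDiffusionPipeline

# Load the missing component separately, then hand it to `from_single_file`.
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
pipe = StableDiffusionPipeline.from_single_file(
    "./v1-5-pruned-emaonly.ckpt",
    text_encoder=text_encoder,
)
```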
+ ) - additional_components = set_additional_components( - class_name, original_config, checkpoint=checkpoint, model_type=model_type - ) - if additional_components: - init_kwargs.update(additional_components) + # deprecated kwargs + load_safety_checker = kwargs.pop("load_safety_checker", None) + if load_safety_checker is not None: + deprecation_message = ( + "Please pass instances of `StableDiffusionSafetyChecker` and `AutoImageProcessor`" + "using the `safety_checker` and `feature_extractor` arguments in `from_single_file`" + ) + deprecate("load_safety_checker", "1.0.0", deprecation_message) + + safety_checker_components = _legacy_load_safety_checker(local_files_only, torch_dtype) + init_kwargs.update(safety_checker_components) - init_kwargs.update(passed_pipe_kwargs) pipe = pipeline_class(**init_kwargs) if torch_dtype is not None: diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py new file mode 100644 index 000000000000..f06f4832740c --- /dev/null +++ b/src/diffusers/loaders/single_file_model.py @@ -0,0 +1,290 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import re +from contextlib import nullcontext +from typing import Optional + +from huggingface_hub.utils import validate_hf_hub_args + +from ..utils import deprecate, is_accelerate_available, logging +from .single_file_utils import ( + SingleFileComponentError, + convert_controlnet_checkpoint, + convert_ldm_unet_checkpoint, + convert_ldm_vae_checkpoint, + convert_stable_cascade_unet_single_file_to_diffusers, + create_controlnet_diffusers_config_from_ldm, + create_unet_diffusers_config_from_ldm, + create_vae_diffusers_config_from_ldm, + fetch_diffusers_config, + fetch_original_config, + load_single_file_checkpoint, +) + + +logger = logging.get_logger(__name__) + + +if is_accelerate_available(): + from accelerate import init_empty_weights + + from ..models.modeling_utils import load_model_dict_into_meta + + +SINGLE_FILE_LOADABLE_CLASSES = { + "StableCascadeUNet": { + "checkpoint_mapping_fn": convert_stable_cascade_unet_single_file_to_diffusers, + }, + "UNet2DConditionModel": { + "checkpoint_mapping_fn": convert_ldm_unet_checkpoint, + "config_mapping_fn": create_unet_diffusers_config_from_ldm, + "default_subfolder": "unet", + "legacy_kwargs": { + "num_in_channels": "in_channels", # Legacy kwargs supported by `from_single_file` mapped to new args + }, + }, + "AutoencoderKL": { + "checkpoint_mapping_fn": convert_ldm_vae_checkpoint, + "config_mapping_fn": create_vae_diffusers_config_from_ldm, + "default_subfolder": "vae", + }, + "ControlNetModel": { + "checkpoint_mapping_fn": convert_controlnet_checkpoint, + "config_mapping_fn": create_controlnet_diffusers_config_from_ldm, + }, +} + + +def _get_mapping_function_kwargs(mapping_fn, **kwargs): + parameters = inspect.signature(mapping_fn).parameters + + mapping_kwargs = {} + for parameter in parameters: + if parameter in kwargs: + mapping_kwargs[parameter] = 
kwargs[parameter] + + return mapping_kwargs + + +class FromOriginalModelMixin: + """ + Load pretrained weights saved in the `.ckpt` or `.safetensors` format into a model. + """ + + @classmethod + @validate_hf_hub_args + def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = None, **kwargs): + r""" + Instantiate a model from pretrained weights saved in the original `.ckpt` or `.safetensors` format. The model + is set in evaluation mode (`model.eval()`) by default. + + Parameters: + pretrained_model_link_or_path_or_dict (`str`, *optional*): + Can be either: + - A link to the `.safetensors` or `.ckpt` file (for example + `"https://huggingface.co//blob/main/.safetensors"`) on the Hub. + - A path to a local *file* containing the weights of the component model. + - A state dict containing the component model weights. + config (`str`, *optional*): + - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline hosted + on the Hub. + - A path to a *directory* (for example `./my_pipeline_directory/`) containing the pipeline component + configs in Diffusers format. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + original_config (`str`, *optional*): + Dict or path to a yaml file containing the configuration for the model in its original format. + If a dict is provided, it will be used to initialize the model configuration. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the + dtype is automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to True, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load and saveable variables (for example the pipeline components of the + specific pipeline class). The overwritten components are directly passed to the pipelines `__init__` + method. See example below for more information. 
+ + ```py + >>> from diffusers import StableCascadeUNet + + >>> ckpt_path = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_lite.safetensors" + >>> model = StableCascadeUNet.from_single_file(ckpt_path) + ``` + """ + + class_name = cls.__name__ + if class_name not in SINGLE_FILE_LOADABLE_CLASSES: + raise ValueError( + f"FromOriginalModelMixin is currently only compatible with {', '.join(SINGLE_FILE_LOADABLE_CLASSES.keys())}" + ) + + pretrained_model_link_or_path = kwargs.get("pretrained_model_link_or_path", None) + if pretrained_model_link_or_path is not None: + deprecation_message = ( + "Please use `pretrained_model_link_or_path_or_dict` argument instead for model classes" + ) + deprecate("pretrained_model_link_or_path", "1.0.0", deprecation_message) + pretrained_model_link_or_path_or_dict = pretrained_model_link_or_path + + config = kwargs.pop("config", None) + original_config = kwargs.pop("original_config", None) + + if config is not None and original_config is not None: + raise ValueError( + "`from_single_file` cannot accept both `config` and `original_config` arguments. Please provide only one of these arguments" + ) + + resume_download = kwargs.pop("resume_download", None) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + cache_dir = kwargs.pop("cache_dir", None) + local_files_only = kwargs.pop("local_files_only", None) + subfolder = kwargs.pop("subfolder", None) + revision = kwargs.pop("revision", None) + torch_dtype = kwargs.pop("torch_dtype", None) + + if isinstance(pretrained_model_link_or_path_or_dict, dict): + checkpoint = pretrained_model_link_or_path_or_dict + else: + checkpoint = load_single_file_checkpoint( + pretrained_model_link_or_path_or_dict, + resume_download=resume_download, + force_download=force_download, + proxies=proxies, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + revision=revision, + ) + + mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[class_name] + + checkpoint_mapping_fn = mapping_functions["checkpoint_mapping_fn"] + if original_config: + if "config_mapping_fn" in mapping_functions: + config_mapping_fn = mapping_functions["config_mapping_fn"] + else: + config_mapping_fn = None + + if config_mapping_fn is None: + raise ValueError( + ( + f"`original_config` has been provided for {class_name} but no mapping function" + "was found to convert the original config to a Diffusers config in" + "`diffusers.loaders.single_file_utils`" + ) + ) + + if isinstance(original_config, str): + # If original_config is a URL or filepath fetch the original_config dict + original_config = fetch_original_config(original_config, local_files_only=local_files_only) + + config_mapping_kwargs = _get_mapping_function_kwargs(config_mapping_fn, **kwargs) + diffusers_model_config = config_mapping_fn( + original_config=original_config, checkpoint=checkpoint, **config_mapping_kwargs + ) + else: + if config: + if isinstance(config, str): + default_pretrained_model_config_name = config + else: + raise ValueError( + ( + "Invalid `config` argument. Please provide a string representing a repo id" + "or path to a local Diffusers model repo." 
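Beyond the StableCascadeUNet example in the docstring, the same mixin covers the other entries of `SINGLE_FILE_LOADABLE_CLASSES`. A hedged sketch follows; the checkpoint paths, repo id, and yaml file are placeholders used for illustration only:

```py
from diffusers import ControlNetModel, UNet2DConditionModel

# Load only the UNet from a single-file checkpoint; the Diffusers-format config is
# read from the given repo, using the class's default subfolder ("unet").
unet = UNet2DConditionModel.from_single_file(
    "./v1-5-pruned-emaonly.ckpt",
    config="runwayml/stable-diffusion-v1-5",
)

# ControlNet checkpoints can instead be configured from an original-format yaml,
# which is routed through create_controlnet_diffusers_config_from_ldm.
controlnet = ControlNetModel.from_single_file(
    "https://huggingface.co/lllyasviel/ControlNet/blob/main/models/control_sd15_canny.pth",
    original_config="./cldm_v15.yaml",
)
```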
+ ) + ) + + else: + config = fetch_diffusers_config(checkpoint) + default_pretrained_model_config_name = config["pretrained_model_name_or_path"] + + if "default_subfolder" in mapping_functions: + subfolder = mapping_functions["default_subfolder"] + + subfolder = subfolder or config.pop( + "subfolder", None + ) # some configs contain a subfolder key, e.g. StableCascadeUNet + + diffusers_model_config = cls.load_config( + pretrained_model_name_or_path=default_pretrained_model_config_name, + subfolder=subfolder, + local_files_only=local_files_only, + ) + expected_kwargs, optional_kwargs = cls._get_signature_keys(cls) + + # Map legacy kwargs to new kwargs + if "legacy_kwargs" in mapping_functions: + legacy_kwargs = mapping_functions["legacy_kwargs"] + for legacy_key, new_key in legacy_kwargs.items(): + if legacy_key in kwargs: + kwargs[new_key] = kwargs.pop(legacy_key) + + model_kwargs = {k: kwargs.get(k) for k in kwargs if k in expected_kwargs or k in optional_kwargs} + diffusers_model_config.update(model_kwargs) + + checkpoint_mapping_kwargs = _get_mapping_function_kwargs(checkpoint_mapping_fn, **kwargs) + diffusers_format_checkpoint = checkpoint_mapping_fn( + config=diffusers_model_config, checkpoint=checkpoint, **checkpoint_mapping_kwargs + ) + if not diffusers_format_checkpoint: + raise SingleFileComponentError( + f"Failed to load {class_name}. Weights for this component appear to be missing in the checkpoint." + ) + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + model = cls.from_config(diffusers_model_config) + + if is_accelerate_available(): + unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype) + if model._keys_to_ignore_on_load_unexpected is not None: + for pat in model._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + else: + model.load_state_dict(diffusers_format_checkpoint) + + if torch_dtype is not None: + model.to(torch_dtype) + + model.eval() + + return model diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index c23e594c3976..4add9fd29654 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -26,7 +26,6 @@ from ..models.modeling_utils import load_state_dict from ..schedulers import ( DDIMScheduler, - DDPMScheduler, DPMSolverMultistepScheduler, EDMDPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, @@ -35,17 +34,19 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from ..utils import is_accelerate_available, is_transformers_available, logging +from ..utils import ( + SAFETENSORS_WEIGHTS_NAME, + WEIGHTS_NAME, + deprecate, + is_accelerate_available, + is_transformers_available, + logging, +) from ..utils.hub_utils import _get_model_file if is_transformers_available(): - from transformers import ( - CLIPTextConfig, - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPTokenizer, - ) + from transformers import AutoImageProcessor if is_accelerate_available(): from accelerate import init_empty_weights @@ -54,116 +55,64 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -CONFIG_URLS = { - "v1": "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml", - "v2": 
"https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml", - "xl": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml", - "xl_refiner": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml", - "upscale": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml", - "controlnet": "https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml", -} - CHECKPOINT_KEY_NAMES = { "v2": "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight", "xl_base": "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias", "xl_refiner": "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias", + "upscale": "model.diffusion_model.input_blocks.10.0.skip_connection.bias", + "controlnet": "control_model.time_embed.0.weight", + "playground-v2-5": "edm_mean", + "inpainting": "model.diffusion_model.input_blocks.0.0.weight", + "clip": "cond_stage_model.transformer.text_model.embeddings.position_ids", + "clip_sdxl": "conditioner.embedders.0.transformer.text_model.embeddings.position_embedding.weight", + "open_clip": "cond_stage_model.model.token_embedding.weight", + "open_clip_sdxl": "conditioner.embedders.1.model.positional_embedding", + "open_clip_sdxl_refiner": "conditioner.embedders.0.model.text_projection", + "stable_cascade_stage_b": "down_blocks.1.0.channelwise.0.weight", + "stable_cascade_stage_c": "clip_txt_mapper.weight", } -SCHEDULER_DEFAULT_CONFIG = { - "beta_schedule": "scaled_linear", - "beta_start": 0.00085, - "beta_end": 0.012, - "interpolation_type": "linear", - "num_train_timesteps": 1000, - "prediction_type": "epsilon", - "sample_max_value": 1.0, - "set_alpha_to_one": False, - "skip_prk_steps": True, - "steps_offset": 1, - "timestep_spacing": "leading", +DIFFUSERS_DEFAULT_PIPELINE_PATHS = { + "xl_base": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0"}, + "xl_refiner": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-xl-refiner-1.0"}, + "xl_inpaint": {"pretrained_model_name_or_path": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"}, + "playground-v2-5": {"pretrained_model_name_or_path": "playgroundai/playground-v2.5-1024px-aesthetic"}, + "upscale": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-x4-upscaler"}, + "inpainting": {"pretrained_model_name_or_path": "runwayml/stable-diffusion-inpainting"}, + "inpainting_v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-inpainting"}, + "controlnet": {"pretrained_model_name_or_path": "lllyasviel/control_v11p_sd15_canny"}, + "v2": {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-1"}, + "v1": {"pretrained_model_name_or_path": "runwayml/stable-diffusion-v1-5"}, + "stable_cascade_stage_b": {"pretrained_model_name_or_path": "stabilityai/stable-cascade", "subfolder": "decoder"}, + "stable_cascade_stage_b_lite": { + "pretrained_model_name_or_path": "stabilityai/stable-cascade", + "subfolder": "decoder_lite", + }, + "stable_cascade_stage_c": { + "pretrained_model_name_or_path": "stabilityai/stable-cascade-prior", + "subfolder": "prior", + }, + "stable_cascade_stage_c_lite": { + "pretrained_model_name_or_path": "stabilityai/stable-cascade-prior", + "subfolder": "prior_lite", + }, } - -STABLE_CASCADE_DEFAULT_CONFIGS = { - "stage_c": {"pretrained_model_name_or_path": 
"diffusers/stable-cascade-configs", "subfolder": "prior"}, - "stage_c_lite": {"pretrained_model_name_or_path": "diffusers/stable-cascade-configs", "subfolder": "prior_lite"}, - "stage_b": {"pretrained_model_name_or_path": "diffusers/stable-cascade-configs", "subfolder": "decoder"}, - "stage_b_lite": {"pretrained_model_name_or_path": "diffusers/stable-cascade-configs", "subfolder": "decoder_lite"}, +# Use to configure model sample size when original config is provided +DIFFUSERS_TO_LDM_DEFAULT_IMAGE_SIZE_MAP = { + "xl_base": 1024, + "xl_refiner": 1024, + "xl_inpaint": 1024, + "playground-v2-5": 1024, + "upscale": 512, + "inpainting": 512, + "inpainting_v2": 512, + "controlnet": 512, + "v2": 768, + "v1": 512, } -def convert_stable_cascade_unet_single_file_to_diffusers(original_state_dict): - is_stage_c = "clip_txt_mapper.weight" in original_state_dict - - if is_stage_c: - state_dict = {} - for key in original_state_dict.keys(): - if key.endswith("in_proj_weight"): - weights = original_state_dict[key].chunk(3, 0) - state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0] - state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1] - state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2] - elif key.endswith("in_proj_bias"): - weights = original_state_dict[key].chunk(3, 0) - state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0] - state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1] - state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2] - elif key.endswith("out_proj.weight"): - weights = original_state_dict[key] - state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights - elif key.endswith("out_proj.bias"): - weights = original_state_dict[key] - state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights - else: - state_dict[key] = original_state_dict[key] - else: - state_dict = {} - for key in original_state_dict.keys(): - if key.endswith("in_proj_weight"): - weights = original_state_dict[key].chunk(3, 0) - state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0] - state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1] - state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2] - elif key.endswith("in_proj_bias"): - weights = original_state_dict[key].chunk(3, 0) - state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0] - state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1] - state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2] - elif key.endswith("out_proj.weight"): - weights = original_state_dict[key] - state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights - elif key.endswith("out_proj.bias"): - weights = original_state_dict[key] - state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights - # rename clip_mapper to clip_txt_pooled_mapper - elif key.endswith("clip_mapper.weight"): - weights = original_state_dict[key] - state_dict[key.replace("clip_mapper.weight", "clip_txt_pooled_mapper.weight")] = weights - elif key.endswith("clip_mapper.bias"): - weights = original_state_dict[key] - state_dict[key.replace("clip_mapper.bias", "clip_txt_pooled_mapper.bias")] = weights - else: - state_dict[key] = original_state_dict[key] - - return state_dict - - -def infer_stable_cascade_single_file_config(checkpoint): - is_stage_c = "clip_txt_mapper.weight" in checkpoint - is_stage_b = "down_blocks.1.0.channelwise.0.weight" in 
checkpoint - - if is_stage_c and (checkpoint["clip_txt_mapper.weight"].shape[0] == 1536): - config_type = "stage_c_lite" - elif is_stage_c and (checkpoint["clip_txt_mapper.weight"].shape[0] == 2048): - config_type = "stage_c" - elif is_stage_b and checkpoint["down_blocks.1.0.channelwise.0.weight"].shape[-1] == 576: - config_type = "stage_b_lite" - elif is_stage_b and checkpoint["down_blocks.1.0.channelwise.0.weight"].shape[-1] == 640: - config_type = "stage_b" - - return STABLE_CASCADE_DEFAULT_CONFIGS[config_type] - - DIFFUSERS_TO_LDM_MAPPING = { "unet": { "layers": { @@ -257,14 +206,6 @@ def infer_stable_cascade_single_file_config(checkpoint): }, } -LDM_VAE_KEY = "first_stage_model." -LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215 -PLAYGROUND_VAE_SCALING_FACTOR = 0.5 -LDM_UNET_KEY = "model.diffusion_model." -LDM_CONTROLNET_KEY = "control_model." -LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."] -LDM_OPEN_CLIP_TEXT_PROJECTION_DIM = 1024 - SD_2_TEXT_ENCODER_KEYS_TO_IGNORE = [ "cond_stage_model.model.transformer.resblocks.23.attn.in_proj_bias", "cond_stage_model.model.transformer.resblocks.23.attn.in_proj_weight", @@ -281,11 +222,51 @@ def infer_stable_cascade_single_file_config(checkpoint): "cond_stage_model.model.text_projection", ] +# To support legacy scheduler_type argument +SCHEDULER_DEFAULT_CONFIG = { + "beta_schedule": "scaled_linear", + "beta_start": 0.00085, + "beta_end": 0.012, + "interpolation_type": "linear", + "num_train_timesteps": 1000, + "prediction_type": "epsilon", + "sample_max_value": 1.0, + "set_alpha_to_one": False, + "skip_prk_steps": True, + "steps_offset": 1, + "timestep_spacing": "leading", +} + +LDM_VAE_KEY = "first_stage_model." +LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215 +PLAYGROUND_VAE_SCALING_FACTOR = 0.5 +LDM_UNET_KEY = "model.diffusion_model." +LDM_CONTROLNET_KEY = "control_model." +LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."] +OPEN_CLIP_PREFIX = "conditioner.embedders.0.model." +LDM_OPEN_CLIP_TEXT_PROJECTION_DIM = 1024 VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"] +class SingleFileComponentError(Exception): + def __init__(self, message=None): + self.message = message + super().__init__(self.message) + + +def is_valid_url(url): + result = urlparse(url) + if result.scheme and result.netloc: + return True + + return False + + def _extract_repo_id_and_weights_name(pretrained_model_name_or_path): + if not is_valid_url(pretrained_model_name_or_path): + raise ValueError("Invalid `pretrained_model_name_or_path` provided. 
Please set it to a valid URL.") + pattern = r"([^/]+)/([^/]+)/(?:blob/main/)?(.+)" weights_name = None repo_id = (None,) @@ -293,6 +274,7 @@ def _extract_repo_id_and_weights_name(pretrained_model_name_or_path): pretrained_model_name_or_path = pretrained_model_name_or_path.replace(prefix, "") match = re.match(pattern, pretrained_model_name_or_path) if not match: + logger.warning("Unable to identify the repo_id and weights_name from the provided URL.") return repo_id, weights_name repo_id = f"{match.group(1)}/{match.group(2)}" @@ -301,34 +283,18 @@ def _extract_repo_id_and_weights_name(pretrained_model_name_or_path): return repo_id, weights_name -def fetch_ldm_config_and_checkpoint( - pretrained_model_link_or_path, - class_name, - original_config_file=None, - resume_download=None, - force_download=False, - proxies=None, - token=None, - cache_dir=None, - local_files_only=None, - revision=None, -): - checkpoint = load_single_file_model_checkpoint( - pretrained_model_link_or_path, - resume_download=resume_download, - force_download=force_download, - proxies=proxies, - token=token, - cache_dir=cache_dir, - local_files_only=local_files_only, - revision=revision, - ) - original_config = fetch_original_config(class_name, checkpoint, original_config_file) +def _is_model_weights_in_cached_folder(cached_folder, name): + pretrained_model_name_or_path = os.path.join(cached_folder, name) + weights_exist = False - return original_config, checkpoint + for weights_name in [WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME]: + if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)): + weights_exist = True + return weights_exist -def load_single_file_model_checkpoint( + +def load_single_file_checkpoint( pretrained_model_link_or_path, resume_download=False, force_download=False, @@ -339,10 +305,11 @@ def load_single_file_model_checkpoint( revision=None, ): if os.path.isfile(pretrained_model_link_or_path): - checkpoint = load_state_dict(pretrained_model_link_or_path) + pretrained_model_link_or_path = pretrained_model_link_or_path + else: repo_id, weights_name = _extract_repo_id_and_weights_name(pretrained_model_link_or_path) - checkpoint_path = _get_model_file( + pretrained_model_link_or_path = _get_model_file( repo_id, weights_name=weights_name, force_download=force_download, @@ -353,7 +320,8 @@ def load_single_file_model_checkpoint( token=token, revision=revision, ) - checkpoint = load_state_dict(checkpoint_path) + + checkpoint = load_state_dict(pretrained_model_link_or_path) # some checkpoints contain the model state dict under a "state_dict" key while "state_dict" in checkpoint: @@ -362,120 +330,154 @@ def load_single_file_model_checkpoint( return checkpoint -def infer_original_config_file(class_name, checkpoint): - if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024: - config_url = CONFIG_URLS["v2"] +def fetch_original_config(original_config_file, local_files_only=False): + if os.path.isfile(original_config_file): + with open(original_config_file, "r") as fp: + original_config_file = fp.read() - elif CHECKPOINT_KEY_NAMES["xl_base"] in checkpoint: - config_url = CONFIG_URLS["xl"] + elif is_valid_url(original_config_file): + if local_files_only: + raise ValueError( + "`local_files_only` is set to True, but a URL was provided as `original_config_file`. " + "Please provide a valid local file path." 
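A small sketch of the reworked `fetch_original_config` helper (import path as in this patch, yaml path hypothetical): it now takes the config location directly instead of inferring it from the pipeline class, reads local files from disk, and only downloads URLs when `local_files_only` is False.

```py
from diffusers.loaders.single_file_utils import fetch_original_config

# A local yaml file is read from disk; passing a URL with local_files_only=True raises.
original_config = fetch_original_config("./v1-inference.yaml", local_files_only=True)
print(list(original_config["model"]["params"].keys()))
```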
+ ) - elif CHECKPOINT_KEY_NAMES["xl_refiner"] in checkpoint: - config_url = CONFIG_URLS["xl_refiner"] + original_config_file = BytesIO(requests.get(original_config_file).content) - elif class_name == "StableDiffusionUpscalePipeline": - config_url = CONFIG_URLS["upscale"] + else: + raise ValueError("Invalid `original_config_file` provided. Please set it to a valid file path or URL.") - elif class_name == "ControlNetModel": - config_url = CONFIG_URLS["controlnet"] + original_config = yaml.safe_load(original_config_file) - else: - config_url = CONFIG_URLS["v1"] + return original_config - original_config_file = BytesIO(requests.get(config_url).content) - return original_config_file +def is_clip_model(checkpoint): + if CHECKPOINT_KEY_NAMES["clip"] in checkpoint: + return True + return False -def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=None): - def is_valid_url(url): - result = urlparse(url) - if result.scheme and result.netloc: - return True - return False +def is_clip_sdxl_model(checkpoint): + if CHECKPOINT_KEY_NAMES["clip_sdxl"] in checkpoint: + return True - if original_config_file is None: - original_config_file = infer_original_config_file(pipeline_class_name, checkpoint) + return False - elif os.path.isfile(original_config_file): - with open(original_config_file, "r") as fp: - original_config_file = fp.read() - elif is_valid_url(original_config_file): - original_config_file = BytesIO(requests.get(original_config_file).content) +def is_open_clip_model(checkpoint): + if CHECKPOINT_KEY_NAMES["open_clip"] in checkpoint: + return True - else: - raise ValueError("Invalid `original_config_file` provided. Please set it to a valid file path or URL.") + return False - original_config = yaml.safe_load(original_config_file) - return original_config +def is_open_clip_sdxl_model(checkpoint): + if CHECKPOINT_KEY_NAMES["open_clip_sdxl"] in checkpoint: + return True + return False -def infer_model_type(original_config, checkpoint, model_type=None): - if model_type is not None: - return model_type - has_cond_stage_config = ( - "cond_stage_config" in original_config["model"]["params"] - and original_config["model"]["params"]["cond_stage_config"] is not None - ) - has_network_config = ( - "network_config" in original_config["model"]["params"] - and original_config["model"]["params"]["network_config"] is not None +def is_open_clip_sdxl_refiner_model(checkpoint): + if CHECKPOINT_KEY_NAMES["open_clip_sdxl_refiner"] in checkpoint: + return True + + return False + + +def is_clip_model_in_single_file(class_obj, checkpoint): + is_clip_in_checkpoint = any( + [ + is_clip_model(checkpoint), + is_open_clip_model(checkpoint), + is_open_clip_sdxl_model(checkpoint), + is_open_clip_sdxl_refiner_model(checkpoint), + ] ) + if ( + class_obj.__name__ == "CLIPTextModel" or class_obj.__name__ == "CLIPTextModelWithProjection" + ) and is_clip_in_checkpoint: + return True - if has_cond_stage_config: - model_type = original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1] + return False - elif has_network_config: - context_dim = original_config["model"]["params"]["network_config"]["params"]["context_dim"] - if "edm_mean" in checkpoint and "edm_std" in checkpoint: - model_type = "Playground" - elif context_dim == 2048: - model_type = "SDXL" + +def infer_diffusers_model_type(checkpoint): + if ( + CHECKPOINT_KEY_NAMES["inpainting"] in checkpoint + and checkpoint[CHECKPOINT_KEY_NAMES["inpainting"]].shape[1] == 9 + ): + if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and 
checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024: + model_type = "inpainting_v2" else: - model_type = "SDXL-Refiner" - else: - raise ValueError("Unable to infer model type from config") + model_type = "inpainting" - logger.debug(f"No `model_type` given, `model_type` inferred as: {model_type}") + elif CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024: + model_type = "v2" - return model_type + elif CHECKPOINT_KEY_NAMES["playground-v2-5"] in checkpoint: + model_type = "playground-v2-5" + elif CHECKPOINT_KEY_NAMES["xl_base"] in checkpoint: + model_type = "xl_base" -def get_default_scheduler_config(): - return SCHEDULER_DEFAULT_CONFIG + elif CHECKPOINT_KEY_NAMES["xl_refiner"] in checkpoint: + model_type = "xl_refiner" + elif CHECKPOINT_KEY_NAMES["upscale"] in checkpoint: + model_type = "upscale" -def set_image_size(pipeline_class_name, original_config, checkpoint, image_size=None, model_type=None): - if image_size: - return image_size + elif CHECKPOINT_KEY_NAMES["controlnet"] in checkpoint: + model_type = "controlnet" - global_step = checkpoint["global_step"] if "global_step" in checkpoint else None - model_type = infer_model_type(original_config, checkpoint, model_type) + elif ( + CHECKPOINT_KEY_NAMES["stable_cascade_stage_c"] in checkpoint + and checkpoint[CHECKPOINT_KEY_NAMES["stable_cascade_stage_c"]].shape[0] == 1536 + ): + model_type = "stable_cascade_stage_c_lite" - if pipeline_class_name == "StableDiffusionUpscalePipeline": - image_size = original_config["model"]["params"]["unet_config"]["params"]["image_size"] - return image_size + elif ( + CHECKPOINT_KEY_NAMES["stable_cascade_stage_c"] in checkpoint + and checkpoint[CHECKPOINT_KEY_NAMES["stable_cascade_stage_c"]].shape[0] == 2048 + ): + model_type = "stable_cascade_stage_c" - elif model_type in ["SDXL", "SDXL-Refiner", "Playground"]: - image_size = 1024 - return image_size + elif ( + CHECKPOINT_KEY_NAMES["stable_cascade_stage_b"] in checkpoint + and checkpoint[CHECKPOINT_KEY_NAMES["stable_cascade_stage_b"]].shape[-1] == 576 + ): + model_type = "stable_cascade_stage_b_lite" elif ( - "parameterization" in original_config["model"]["params"] - and original_config["model"]["params"]["parameterization"] == "v" + CHECKPOINT_KEY_NAMES["stable_cascade_stage_b"] in checkpoint + and checkpoint[CHECKPOINT_KEY_NAMES["stable_cascade_stage_b"]].shape[-1] == 640 ): - # NOTE: For stable diffusion 2 base one has to pass `image_size==512` - # as it relies on a brittle global step parameter here - image_size = 512 if global_step == 875000 else 768 - return image_size + model_type = "stable_cascade_stage_b" else: - image_size = 512 + model_type = "v1" + + return model_type + + +def fetch_diffusers_config(checkpoint): + model_type = infer_diffusers_model_type(checkpoint) + model_path = DIFFUSERS_DEFAULT_PIPELINE_PATHS[model_type] + + return model_path + + +def set_image_size(checkpoint, image_size=None): + if image_size: return image_size + model_type = infer_diffusers_model_type(checkpoint) + image_size = DIFFUSERS_TO_LDM_DEFAULT_IMAGE_SIZE_MAP[model_type] + + return image_size + # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear def conv_attn_to_linear(checkpoint): @@ -490,10 +492,21 @@ def conv_attn_to_linear(checkpoint): checkpoint[key] = checkpoint[key][:, :, 0] -def create_unet_diffusers_config(original_config, image_size: int): +def create_unet_diffusers_config_from_ldm( + original_config, checkpoint, image_size=None, upcast_attention=None, 
num_in_channels=None +): """ Creates a config for the diffusers based on the config of the LDM model. """ + if image_size is not None: + deprecation_message = ( + "Configuring UNet2DConditionModel with the `image_size` argument to `from_single_file`" + "is deprecated and will be ignored in future versions." + ) + deprecate("image_size", "1.0.0", deprecation_message) + + image_size = set_image_size(checkpoint, image_size=image_size) + if ( "unet_config" in original_config["model"]["params"] and original_config["model"]["params"]["unet_config"] is not None @@ -502,6 +515,16 @@ def create_unet_diffusers_config(original_config, image_size: int): else: unet_params = original_config["model"]["params"]["network_config"]["params"] + if num_in_channels is not None: + deprecation_message = ( + "Configuring UNet2DConditionModel with the `num_in_channels` argument to `from_single_file`" + "is deprecated and will be ignored in future versions." + ) + deprecate("image_size", "1.0.0", deprecation_message) + in_channels = num_in_channels + else: + in_channels = unet_params["in_channels"] + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] @@ -566,7 +589,7 @@ def create_unet_diffusers_config(original_config, image_size: int): config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params["in_channels"], + "in_channels": in_channels, "down_block_types": down_block_types, "block_out_channels": block_out_channels, "layers_per_block": unet_params["num_res_blocks"], @@ -580,6 +603,14 @@ def create_unet_diffusers_config(original_config, image_size: int): "transformer_layers_per_block": transformer_layers_per_block, } + if upcast_attention is not None: + deprecation_message = ( + "Configuring UNet2DConditionModel with the `upcast_attention` argument to `from_single_file`" + "is deprecated and will be ignored in future versions." + ) + deprecate("image_size", "1.0.0", deprecation_message) + config["upcast_attention"] = upcast_attention + if "disable_self_attentions" in unet_params: config["only_cross_attention"] = unet_params["disable_self_attentions"] @@ -592,9 +623,18 @@ def create_unet_diffusers_config(original_config, image_size: int): return config -def create_controlnet_diffusers_config(original_config, image_size: int): +def create_controlnet_diffusers_config_from_ldm(original_config, checkpoint, image_size=None, **kwargs): + if image_size is not None: + deprecation_message = ( + "Configuring ControlNetModel with the `image_size` argument" + "is deprecated and will be ignored in future versions." 
+ ) + deprecate("image_size", "1.0.0", deprecation_message) + + image_size = set_image_size(checkpoint, image_size=image_size) + unet_params = original_config["model"]["params"]["control_stage_config"]["params"] - diffusers_unet_config = create_unet_diffusers_config(original_config, image_size=image_size) + diffusers_unet_config = create_unet_diffusers_config_from_ldm(original_config, image_size=image_size) controlnet_config = { "conditioning_channels": unet_params["hint_channels"], @@ -615,15 +655,33 @@ def create_controlnet_diffusers_config(original_config, image_size: int): return controlnet_config -def create_vae_diffusers_config(original_config, image_size, scaling_factor=None, latents_mean=None, latents_std=None): +def create_vae_diffusers_config_from_ldm(original_config, checkpoint, image_size=None, scaling_factor=None): """ Creates a config for the diffusers based on the config of the LDM model. """ + if image_size is not None: + deprecation_message = ( + "Configuring AutoencoderKL with the `image_size` argument" + "is deprecated and will be ignored in future versions." + ) + deprecate("image_size", "1.0.0", deprecation_message) + + image_size = set_image_size(checkpoint, image_size=image_size) + + if "edm_mean" in checkpoint and "edm_std" in checkpoint: + latents_mean = checkpoint["edm_mean"] + latents_std = checkpoint["edm_std"] + else: + latents_mean = None + latents_std = None + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] if (scaling_factor is None) and (latents_mean is not None) and (latents_std is not None): scaling_factor = PLAYGROUND_VAE_SCALING_FACTOR + elif (scaling_factor is None) and ("scale_factor" in original_config["model"]["params"]): scaling_factor = original_config["model"]["params"]["scale_factor"] + elif scaling_factor is None: scaling_factor = LDM_VAE_DEFAULT_SCALING_FACTOR @@ -660,16 +718,104 @@ def update_unet_resnet_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, ma ) if mapping: diffusers_key = diffusers_key.replace(mapping["old"], mapping["new"]) - new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key) + new_checkpoint[diffusers_key] = checkpoint.get(ldm_key) def update_unet_attention_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, mapping): for ldm_key in ldm_keys: diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]) - new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key) + new_checkpoint[diffusers_key] = checkpoint.get(ldm_key) + + +def update_vae_resnet_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping): + for ldm_key in keys: + diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]).replace("nin_shortcut", "conv_shortcut") + new_checkpoint[diffusers_key] = checkpoint.get(ldm_key) + + +def update_vae_attentions_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping): + for ldm_key in keys: + diffusers_key = ( + ldm_key.replace(mapping["old"], mapping["new"]) + .replace("norm.weight", "group_norm.weight") + .replace("norm.bias", "group_norm.bias") + .replace("q.weight", "to_q.weight") + .replace("q.bias", "to_q.bias") + .replace("k.weight", "to_k.weight") + .replace("k.bias", "to_k.bias") + .replace("v.weight", "to_v.weight") + .replace("v.bias", "to_v.bias") + .replace("proj_out.weight", "to_out.0.weight") + .replace("proj_out.bias", "to_out.0.bias") + ) + new_checkpoint[diffusers_key] = checkpoint.get(ldm_key) + + # proj_attn.weight has to be converted from conv 1D to linear + shape = new_checkpoint[diffusers_key].shape + + if len(shape) == 3: + 
new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0] + elif len(shape) == 4: + new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0, 0] + + +def convert_stable_cascade_unet_single_file_to_diffusers(checkpoint, **kwargs): + is_stage_c = "clip_txt_mapper.weight" in checkpoint + + if is_stage_c: + state_dict = {} + for key in checkpoint.keys(): + if key.endswith("in_proj_weight"): + weights = checkpoint[key].chunk(3, 0) + state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0] + state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1] + state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2] + elif key.endswith("in_proj_bias"): + weights = checkpoint[key].chunk(3, 0) + state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0] + state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1] + state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2] + elif key.endswith("out_proj.weight"): + weights = checkpoint[key] + state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights + elif key.endswith("out_proj.bias"): + weights = checkpoint[key] + state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights + else: + state_dict[key] = checkpoint[key] + else: + state_dict = {} + for key in checkpoint.keys(): + if key.endswith("in_proj_weight"): + weights = checkpoint[key].chunk(3, 0) + state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0] + state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1] + state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2] + elif key.endswith("in_proj_bias"): + weights = checkpoint[key].chunk(3, 0) + state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0] + state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1] + state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2] + elif key.endswith("out_proj.weight"): + weights = checkpoint[key] + state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights + elif key.endswith("out_proj.bias"): + weights = checkpoint[key] + state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights + # rename clip_mapper to clip_txt_pooled_mapper + elif key.endswith("clip_mapper.weight"): + weights = checkpoint[key] + state_dict[key.replace("clip_mapper.weight", "clip_txt_pooled_mapper.weight")] = weights + elif key.endswith("clip_mapper.bias"): + weights = checkpoint[key] + state_dict[key.replace("clip_mapper.bias", "clip_txt_pooled_mapper.bias")] = weights + else: + state_dict[key] = checkpoint[key] + + return state_dict -def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False): +def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False, **kwargs): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -688,7 +834,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False): for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." 
+ "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.get(flat_ema_key) else: if sum(k.startswith("model_ema") for k in keys) > 100: logger.warning( @@ -697,7 +843,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False): ) for key in keys: if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.get(key) new_checkpoint = {} ldm_unet_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["layers"] @@ -758,10 +904,10 @@ def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False): ) if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.get( f"input_blocks.{i}.0.op.weight" ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.get( f"input_blocks.{i}.0.op.bias" ) @@ -775,19 +921,22 @@ def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False): ) # Mid blocks - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - update_unet_resnet_ldm_to_diffusers( - resnet_0, new_checkpoint, unet_state_dict, mapping={"old": "middle_block.0", "new": "mid_block.resnets.0"} - ) - update_unet_resnet_ldm_to_diffusers( - resnet_1, new_checkpoint, unet_state_dict, mapping={"old": "middle_block.2", "new": "mid_block.resnets.1"} - ) - update_unet_attention_ldm_to_diffusers( - attentions, new_checkpoint, unet_state_dict, mapping={"old": "middle_block.1", "new": "mid_block.attentions.0"} - ) + for key in middle_blocks.keys(): + diffusers_key = max(key - 1, 0) + if key % 2 == 0: + update_unet_resnet_ldm_to_diffusers( + middle_blocks[key], + new_checkpoint, + unet_state_dict, + mapping={"old": f"middle_block.{key}", "new": f"mid_block.resnets.{diffusers_key}"}, + ) + else: + update_unet_attention_ldm_to_diffusers( + middle_blocks[key], + new_checkpoint, + unet_state_dict, + mapping={"old": f"middle_block.{key}", "new": f"mid_block.attentions.{diffusers_key}"}, + ) # Up Blocks for i in range(num_output_blocks): @@ -836,6 +985,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False): def convert_controlnet_checkpoint( checkpoint, config, + **kwargs, ): # Some controlnet ckpt files are distributed independently from the rest of the # model components i.e. 
https://huggingface.co/thibaud/controlnet-sd21/ @@ -848,7 +998,7 @@ def convert_controlnet_checkpoint( controlnet_key = LDM_CONTROLNET_KEY for key in keys: if key.startswith(controlnet_key): - controlnet_state_dict[key.replace(controlnet_key, "")] = checkpoint.pop(key) + controlnet_state_dict[key.replace(controlnet_key, "")] = checkpoint.get(key) new_checkpoint = {} ldm_controlnet_keys = DIFFUSERS_TO_LDM_MAPPING["controlnet"]["layers"] @@ -882,10 +1032,10 @@ def convert_controlnet_checkpoint( ) if f"input_blocks.{i}.0.op.weight" in controlnet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = controlnet_state_dict.pop( + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = controlnet_state_dict.get( f"input_blocks.{i}.0.op.weight" ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = controlnet_state_dict.pop( + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = controlnet_state_dict.get( f"input_blocks.{i}.0.op.bias" ) @@ -900,8 +1050,8 @@ def convert_controlnet_checkpoint( # controlnet down blocks for i in range(num_input_blocks): - new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = controlnet_state_dict.pop(f"zero_convs.{i}.0.weight") - new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = controlnet_state_dict.pop(f"zero_convs.{i}.0.bias") + new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = controlnet_state_dict.get(f"zero_convs.{i}.0.weight") + new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = controlnet_state_dict.get(f"zero_convs.{i}.0.bias") # Retrieves the keys for the middle blocks only num_middle_blocks = len( @@ -911,33 +1061,28 @@ def convert_controlnet_checkpoint( layer_id: [key for key in controlnet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } - if middle_blocks: - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - update_unet_resnet_ldm_to_diffusers( - resnet_0, - new_checkpoint, - controlnet_state_dict, - mapping={"old": "middle_block.0", "new": "mid_block.resnets.0"}, - ) - update_unet_resnet_ldm_to_diffusers( - resnet_1, - new_checkpoint, - controlnet_state_dict, - mapping={"old": "middle_block.2", "new": "mid_block.resnets.1"}, - ) - update_unet_attention_ldm_to_diffusers( - attentions, - new_checkpoint, - controlnet_state_dict, - mapping={"old": "middle_block.1", "new": "mid_block.attentions.0"}, - ) + # Mid blocks + for key in middle_blocks.keys(): + diffusers_key = max(key - 1, 0) + if key % 2 == 0: + update_unet_resnet_ldm_to_diffusers( + middle_blocks[key], + new_checkpoint, + controlnet_state_dict, + mapping={"old": f"middle_block.{key}", "new": f"mid_block.resnets.{diffusers_key}"}, + ) + else: + update_unet_attention_ldm_to_diffusers( + middle_blocks[key], + new_checkpoint, + controlnet_state_dict, + mapping={"old": f"middle_block.{key}", "new": f"mid_block.attentions.{diffusers_key}"}, + ) # mid block - new_checkpoint["controlnet_mid_block.weight"] = controlnet_state_dict.pop("middle_block_out.0.weight") - new_checkpoint["controlnet_mid_block.bias"] = controlnet_state_dict.pop("middle_block_out.0.bias") + new_checkpoint["controlnet_mid_block.weight"] = controlnet_state_dict.get("middle_block_out.0.weight") + new_checkpoint["controlnet_mid_block.bias"] = controlnet_state_dict.get("middle_block_out.0.bias") # controlnet cond embedding blocks cond_embedding_blocks = { @@ -951,86 +1096,16 @@ def convert_controlnet_checkpoint( diffusers_idx = idx - 1 cond_block_id = 2 * idx - 
new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_idx}.weight"] = controlnet_state_dict.pop( + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_idx}.weight"] = controlnet_state_dict.get( f"input_hint_block.{cond_block_id}.weight" ) - new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_idx}.bias"] = controlnet_state_dict.pop( + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_idx}.bias"] = controlnet_state_dict.get( f"input_hint_block.{cond_block_id}.bias" ) return new_checkpoint -def create_diffusers_controlnet_model_from_ldm( - pipeline_class_name, original_config, checkpoint, upcast_attention=False, image_size=None, torch_dtype=None -): - # import here to avoid circular imports - from ..models import ControlNetModel - - image_size = set_image_size(pipeline_class_name, original_config, checkpoint, image_size=image_size) - - diffusers_config = create_controlnet_diffusers_config(original_config, image_size=image_size) - diffusers_config["upcast_attention"] = upcast_attention - - diffusers_format_controlnet_checkpoint = convert_controlnet_checkpoint(checkpoint, diffusers_config) - - ctx = init_empty_weights if is_accelerate_available() else nullcontext - with ctx(): - controlnet = ControlNetModel(**diffusers_config) - - if is_accelerate_available(): - unexpected_keys = load_model_dict_into_meta( - controlnet, diffusers_format_controlnet_checkpoint, dtype=torch_dtype - ) - if controlnet._keys_to_ignore_on_load_unexpected is not None: - for pat in controlnet._keys_to_ignore_on_load_unexpected: - unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] - - if len(unexpected_keys) > 0: - logger.warning( - f"Some weights of the model checkpoint were not used when initializing {controlnet.__name__}: \n {[', '.join(unexpected_keys)]}" - ) - else: - controlnet.load_state_dict(diffusers_format_controlnet_checkpoint) - - if torch_dtype is not None: - controlnet = controlnet.to(torch_dtype) - - return {"controlnet": controlnet} - - -def update_vae_resnet_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping): - for ldm_key in keys: - diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]).replace("nin_shortcut", "conv_shortcut") - new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key) - - -def update_vae_attentions_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping): - for ldm_key in keys: - diffusers_key = ( - ldm_key.replace(mapping["old"], mapping["new"]) - .replace("norm.weight", "group_norm.weight") - .replace("norm.bias", "group_norm.bias") - .replace("q.weight", "to_q.weight") - .replace("q.bias", "to_q.bias") - .replace("k.weight", "to_k.weight") - .replace("k.bias", "to_k.bias") - .replace("v.weight", "to_v.weight") - .replace("v.bias", "to_v.bias") - .replace("proj_out.weight", "to_out.0.weight") - .replace("proj_out.bias", "to_out.0.bias") - ) - new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key) - - # proj_attn.weight has to be converted from conv 1D to linear - shape = new_checkpoint[diffusers_key].shape - - if len(shape) == 3: - new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0] - elif len(shape) == 4: - new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0, 0] - - def convert_ldm_vae_checkpoint(checkpoint, config): # extract state dict for VAE # remove the LDM_VAE_KEY prefix from the ldm checkpoint keys so that it is easier to map them to diffusers keys @@ -1063,10 +1138,10 @@ def convert_ldm_vae_checkpoint(checkpoint, config): mapping={"old": f"down.{i}.block", 
"new": f"down_blocks.{i}.resnets"}, ) if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.get( f"encoder.down.{i}.downsample.conv.weight" ) - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.get( f"encoder.down.{i}.downsample.conv.bias" ) @@ -1131,18 +1206,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config): return new_checkpoint -def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=False, torch_dtype=None): - try: - config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'." - ) - - ctx = init_empty_weights if is_accelerate_available() else nullcontext - with ctx(): - text_model = CLIPTextModel(config) - +def convert_ldm_clip_checkpoint(checkpoint): keys = list(checkpoint.keys()) text_model_dict = {} @@ -1152,55 +1216,26 @@ def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_ for prefix in remove_prefixes: if key.startswith(prefix): diffusers_key = key.replace(prefix, "") - text_model_dict[diffusers_key] = checkpoint[key] + text_model_dict[diffusers_key] = checkpoint.get(key) - if is_accelerate_available(): - unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict, dtype=torch_dtype) - if text_model._keys_to_ignore_on_load_unexpected is not None: - for pat in text_model._keys_to_ignore_on_load_unexpected: - unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + return text_model_dict - if len(unexpected_keys) > 0: - logger.warning( - f"Some weights of the model checkpoint were not used when initializing {text_model.__class__.__name__}: \n {[', '.join(unexpected_keys)]}" - ) - else: - if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)): - text_model_dict.pop("text_model.embeddings.position_ids", None) - text_model.load_state_dict(text_model_dict) - - if torch_dtype is not None: - text_model = text_model.to(torch_dtype) - - return text_model - - -def create_text_encoder_from_open_clip_checkpoint( - config_name, +def convert_open_clip_checkpoint( + text_model, checkpoint, prefix="cond_stage_model.model.", - has_projection=False, - local_files_only=False, - torch_dtype=None, - **config_kwargs, ): - try: - config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: '{config_name}'." 
- ) - - ctx = init_empty_weights if is_accelerate_available() else nullcontext - with ctx(): - text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config) - text_model_dict = {} text_proj_key = prefix + "text_projection" - text_proj_dim = ( - int(checkpoint[text_proj_key].shape[0]) if text_proj_key in checkpoint else LDM_OPEN_CLIP_TEXT_PROJECTION_DIM - ) + + if text_proj_key in checkpoint: + text_proj_dim = int(checkpoint[text_proj_key].shape[0]) + elif hasattr(text_model.config, "projection_dim"): + text_proj_dim = text_model.config.projection_dim + else: + text_proj_dim = LDM_OPEN_CLIP_TEXT_PROJECTION_DIM + text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids") keys = list(checkpoint.keys()) @@ -1233,303 +1268,165 @@ def create_text_encoder_from_open_clip_checkpoint( ) if key.endswith(".in_proj_weight"): - weight_value = checkpoint[key] + weight_value = checkpoint.get(key) - text_model_dict[diffusers_key + ".q_proj.weight"] = weight_value[:text_proj_dim, :] - text_model_dict[diffusers_key + ".k_proj.weight"] = weight_value[text_proj_dim : text_proj_dim * 2, :] - text_model_dict[diffusers_key + ".v_proj.weight"] = weight_value[text_proj_dim * 2 :, :] + text_model_dict[diffusers_key + ".q_proj.weight"] = weight_value[:text_proj_dim, :].clone().detach() + text_model_dict[diffusers_key + ".k_proj.weight"] = ( + weight_value[text_proj_dim : text_proj_dim * 2, :].clone().detach() + ) + text_model_dict[diffusers_key + ".v_proj.weight"] = weight_value[text_proj_dim * 2 :, :].clone().detach() elif key.endswith(".in_proj_bias"): - weight_value = checkpoint[key] - text_model_dict[diffusers_key + ".q_proj.bias"] = weight_value[:text_proj_dim] - text_model_dict[diffusers_key + ".k_proj.bias"] = weight_value[text_proj_dim : text_proj_dim * 2] - text_model_dict[diffusers_key + ".v_proj.bias"] = weight_value[text_proj_dim * 2 :] - else: - text_model_dict[diffusers_key] = checkpoint[key] - - if is_accelerate_available(): - unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict, dtype=torch_dtype) - if text_model._keys_to_ignore_on_load_unexpected is not None: - for pat in text_model._keys_to_ignore_on_load_unexpected: - unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] - - if len(unexpected_keys) > 0: - logger.warning( - f"Some weights of the model checkpoint were not used when initializing {text_model.__class__.__name__}: \n {[', '.join(unexpected_keys)]}" + weight_value = checkpoint.get(key) + text_model_dict[diffusers_key + ".q_proj.bias"] = weight_value[:text_proj_dim].clone().detach() + text_model_dict[diffusers_key + ".k_proj.bias"] = ( + weight_value[text_proj_dim : text_proj_dim * 2].clone().detach() ) + text_model_dict[diffusers_key + ".v_proj.bias"] = weight_value[text_proj_dim * 2 :].clone().detach() + else: + text_model_dict[diffusers_key] = checkpoint.get(key) - else: - if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)): - text_model_dict.pop("text_model.embeddings.position_ids", None) - - text_model.load_state_dict(text_model_dict) - - if torch_dtype is not None: - text_model = text_model.to(torch_dtype) + if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)): + text_model_dict.pop("text_model.embeddings.position_ids", None) - return text_model + return text_model_dict -def create_diffusers_unet_model_from_ldm( - pipeline_class_name, - original_config, +def 
create_diffusers_clip_model_from_ldm( + cls, checkpoint, - num_in_channels=None, - upcast_attention=None, - extract_ema=False, - image_size=None, + subfolder="", + config=None, torch_dtype=None, - model_type=None, + local_files_only=None, + is_legacy_loading=False, ): - from ..models import UNet2DConditionModel + if config: + config = {"pretrained_model_name_or_path": config} + else: + config = fetch_diffusers_config(checkpoint) - if num_in_channels is None: - if pipeline_class_name in [ - "StableDiffusionInpaintPipeline", - "StableDiffusionControlNetInpaintPipeline", - "StableDiffusionXLInpaintPipeline", - "StableDiffusionXLControlNetInpaintPipeline", - ]: - num_in_channels = 9 + # For backwards compatibility + # Older versions of `from_single_file` expected CLIP configs to be placed in their original transformers model repo + # in the cache_dir, rather than in a subfolder of the Diffusers model + if is_legacy_loading: + logger.warning( + ( + "Detected legacy CLIP loading behavior. Please run `from_single_file` with `local_files_only=False once to update " + "the local cache directory with the necessary CLIP model config files. " + "Attempting to load CLIP model from legacy cache directory." + ) + ) - elif pipeline_class_name == "StableDiffusionUpscalePipeline": - num_in_channels = 7 + if is_clip_model(checkpoint) or is_clip_sdxl_model(checkpoint): + clip_config = "openai/clip-vit-large-patch14" + config["pretrained_model_name_or_path"] = clip_config + subfolder = "" - else: - num_in_channels = 4 + elif is_open_clip_model(checkpoint): + clip_config = "stabilityai/stable-diffusion-2" + config["pretrained_model_name_or_path"] = clip_config + subfolder = "text_encoder" - image_size = set_image_size( - pipeline_class_name, original_config, checkpoint, image_size=image_size, model_type=model_type - ) - unet_config = create_unet_diffusers_config(original_config, image_size=image_size) - unet_config["in_channels"] = num_in_channels - if upcast_attention is not None: - unet_config["upcast_attention"] = upcast_attention + else: + clip_config = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" + config["pretrained_model_name_or_path"] = clip_config + subfolder = "" - diffusers_format_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config, extract_ema=extract_ema) + model_config = cls.config_class.from_pretrained(**config, subfolder=subfolder, local_files_only=local_files_only) ctx = init_empty_weights if is_accelerate_available() else nullcontext - with ctx(): - unet = UNet2DConditionModel(**unet_config) + model = cls(model_config) - if is_accelerate_available(): - unexpected_keys = load_model_dict_into_meta(unet, diffusers_format_unet_checkpoint, dtype=torch_dtype) - if unet._keys_to_ignore_on_load_unexpected is not None: - for pat in unet._keys_to_ignore_on_load_unexpected: - unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + position_embedding_dim = model.text_model.embeddings.position_embedding.weight.shape[-1] - if len(unexpected_keys) > 0: - logger.warning( - f"Some weights of the model checkpoint were not used when initializing {unet.__name__}: \n {[', '.join(unexpected_keys)]}" - ) - else: - unet.load_state_dict(diffusers_format_unet_checkpoint) + if is_clip_model(checkpoint): + diffusers_format_checkpoint = convert_ldm_clip_checkpoint(checkpoint) - if torch_dtype is not None: - unet = unet.to(torch_dtype) - - return {"unet": unet} + elif ( + is_clip_sdxl_model(checkpoint) + and checkpoint[CHECKPOINT_KEY_NAMES["clip_sdxl"]].shape[-1] == 
position_embedding_dim + ): + diffusers_format_checkpoint = convert_ldm_clip_checkpoint(checkpoint) + elif is_open_clip_model(checkpoint): + prefix = "cond_stage_model.model." + diffusers_format_checkpoint = convert_open_clip_checkpoint(model, checkpoint, prefix=prefix) -def create_diffusers_vae_model_from_ldm( - pipeline_class_name, - original_config, - checkpoint, - image_size=None, - scaling_factor=None, - torch_dtype=None, - model_type=None, -): - # import here to avoid circular imports - from ..models import AutoencoderKL + elif ( + is_open_clip_sdxl_model(checkpoint) + and checkpoint[CHECKPOINT_KEY_NAMES["open_clip_sdxl"]].shape[-1] == position_embedding_dim + ): + prefix = "conditioner.embedders.1.model." + diffusers_format_checkpoint = convert_open_clip_checkpoint(model, checkpoint, prefix=prefix) - image_size = set_image_size( - pipeline_class_name, original_config, checkpoint, image_size=image_size, model_type=model_type - ) - model_type = infer_model_type(original_config, checkpoint, model_type) + elif is_open_clip_sdxl_refiner_model(checkpoint): + prefix = "conditioner.embedders.0.model." + diffusers_format_checkpoint = convert_open_clip_checkpoint(model, checkpoint, prefix=prefix) - if model_type == "Playground": - edm_mean = ( - checkpoint["edm_mean"].to(dtype=torch_dtype).tolist() if torch_dtype else checkpoint["edm_mean"].tolist() - ) - edm_std = ( - checkpoint["edm_std"].to(dtype=torch_dtype).tolist() if torch_dtype else checkpoint["edm_std"].tolist() - ) else: - edm_mean = None - edm_std = None - - vae_config = create_vae_diffusers_config( - original_config, - image_size=image_size, - scaling_factor=scaling_factor, - latents_mean=edm_mean, - latents_std=edm_std, - ) - diffusers_format_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - ctx = init_empty_weights if is_accelerate_available() else nullcontext - - with ctx(): - vae = AutoencoderKL(**vae_config) + raise ValueError("The provided checkpoint does not seem to contain a valid CLIP model.") if is_accelerate_available(): - unexpected_keys = load_model_dict_into_meta(vae, diffusers_format_vae_checkpoint, dtype=torch_dtype) - if vae._keys_to_ignore_on_load_unexpected is not None: - for pat in vae._keys_to_ignore_on_load_unexpected: + unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype) + if model._keys_to_ignore_on_load_unexpected is not None: + for pat in model._keys_to_ignore_on_load_unexpected: unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] if len(unexpected_keys) > 0: logger.warning( - f"Some weights of the model checkpoint were not used when initializing {vae.__name__}: \n {[', '.join(unexpected_keys)]}" + f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" ) + else: - vae.load_state_dict(diffusers_format_vae_checkpoint) + model.load_state_dict(diffusers_format_checkpoint) if torch_dtype is not None: - vae = vae.to(torch_dtype) + model.to(torch_dtype) - return {"vae": vae} + model.eval() + return model -def create_text_encoders_and_tokenizers_from_ldm( - original_config, + +def _legacy_load_scheduler( + cls, checkpoint, - model_type=None, - local_files_only=False, - torch_dtype=None, + component_name, + original_config=None, + **kwargs, ): - model_type = infer_model_type(original_config, checkpoint=checkpoint, model_type=model_type) - - if model_type == "FrozenOpenCLIPEmbedder": - config_name = "stabilityai/stable-diffusion-2" - config_kwargs = 
{"subfolder": "text_encoder"} - - try: - text_encoder = create_text_encoder_from_open_clip_checkpoint( - config_name, checkpoint, local_files_only=local_files_only, torch_dtype=torch_dtype, **config_kwargs - ) - tokenizer = CLIPTokenizer.from_pretrained( - config_name, subfolder="tokenizer", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder in the following path: '{config_name}'." - ) - else: - return {"text_encoder": text_encoder, "tokenizer": tokenizer} - - elif model_type == "FrozenCLIPEmbedder": - try: - config_name = "openai/clip-vit-large-patch14" - text_encoder = create_text_encoder_from_ldm_clip_checkpoint( - config_name, - checkpoint, - local_files_only=local_files_only, - torch_dtype=torch_dtype, - ) - tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only) - - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: '{config_name}'." - ) - else: - return {"text_encoder": text_encoder, "tokenizer": tokenizer} - - elif model_type == "SDXL-Refiner": - config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" - config_kwargs = {"projection_dim": 1280} - prefix = "conditioner.embedders.0.model." - - try: - tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only) - text_encoder_2 = create_text_encoder_from_open_clip_checkpoint( - config_name, - checkpoint, - prefix=prefix, - has_projection=True, - local_files_only=local_files_only, - torch_dtype=torch_dtype, - **config_kwargs, - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'." - ) + scheduler_type = kwargs.get("scheduler_type", None) + prediction_type = kwargs.get("prediction_type", None) - else: - return { - "text_encoder": None, - "tokenizer": None, - "tokenizer_2": tokenizer_2, - "text_encoder_2": text_encoder_2, - } - - elif model_type in ["SDXL", "Playground"]: - try: - config_name = "openai/clip-vit-large-patch14" - tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only) - text_encoder = create_text_encoder_from_ldm_clip_checkpoint( - config_name, checkpoint, local_files_only=local_files_only, torch_dtype=torch_dtype - ) - - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'." - ) - - try: - config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" - config_kwargs = {"projection_dim": 1280} - prefix = "conditioner.embedders.1.model." - tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only) - text_encoder_2 = create_text_encoder_from_open_clip_checkpoint( - config_name, - checkpoint, - prefix=prefix, - has_projection=True, - local_files_only=local_files_only, - torch_dtype=torch_dtype, - **config_kwargs, - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'." 
- ) - - return { - "tokenizer": tokenizer, - "text_encoder": text_encoder, - "tokenizer_2": tokenizer_2, - "text_encoder_2": text_encoder_2, - } - - return + if scheduler_type is not None: + deprecation_message = ( + "Please pass an instance of a Scheduler object directly to the `scheduler` argument in `from_single_file`." + ) + deprecate("scheduler_type", "1.0.0", deprecation_message) + if prediction_type is not None: + deprecation_message = ( + "Please configure an instance of a Scheduler with the appropriate `prediction_type` " + "and pass the object directly to the `scheduler` argument in `from_single_file`." + ) + deprecate("prediction_type", "1.0.0", deprecation_message) -def create_scheduler_from_ldm( - pipeline_class_name, - original_config, - checkpoint, - prediction_type=None, - scheduler_type="ddim", - model_type=None, -): - scheduler_config = get_default_scheduler_config() - model_type = infer_model_type(original_config, checkpoint=checkpoint, model_type=model_type) + scheduler_config = SCHEDULER_DEFAULT_CONFIG + model_type = infer_diffusers_model_type(checkpoint=checkpoint) global_step = checkpoint["global_step"] if "global_step" in checkpoint else None - num_train_timesteps = getattr(original_config["model"]["params"], "timesteps", None) or 1000 + if original_config: + num_train_timesteps = getattr(original_config["model"]["params"], "timesteps", 1000) + else: + num_train_timesteps = 1000 + scheduler_config["num_train_timesteps"] = num_train_timesteps - if ( - "parameterization" in original_config["model"]["params"] - and original_config["model"]["params"]["parameterization"] == "v" - ): + if model_type == "v2": if prediction_type is None: - # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` - # as it relies on a brittle global step parameter here + # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` # as it relies on a brittle global step parameter here prediction_type = "epsilon" if global_step == 875000 else "v_prediction" else: @@ -1537,20 +1434,44 @@ def create_scheduler_from_ldm( scheduler_config["prediction_type"] = prediction_type - if model_type in ["SDXL", "SDXL-Refiner"]: + if model_type in ["xl_base", "xl_refiner"]: scheduler_type = "euler" - elif model_type == "Playground": + elif model_type == "playground": scheduler_type = "edm_dpm_solver_multistep" else: - beta_start = original_config["model"]["params"].get("linear_start", 0.02) - beta_end = original_config["model"]["params"].get("linear_end", 0.085) + if original_config: + beta_start = original_config["model"]["params"].get("linear_start") + beta_end = original_config["model"]["params"].get("linear_end") + + else: + beta_start = 0.02 + beta_end = 0.085 + scheduler_config["beta_start"] = beta_start scheduler_config["beta_end"] = beta_end scheduler_config["beta_schedule"] = "scaled_linear" scheduler_config["clip_sample"] = False scheduler_config["set_alpha_to_one"] = False - if scheduler_type == "pndm": + # to deal with an edge case StableDiffusionUpscale pipeline has two schedulers + if component_name == "low_res_scheduler": + return cls.from_config( + { + "beta_end": 0.02, + "beta_schedule": "scaled_linear", + "beta_start": 0.0001, + "clip_sample": True, + "num_train_timesteps": 1000, + "prediction_type": "epsilon", + "trained_betas": None, + "variance_type": "fixed_small", + } + ) + + if scheduler_type is None: + return cls.from_config(scheduler_config) + + elif scheduler_type == "pndm": scheduler_config["skip_prk_steps"] = True 
scheduler = PNDMScheduler.from_config(scheduler_config) @@ -1595,15 +1516,46 @@ def create_scheduler_from_ldm( else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - if pipeline_class_name == "StableDiffusionUpscalePipeline": - scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler") - low_res_scheduler = DDPMScheduler.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler" - ) + return scheduler - return { - "scheduler": scheduler, - "low_res_scheduler": low_res_scheduler, - } - return {"scheduler": scheduler} +def _legacy_load_clip_tokenizer(cls, checkpoint, config=None, local_files_only=False): + if config: + config = {"pretrained_model_name_or_path": config} + else: + config = fetch_diffusers_config(checkpoint) + + if is_clip_model(checkpoint) or is_clip_sdxl_model(checkpoint): + clip_config = "openai/clip-vit-large-patch14" + config["pretrained_model_name_or_path"] = clip_config + subfolder = "" + + elif is_open_clip_model(checkpoint): + clip_config = "stabilityai/stable-diffusion-2" + config["pretrained_model_name_or_path"] = clip_config + subfolder = "tokenizer" + + else: + clip_config = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" + config["pretrained_model_name_or_path"] = clip_config + subfolder = "" + + tokenizer = cls.from_pretrained(**config, subfolder=subfolder, local_files_only=local_files_only) + + return tokenizer + + +def _legacy_load_safety_checker(local_files_only, torch_dtype): + # Support for loading safety checker components using the deprecated + # `load_safety_checker` argument. + + from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + feature_extractor = AutoImageProcessor.from_pretrained( + "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only, torch_dtype=torch_dtype + ) + safety_checker = StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only, torch_dtype=torch_dtype + ) + + return {"safety_checker": safety_checker, "feature_extractor": feature_extractor} diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py index a9b9a9aae052..b6e1545e16dd 100644 --- a/src/diffusers/loaders/textual_inversion.py +++ b/src/diffusers/loaders/textual_inversion.py @@ -419,19 +419,20 @@ def load_textual_inversion( # 7.1 Offload all hooks in case the pipeline was cpu offloaded before make sure, we offload and onload again is_model_cpu_offload = False is_sequential_cpu_offload = False - for _, component in self.components.items(): - if isinstance(component, nn.Module): - if hasattr(component, "_hf_hook"): - is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = ( - isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) - or hasattr(component._hf_hook, "hooks") - and isinstance(component._hf_hook.hooks[0], AlignDevicesHook) - ) - logger.info( - "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again." 
-                    )
-                    remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
+        if self.hf_device_map is None:
+            for _, component in self.components.items():
+                if isinstance(component, nn.Module):
+                    if hasattr(component, "_hf_hook"):
+                        is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
+                        is_sequential_cpu_offload = (
+                            isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                            or hasattr(component._hf_hook, "hooks")
+                            and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+                        )
+                        logger.info(
+                            "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
+                        )
+                        remove_hook_from_module(component, recurse=is_sequential_cpu_offload)

         # 7.2 save expected device and dtype
         device = text_encoder.device
diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py
index 5d5ed30dc35f..7db7bfeda600 100644
--- a/src/diffusers/loaders/unet.py
+++ b/src/diffusers/loaders/unet.py
@@ -44,11 +44,6 @@
     set_adapter_layers,
     set_weights_and_activate_adapters,
 )
-from .single_file_utils import (
-    convert_stable_cascade_unet_single_file_to_diffusers,
-    infer_stable_cascade_single_file_config,
-    load_single_file_model_checkpoint,
-)
 from .unet_loader_utils import _maybe_expand_lora_scales
 from .utils import AttnProcsLayers
@@ -1059,103 +1054,3 @@ def _load_ip_adapter_loras(self, state_dicts):
             }
         )
         return lora_dicts
-
-
-class FromOriginalUNetMixin:
-    """
-    Load pretrained UNet model weights saved in the `.ckpt` or `.safetensors` format into a [`StableCascadeUNet`].
-    """
-
-    @classmethod
-    @validate_hf_hub_args
-    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
-        r"""
-        Instantiate a [`StableCascadeUNet`] from pretrained StableCascadeUNet weights saved in the original `.ckpt` or
-        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
-
-        Parameters:
-            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
-                Can be either:
-                    - A link to the `.ckpt` file (for example
-                      `"https://huggingface.co//blob/main/.ckpt"`) on the Hub.
-                    - A path to a *file* containing all pipeline weights.
-            config: (`dict`, *optional*):
-                Dictionary containing the configuration of the model:
-            torch_dtype (`str` or `torch.dtype`, *optional*):
-                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
-                dtype is automatically derived from the model's weights.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
-                is not used.
-            resume_download:
-                Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1
-                of Diffusers.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only (`bool`, *optional*, defaults to `False`):
-                Whether to only load local model weights and configuration files or not. If set to True, the model
-                won't be downloaded from the Hub.
- token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - kwargs (remaining dictionary of keyword arguments, *optional*): - Can be used to overwrite load and saveable variables of the model. - - """ - class_name = cls.__name__ - if class_name != "StableCascadeUNet": - raise ValueError("FromOriginalUNetMixin is currently only compatible with StableCascadeUNet") - - config = kwargs.pop("config", None) - resume_download = kwargs.pop("resume_download", None) - force_download = kwargs.pop("force_download", False) - proxies = kwargs.pop("proxies", None) - token = kwargs.pop("token", None) - cache_dir = kwargs.pop("cache_dir", None) - local_files_only = kwargs.pop("local_files_only", None) - revision = kwargs.pop("revision", None) - torch_dtype = kwargs.pop("torch_dtype", None) - - checkpoint = load_single_file_model_checkpoint( - pretrained_model_link_or_path, - resume_download=resume_download, - force_download=force_download, - proxies=proxies, - token=token, - cache_dir=cache_dir, - local_files_only=local_files_only, - revision=revision, - ) - - if config is None: - config = infer_stable_cascade_single_file_config(checkpoint) - model_config = cls.load_config(**config, **kwargs) - else: - model_config = config - - ctx = init_empty_weights if is_accelerate_available() else nullcontext - with ctx(): - model = cls.from_config(model_config, **kwargs) - - diffusers_format_checkpoint = convert_stable_cascade_unet_single_file_to_diffusers(checkpoint) - if is_accelerate_available(): - unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype) - if len(unexpected_keys) > 0: - logger.warning( - f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" - ) - - else: - model.load_state_dict(diffusers_format_checkpoint) - - if torch_dtype is not None: - model.to(torch_dtype) - - return model diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 50866e3a7a8c..3d4fccb20779 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -282,15 +282,15 @@ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0): def forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. 
`scale` will be ignored.") @@ -477,10 +477,10 @@ def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs): def forward( self, - hidden_states: torch.FloatTensor, + hidden_states: torch.Tensor, num_frames: int, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_hidden_states: Optional[torch.Tensor] = None, + ) -> torch.Tensor: # Notice that normalization is always applied before the real computation in the following blocks. # 0. Self-Attention batch_size = hidden_states.shape[0] diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index ea1c987e95c6..cbb07eafa37f 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -503,9 +503,9 @@ def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProce def forward( self, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, **cross_attention_kwargs, ) -> torch.Tensor: r""" @@ -751,10 +751,10 @@ class AttnProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, ) -> torch.Tensor: @@ -863,9 +863,9 @@ def __init__( def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: batch_size, sequence_length, _ = hidden_states.shape attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) @@ -928,9 +928,9 @@ class AttnAddedKVProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, *args, **kwargs, ) -> torch.Tensor: @@ -1001,9 +1001,9 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, *args, **kwargs, ) -> torch.Tensor: @@ -1080,9 +1080,9 @@ def __init__(self, attention_op: Optional[Callable] = None): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: residual = hidden_states hidden_states = 
hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) @@ -1151,13 +1151,13 @@ def __init__(self, attention_op: Optional[Callable] = None): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1243,13 +1243,13 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1349,13 +1349,13 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1448,13 +1448,13 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. 
`scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1581,10 +1581,10 @@ def __init__( def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: batch_size, sequence_length, _ = ( hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape ) @@ -1692,10 +1692,10 @@ def __init__( def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: batch_size, sequence_length, _ = hidden_states.shape attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) if self.train_q_out: @@ -1773,10 +1773,10 @@ def __init__(self, slice_size: int): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: residual = hidden_states input_ndim = hidden_states.ndim @@ -1860,11 +1860,11 @@ def __init__(self, slice_size): def __call__( self, attn: "Attention", - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: residual = hidden_states if attn.spatial_norm is not None: @@ -1957,7 +1957,7 @@ def __init__( self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) - def forward(self, f: torch.FloatTensor, zq: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, f: torch.Tensor, zq: torch.Tensor) -> torch.Tensor: f_size = f.shape[-2:] zq = F.interpolate(zq, size=f_size, mode="nearest") norm_f = self.norm_layer(f) @@ -2003,7 +2003,7 @@ def __init__( self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) - def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2064,7 +2064,7 @@ def __init__( self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) - def 
__call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2143,7 +2143,7 @@ def __init__( self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) - def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2202,7 +2202,7 @@ def __init__( self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2264,12 +2264,12 @@ def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, - ip_adapter_masks: Optional[torch.FloatTensor] = None, + ip_adapter_masks: Optional[torch.Tensor] = None, ): residual = hidden_states @@ -2467,12 +2467,12 @@ def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, - ip_adapter_masks: Optional[torch.FloatTensor] = None, + ip_adapter_masks: Optional[torch.Tensor] = None, ): residual = hidden_states diff --git a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py index fc2041d2e930..85b4195077df 100644 --- a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py @@ -112,9 +112,7 @@ def __init__( self.register_to_config(force_upcast=False) @apply_forward_hook - def encode( - self, x: torch.FloatTensor, return_dict: bool = True - ) -> Union[AutoencoderKLOutput, Tuple[torch.FloatTensor]]: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[AutoencoderKLOutput, Tuple[torch.Tensor]]: h = self.encoder(x) moments = self.quant_conv(h) posterior = DiagonalGaussianDistribution(moments) @@ -126,11 +124,11 @@ def encode( def _decode( self, - z: torch.FloatTensor, - image: Optional[torch.FloatTensor] = None, - mask: Optional[torch.FloatTensor] = None, + z: torch.Tensor, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = 
None, return_dict: bool = True, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: z = self.post_quant_conv(z) dec = self.decoder(z, image, mask) @@ -142,12 +140,12 @@ def _decode( @apply_forward_hook def decode( self, - z: torch.FloatTensor, + z: torch.Tensor, generator: Optional[torch.Generator] = None, - image: Optional[torch.FloatTensor] = None, - mask: Optional[torch.FloatTensor] = None, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, return_dict: bool = True, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: decoded = self._decode(z, image, mask).sample if not return_dict: @@ -157,16 +155,16 @@ def decode( def forward( self, - sample: torch.FloatTensor, - mask: Optional[torch.FloatTensor] = None, + sample: torch.Tensor, + mask: Optional[torch.Tensor] = None, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: r""" Args: - sample (`torch.FloatTensor`): Input sample. - mask (`torch.FloatTensor`, *optional*, defaults to `None`): Optional inpainting mask. + sample (`torch.Tensor`): Input sample. + mask (`torch.Tensor`, *optional*, defaults to `None`): Optional inpainting mask. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index 0b9b9d4d47e5..9d919d374ae6 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -17,7 +17,7 @@ import torch.nn as nn from ...configuration_utils import ConfigMixin, register_to_config -from ...loaders import FromOriginalVAEMixin +from ...loaders.single_file_model import FromOriginalModelMixin from ...utils.accelerate_utils import apply_forward_hook from ..attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, @@ -32,7 +32,7 @@ from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder -class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin): +class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin): r""" A VAE model with KL loss for encoding images into latents and decoding latent representations into images. @@ -237,13 +237,13 @@ def set_default_attn_processor(self): @apply_forward_hook def encode( - self, x: torch.FloatTensor, return_dict: bool = True + self, x: torch.Tensor, return_dict: bool = True ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: """ Encode a batch of images into latents. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. 
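
The swap above from FromOriginalVAEMixin to the shared FromOriginalModelMixin keeps the same public entry point for single-file loading of AutoencoderKL. A minimal usage sketch; the checkpoint location is a placeholder for any Stable Diffusion VAE .safetensors or .ckpt file:

import torch
from diffusers import AutoencoderKL

# Placeholder single-file checkpoint location; substitute any SD VAE .safetensors/.ckpt file.
ckpt_path = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors"
vae = AutoencoderKL.from_single_file(ckpt_path, torch_dtype=torch.float16)
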
@@ -268,7 +268,7 @@ def encode( return AutoencoderKLOutput(latent_dist=posterior) - def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): return self.tiled_decode(z, return_dict=return_dict) @@ -281,14 +281,12 @@ def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[Decod return DecoderOutput(sample=dec) @apply_forward_hook - def decode( - self, z: torch.FloatTensor, return_dict: bool = True, generator=None - ) -> Union[DecoderOutput, torch.FloatTensor]: + def decode(self, z: torch.Tensor, return_dict: bool = True, generator=None) -> Union[DecoderOutput, torch.Tensor]: """ Decode a batch of images. Args: - z (`torch.FloatTensor`): Input batch of latent vectors. + z (`torch.Tensor`): Input batch of latent vectors. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. @@ -302,7 +300,7 @@ def decode( decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] decoded = torch.cat(decoded_slices) else: - decoded = self._decode(z).sample + decoded = self._decode(z, return_dict=False)[0] if not return_dict: return (decoded,) @@ -321,7 +319,7 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch. b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) return b - def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput: + def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -331,7 +329,7 @@ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Autoen output, but they should be much less noticeable. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. @@ -375,12 +373,12 @@ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Autoen return AutoencoderKLOutput(latent_dist=posterior) - def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: r""" Decode a batch of images using a tiled decoder. Args: - z (`torch.FloatTensor`): Input batch of latent vectors. + z (`torch.Tensor`): Input batch of latent vectors. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. @@ -425,14 +423,14 @@ def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[ def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, torch.FloatTensor]: + ) -> Union[DecoderOutput, torch.Tensor]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. 
sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py index b12226fa4bac..67540cb7dc7f 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py @@ -86,10 +86,10 @@ def __init__( def forward( self, - sample: torch.FloatTensor, - image_only_indicator: torch.FloatTensor, + sample: torch.Tensor, + image_only_indicator: torch.Tensor, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r"""The forward method of the `Decoder` class.""" sample = self.conv_in(sample) @@ -315,13 +315,13 @@ def set_default_attn_processor(self): @apply_forward_hook def encode( - self, x: torch.FloatTensor, return_dict: bool = True + self, x: torch.Tensor, return_dict: bool = True ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: """ Encode a batch of images into latents. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. @@ -341,15 +341,15 @@ def encode( @apply_forward_hook def decode( self, - z: torch.FloatTensor, + z: torch.Tensor, num_frames: int, return_dict: bool = True, - ) -> Union[DecoderOutput, torch.FloatTensor]: + ) -> Union[DecoderOutput, torch.Tensor]: """ Decode a batch of images. Args: - z (`torch.FloatTensor`): Input batch of latent vectors. + z (`torch.Tensor`): Input batch of latent vectors. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. @@ -370,15 +370,15 @@ def decode( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, num_frames: int = 1, - ) -> Union[DecoderOutput, torch.FloatTensor]: + ) -> Union[DecoderOutput, torch.Tensor]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. 
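
As context for the tiling hunks above: blend_h (and its vertical counterpart) cross-fades the overlapping border of adjacent tiles so tiled encode/decode does not show seams. A 1-D sketch of that linear blend, illustrative only and not part of the patch:

import torch

def blend_1d(a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
    # Same idea as blend_h, reduced to one dimension: fade from tile `a` into tile `b`
    # over the first `blend_extent` samples of `b`.
    blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
    for x in range(blend_extent):
        b[..., x] = a[..., -blend_extent + x] * (1 - x / blend_extent) + b[..., x] * (x / blend_extent)
    return b

prev_tile = torch.ones(8)    # right edge of the previous tile
next_tile = torch.zeros(8)   # left edge of the next tile
print(blend_1d(prev_tile, next_tile, 4))  # -> [1.00, 0.75, 0.50, 0.25, 0., 0., 0., 0.]
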
return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/autoencoder_tiny.py b/src/diffusers/models/autoencoders/autoencoder_tiny.py index a7047acdfd74..39b885b45222 100644 --- a/src/diffusers/models/autoencoders/autoencoder_tiny.py +++ b/src/diffusers/models/autoencoders/autoencoder_tiny.py @@ -157,11 +157,11 @@ def _set_gradient_checkpointing(self, module, value: bool = False) -> None: if isinstance(module, (EncoderTiny, DecoderTiny)): module.gradient_checkpointing = value - def scale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor: + def scale_latents(self, x: torch.Tensor) -> torch.Tensor: """raw latents -> [0, 1]""" return x.div(2 * self.latent_magnitude).add(self.latent_shift).clamp(0, 1) - def unscale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor: + def unscale_latents(self, x: torch.Tensor) -> torch.Tensor: """[0, 1] -> raw latents""" return x.sub(self.latent_shift).mul(2 * self.latent_magnitude) @@ -194,7 +194,7 @@ def disable_tiling(self) -> None: """ self.enable_tiling(False) - def _tiled_encode(self, x: torch.FloatTensor) -> torch.FloatTensor: + def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -202,10 +202,10 @@ def _tiled_encode(self, x: torch.FloatTensor) -> torch.FloatTensor: tiles overlap and are blended together to form a smooth output. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. Returns: - `torch.FloatTensor`: Encoded batch of images. + `torch.Tensor`: Encoded batch of images. """ # scale of encoder output relative to input sf = self.spatial_scale_factor @@ -242,7 +242,7 @@ def _tiled_encode(self, x: torch.FloatTensor) -> torch.FloatTensor: tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out) return out - def _tiled_decode(self, x: torch.FloatTensor) -> torch.FloatTensor: + def _tiled_decode(self, x: torch.Tensor) -> torch.Tensor: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -250,10 +250,10 @@ def _tiled_decode(self, x: torch.FloatTensor) -> torch.FloatTensor: tiles overlap and are blended together to form a smooth output. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. Returns: - `torch.FloatTensor`: Encoded batch of images. + `torch.Tensor`: Encoded batch of images. 
""" # scale of decoder output relative to input sf = self.spatial_scale_factor @@ -290,9 +290,7 @@ def _tiled_decode(self, x: torch.FloatTensor) -> torch.FloatTensor: return out @apply_forward_hook - def encode( - self, x: torch.FloatTensor, return_dict: bool = True - ) -> Union[AutoencoderTinyOutput, Tuple[torch.FloatTensor]]: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[AutoencoderTinyOutput, Tuple[torch.Tensor]]: if self.use_slicing and x.shape[0] > 1: output = [ self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x_slice) for x_slice in x.split(1) @@ -308,8 +306,8 @@ def encode( @apply_forward_hook def decode( - self, x: torch.FloatTensor, generator: Optional[torch.Generator] = None, return_dict: bool = True - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + self, x: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: if self.use_slicing and x.shape[0] > 1: output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)] output = torch.cat(output) @@ -323,12 +321,12 @@ def decode( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. """ diff --git a/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/src/diffusers/models/autoencoders/consistency_decoder_vae.py index 7287cbd43f63..212c46537706 100644 --- a/src/diffusers/models/autoencoders/consistency_decoder_vae.py +++ b/src/diffusers/models/autoencoders/consistency_decoder_vae.py @@ -276,13 +276,13 @@ def set_default_attn_processor(self): @apply_forward_hook def encode( - self, x: torch.FloatTensor, return_dict: bool = True + self, x: torch.Tensor, return_dict: bool = True ) -> Union[ConsistencyDecoderVAEOutput, Tuple[DiagonalGaussianDistribution]]: """ Encode a batch of images into latents. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain tuple. @@ -312,22 +312,22 @@ def encode( @apply_forward_hook def decode( self, - z: torch.FloatTensor, + z: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, num_inference_steps: int = 2, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: """ Decodes the input latent vector `z` using the consistency decoder VAE model. Args: - z (torch.FloatTensor): The input latent vector. + z (torch.Tensor): The input latent vector. generator (Optional[torch.Generator]): The random number generator. Default is None. return_dict (bool): Whether to return the output as a dictionary. Default is True. num_inference_steps (int): The number of inference steps. Default is 2. Returns: - Union[DecoderOutput, Tuple[torch.FloatTensor]]: The decoded output. + Union[DecoderOutput, Tuple[torch.Tensor]]: The decoded output. """ z = (z * self.config.scaling_factor - self.means) / self.stds @@ -370,9 +370,7 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch. 
b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) return b - def tiled_encode( - self, x: torch.FloatTensor, return_dict: bool = True - ) -> Union[ConsistencyDecoderVAEOutput, Tuple]: + def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[ConsistencyDecoderVAEOutput, Tuple]: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -382,7 +380,7 @@ def tiled_encode( output, but they should be much less noticeable. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain tuple. @@ -429,14 +427,14 @@ def tiled_encode( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/vae.py b/src/diffusers/models/autoencoders/vae.py index 75503dc6a4ad..bb80ce8605ba 100644 --- a/src/diffusers/models/autoencoders/vae.py +++ b/src/diffusers/models/autoencoders/vae.py @@ -36,11 +36,12 @@ class DecoderOutput(BaseOutput): Output of decoding method. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): The decoded output sample from the last layer of the model. 
""" - sample: torch.FloatTensor + sample: torch.Tensor + commit_loss: Optional[torch.FloatTensor] = None class Encoder(nn.Module): @@ -136,7 +137,7 @@ def __init__( self.gradient_checkpointing = False - def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, sample: torch.Tensor) -> torch.Tensor: r"""The forward method of the `Encoder` class.""" sample = self.conv_in(sample) @@ -282,9 +283,9 @@ def __init__( def forward( self, - sample: torch.FloatTensor, - latent_embeds: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + sample: torch.Tensor, + latent_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: r"""The forward method of the `Decoder` class.""" sample = self.conv_in(sample) @@ -367,7 +368,7 @@ def __init__( self.out_channels = out_channels self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1) - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r"""The forward method of the `UpSample` class.""" x = torch.relu(x) x = self.deconv(x) @@ -416,7 +417,7 @@ def __init__( self.layers = nn.Sequential(*layers) - def forward(self, x: torch.FloatTensor, mask=None) -> torch.FloatTensor: + def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor: r"""The forward method of the `MaskConditionEncoder` class.""" out = {} for l in range(len(self.layers)): @@ -533,11 +534,11 @@ def __init__( def forward( self, - z: torch.FloatTensor, - image: Optional[torch.FloatTensor] = None, - mask: Optional[torch.FloatTensor] = None, - latent_embeds: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + z: torch.Tensor, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + latent_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: r"""The forward method of the `MaskConditionDecoder` class.""" sample = z sample = self.conv_in(sample) @@ -711,7 +712,7 @@ def unmap_to_all(self, inds: torch.LongTensor) -> torch.LongTensor: back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds) return back.reshape(ishape) - def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor, Tuple]: + def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, Tuple]: # reshape z -> (batch, height, width, channel) and flatten z = z.permute(0, 2, 3, 1).contiguous() z_flattened = z.view(-1, self.vq_embed_dim) @@ -730,7 +731,7 @@ def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatT loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2) # preserve gradients - z_q: torch.FloatTensor = z + (z_q - z).detach() + z_q: torch.Tensor = z + (z_q - z).detach() # reshape back to match original input shape z_q = z_q.permute(0, 3, 1, 2).contiguous() @@ -745,7 +746,7 @@ def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatT return z_q, loss, (perplexity, min_encodings, min_encoding_indices) - def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.FloatTensor: + def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.Tensor: # shape specifying (batch, height, width, channel) if self.remap is not None: indices = indices.reshape(shape[0], -1) # add batch axis @@ -753,7 +754,7 @@ def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) indices = indices.reshape(-1) # flatten again # get quantized latent vectors - z_q: 
torch.FloatTensor = self.embedding(indices) + z_q: torch.Tensor = self.embedding(indices) if shape is not None: z_q = z_q.view(shape) @@ -776,7 +777,7 @@ def __init__(self, parameters: torch.Tensor, deterministic: bool = False): self.mean, device=self.parameters.device, dtype=self.parameters.dtype ) - def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor: + def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor: # make sure sample is on the same device as the parameters and has same dtype sample = randn_tensor( self.mean.shape, @@ -873,7 +874,7 @@ def __init__( self.layers = nn.Sequential(*layers) self.gradient_checkpointing = False - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r"""The forward method of the `EncoderTiny` class.""" if self.training and self.gradient_checkpointing: @@ -956,7 +957,7 @@ def __init__( self.layers = nn.Sequential(*layers) self.gradient_checkpointing = False - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r"""The forward method of the `DecoderTiny` class.""" # Clamp. x = torch.tanh(x / 3) * 3 diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 200881c0ed27..2618a8b15f82 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -19,7 +19,7 @@ from torch.nn import functional as F from ..configuration_utils import ConfigMixin, register_to_config -from ..loaders import FromOriginalControlNetMixin +from ..loaders.single_file_model import FromOriginalModelMixin from ..utils import BaseOutput, logging from .attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, @@ -108,7 +108,7 @@ def forward(self, conditioning): return embedding -class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin): +class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin): """ A ControlNet model. @@ -665,10 +665,10 @@ def _set_gradient_checkpointing(self, module, value: bool = False) -> None: def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, - controlnet_cond: torch.FloatTensor, + controlnet_cond: torch.Tensor, conditioning_scale: float = 1.0, class_labels: Optional[torch.Tensor] = None, timestep_cond: Optional[torch.Tensor] = None, @@ -677,18 +677,18 @@ def forward( cross_attention_kwargs: Optional[Dict[str, Any]] = None, guess_mode: bool = False, return_dict: bool = True, - ) -> Union[ControlNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]: + ) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]: """ The [`ControlNetModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor. timestep (`Union[torch.Tensor, float, int]`): The number of timesteps to denoise an input. encoder_hidden_states (`torch.Tensor`): The encoder hidden states. - controlnet_cond (`torch.FloatTensor`): + controlnet_cond (`torch.Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. conditioning_scale (`float`, defaults to `1.0`): The scale factor for ControlNet outputs. 
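Note (not part of the patch): the recurring `torch.FloatTensor` -> `torch.Tensor` annotation change in the hunks above is a typing correction only; runtime behavior is unchanged. A minimal sketch of why the broader annotation is the accurate one, given that these models routinely run in half precision and on CUDA:

    import torch

    # torch.FloatTensor historically denotes a float32 CPU tensor, so it does not
    # describe fp16/bf16 or CUDA inputs. torch.Tensor covers every dtype and device.
    x = torch.randn(2, 4, 8, 8, dtype=torch.float16)
    print(isinstance(x, torch.Tensor))  # True
    print(x.dtype)                      # torch.float16
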
diff --git a/src/diffusers/models/controlnet_xs.py b/src/diffusers/models/controlnet_xs.py index a4f9e61f37c7..7c3ade26f1ac 100644 --- a/src/diffusers/models/controlnet_xs.py +++ b/src/diffusers/models/controlnet_xs.py @@ -17,7 +17,7 @@ import torch import torch.utils.checkpoint -from torch import FloatTensor, nn +from torch import Tensor, nn from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput, is_torch_version, logging @@ -54,12 +54,12 @@ class ControlNetXSOutput(BaseOutput): The output of [`UNetControlNetXSModel`]. Args: - sample (`FloatTensor` of shape `(batch_size, num_channels, height, width)`): + sample (`Tensor` of shape `(batch_size, num_channels, height, width)`): The output of the `UNetControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to the base model output, but is already the final output. """ - sample: FloatTensor = None + sample: Tensor = None class DownBlockControlNetXSAdapter(nn.Module): @@ -1001,7 +1001,7 @@ def unfuse_qkv_projections(self): def forward( self, - sample: FloatTensor, + sample: Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, controlnet_cond: Optional[torch.Tensor] = None, @@ -1018,13 +1018,13 @@ def forward( The [`ControlNetXSModel`] forward method. Args: - sample (`FloatTensor`): + sample (`Tensor`): The noisy input tensor. timestep (`Union[torch.Tensor, float, int]`): The number of timesteps to denoise an input. encoder_hidden_states (`torch.Tensor`): The encoder hidden states. - controlnet_cond (`FloatTensor`): + controlnet_cond (`Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. conditioning_scale (`float`, defaults to `1.0`): How much the control model affects the base model outputs. @@ -1402,16 +1402,16 @@ def freeze_base_params(self) -> None: def forward( self, - hidden_states_base: FloatTensor, - temb: FloatTensor, - encoder_hidden_states: Optional[FloatTensor] = None, - hidden_states_ctrl: Optional[FloatTensor] = None, + hidden_states_base: Tensor, + temb: Tensor, + encoder_hidden_states: Optional[Tensor] = None, + hidden_states_ctrl: Optional[Tensor] = None, conditioning_scale: Optional[float] = 1.0, - attention_mask: Optional[FloatTensor] = None, + attention_mask: Optional[Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[FloatTensor] = None, + encoder_attention_mask: Optional[Tensor] = None, apply_control: bool = True, - ) -> Tuple[FloatTensor, FloatTensor, Tuple[FloatTensor, ...], Tuple[FloatTensor, ...]]: + ) -> Tuple[Tensor, Tensor, Tuple[Tensor, ...], Tuple[Tensor, ...]]: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. 
`scale` will be ignored.") @@ -1626,16 +1626,16 @@ def freeze_base_params(self) -> None: def forward( self, - hidden_states_base: FloatTensor, - temb: FloatTensor, - encoder_hidden_states: FloatTensor, - hidden_states_ctrl: Optional[FloatTensor] = None, + hidden_states_base: Tensor, + temb: Tensor, + encoder_hidden_states: Tensor, + hidden_states_ctrl: Optional[Tensor] = None, conditioning_scale: Optional[float] = 1.0, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - attention_mask: Optional[FloatTensor] = None, - encoder_attention_mask: Optional[FloatTensor] = None, + attention_mask: Optional[Tensor] = None, + encoder_attention_mask: Optional[Tensor] = None, apply_control: bool = True, - ) -> Tuple[FloatTensor, FloatTensor]: + ) -> Tuple[Tensor, Tensor]: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1807,18 +1807,18 @@ def freeze_base_params(self) -> None: def forward( self, - hidden_states: FloatTensor, - res_hidden_states_tuple_base: Tuple[FloatTensor, ...], - res_hidden_states_tuple_ctrl: Tuple[FloatTensor, ...], - temb: FloatTensor, - encoder_hidden_states: Optional[FloatTensor] = None, + hidden_states: Tensor, + res_hidden_states_tuple_base: Tuple[Tensor, ...], + res_hidden_states_tuple_ctrl: Tuple[Tensor, ...], + temb: Tensor, + encoder_hidden_states: Optional[Tensor] = None, conditioning_scale: Optional[float] = 1.0, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - attention_mask: Optional[FloatTensor] = None, + attention_mask: Optional[Tensor] = None, upsample_size: Optional[int] = None, - encoder_attention_mask: Optional[FloatTensor] = None, + encoder_attention_mask: Optional[Tensor] = None, apply_control: bool = True, - ) -> FloatTensor: + ) -> Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") diff --git a/src/diffusers/models/downsampling.py b/src/diffusers/models/downsampling.py index 6d556e1e67ac..4e384e731c74 100644 --- a/src/diffusers/models/downsampling.py +++ b/src/diffusers/models/downsampling.py @@ -129,7 +129,7 @@ def __init__( else: self.conv = conv - def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -180,24 +180,24 @@ def __init__( def _downsample_2d( self, - hidden_states: torch.FloatTensor, - weight: Optional[torch.FloatTensor] = None, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + weight: Optional[torch.Tensor] = None, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """Fused `Conv2d()` followed by `downsample_2d()`. Padding is performed only once at the beginning, not between the operations. The fused op is considerably more efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of arbitrary order. 
Args: - hidden_states (`torch.FloatTensor`): + hidden_states (`torch.Tensor`): Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - weight (`torch.FloatTensor`, *optional*): + weight (`torch.Tensor`, *optional*): Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to average pooling. factor (`int`, *optional*, default to `2`): @@ -206,7 +206,7 @@ def _downsample_2d( Scaling factor for signal magnitude. Returns: - output (`torch.FloatTensor`): + output (`torch.Tensor`): Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and same datatype as `x`. """ @@ -244,7 +244,7 @@ def _downsample_2d( return output - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if self.use_conv: downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel) hidden_states = downsample_input + self.Conv2d_0.bias.reshape(1, -1, 1, 1) @@ -286,11 +286,11 @@ def forward(self, inputs: torch.Tensor) -> torch.Tensor: def downsample_2d( - hidden_states: torch.FloatTensor, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, -) -> torch.FloatTensor: +) -> torch.Tensor: r"""Downsample2D a batch of 2D images with the given filter. Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the @@ -298,9 +298,9 @@ def downsample_2d( shape is a multiple of the downsampling factor. Args: - hidden_states (`torch.FloatTensor`) + hidden_states (`torch.Tensor`) Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to average pooling. factor (`int`, *optional*, default to `2`): @@ -309,7 +309,7 @@ def downsample_2d( Scaling factor for signal magnitude. 
Returns: - output (`torch.FloatTensor`): + output (`torch.Tensor`): Tensor of the shape `[N, C, H // factor, W // factor]` """ diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index ced520bb8204..d13f8a06cf63 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -424,7 +424,7 @@ def __init__( self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) self.text_proj = nn.Linear(text_embed_dim, cross_attention_dim) - def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor): + def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor): batch_size = text_embeds.shape[0] # image @@ -450,7 +450,7 @@ def __init__( self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) self.norm = nn.LayerNorm(cross_attention_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): batch_size = image_embeds.shape[0] # image @@ -468,7 +468,7 @@ def __init__(self, image_embed_dim=1024, cross_attention_dim=1024): self.ff = FeedForward(image_embed_dim, cross_attention_dim, mult=1, activation_fn="gelu") self.norm = nn.LayerNorm(cross_attention_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): return self.norm(self.ff(image_embeds)) @@ -482,7 +482,7 @@ def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_t self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu") self.norm = nn.LayerNorm(cross_attention_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): x = self.ff(image_embeds) x = x.reshape(-1, self.num_tokens, self.cross_attention_dim) return self.norm(x) @@ -530,7 +530,7 @@ def __init__(self, text_embed_dim: int = 768, image_embed_dim: int = 768, time_e self.text_norm = nn.LayerNorm(time_embed_dim) self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) - def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor): + def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor): # text time_text_embeds = self.text_proj(text_embeds) time_text_embeds = self.text_norm(time_text_embeds) @@ -547,7 +547,7 @@ def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) self.image_norm = nn.LayerNorm(time_embed_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): # image time_image_embeds = self.image_proj(image_embeds) time_image_embeds = self.image_norm(time_image_embeds) @@ -577,7 +577,7 @@ def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): nn.Conv2d(256, 4, 3, padding=1), ) - def forward(self, image_embeds: torch.FloatTensor, hint: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor, hint: torch.Tensor): # image time_image_embeds = self.image_proj(image_embeds) time_image_embeds = self.image_norm(time_image_embeds) @@ -1007,7 +1007,7 @@ def __init__(self, IPAdapterImageProjectionLayers: Union[List[nn.Module], Tuple[ super().__init__() self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers) - def forward(self, image_embeds: List[torch.FloatTensor]): + def forward(self, image_embeds: List[torch.Tensor]): projected_image_embeds = [] # currently, we accept `image_embeds` 
as diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py new file mode 100644 index 000000000000..635cd0ba5728 --- /dev/null +++ b/src/diffusers/models/model_loading_utils.py @@ -0,0 +1,149 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import os +from collections import OrderedDict +from typing import List, Optional, Union + +import safetensors +import torch + +from ..utils import ( + SAFETENSORS_FILE_EXTENSION, + is_accelerate_available, + is_torch_version, + logging, +) + + +logger = logging.get_logger(__name__) + + +if is_accelerate_available(): + from accelerate import infer_auto_device_map + from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device + + +# Adapted from `transformers` (see modeling_utils.py) +def _determine_device_map(model: torch.nn.Module, device_map, max_memory, torch_dtype): + if isinstance(device_map, str): + no_split_modules = model._get_no_split_modules(device_map) + device_map_kwargs = {"no_split_module_classes": no_split_modules} + + if device_map != "sequential": + max_memory = get_balanced_memory( + model, + dtype=torch_dtype, + low_zero=(device_map == "balanced_low_0"), + max_memory=max_memory, + **device_map_kwargs, + ) + else: + max_memory = get_max_memory(max_memory) + + device_map_kwargs["max_memory"] = max_memory + device_map = infer_auto_device_map(model, dtype=torch_dtype, **device_map_kwargs) + + return device_map + + +def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None): + """ + Reads a checkpoint file, returning properly formatted errors if they arise. + """ + try: + file_extension = os.path.basename(checkpoint_file).split(".")[-1] + if file_extension == SAFETENSORS_FILE_EXTENSION: + return safetensors.torch.load_file(checkpoint_file, device="cpu") + else: + weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {} + return torch.load( + checkpoint_file, + map_location="cpu", + **weights_only_kwarg, + ) + except Exception as e: + try: + with open(checkpoint_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please install " + "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " + "you cloned." + ) + else: + raise ValueError( + f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained " + "model. Make sure you have saved the model properly." + ) from e + except (UnicodeDecodeError, ValueError): + raise OSError( + f"Unable to load weights from checkpoint file for '{checkpoint_file}' " f"at '{checkpoint_file}'. 
" + ) + + +def load_model_dict_into_meta( + model, + state_dict: OrderedDict, + device: Optional[Union[str, torch.device]] = None, + dtype: Optional[Union[str, torch.dtype]] = None, + model_name_or_path: Optional[str] = None, +) -> List[str]: + device = device or torch.device("cpu") + dtype = dtype or torch.float32 + + accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys()) + + unexpected_keys = [] + empty_state_dict = model.state_dict() + for param_name, param in state_dict.items(): + if param_name not in empty_state_dict: + unexpected_keys.append(param_name) + continue + + if empty_state_dict[param_name].shape != param.shape: + model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else "" + raise ValueError( + f"Cannot load {model_name_or_path_str}because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example." + ) + + if accepts_dtype: + set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype) + else: + set_module_tensor_to_device(model, param_name, device, value=param) + return unexpected_keys + + +def _load_state_dict_into_model(model_to_load, state_dict: OrderedDict) -> List[str]: + # Convert old format to new format if needed from a PyTorch state_dict + # copy state_dict so _load_from_state_dict can modify it + state_dict = state_dict.copy() + error_msgs = [] + + # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants + # so we need to apply the function recursively. 
+ def load(module: torch.nn.Module, prefix: str = ""): + args = (state_dict, prefix, {}, True, [], [], error_msgs) + module._load_from_state_dict(*args) + + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + load(model_to_load) + + return error_msgs diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 373f5453aa23..2ed5655c84a9 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -33,7 +33,6 @@ from ..utils import ( CONFIG_NAME, FLAX_WEIGHTS_NAME, - SAFETENSORS_FILE_EXTENSION, SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, _add_variant, @@ -44,6 +43,12 @@ logging, ) from ..utils.hub_utils import PushToHubMixin, load_or_create_model_card, populate_model_card +from .model_loading_utils import ( + _determine_device_map, + _load_state_dict_into_model, + load_model_dict_into_meta, + load_state_dict, +) logger = logging.get_logger(__name__) @@ -57,9 +62,6 @@ if is_accelerate_available(): import accelerate - from accelerate import infer_auto_device_map - from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device - from accelerate.utils.versions import is_torch_version def get_parameter_device(parameter: torch.nn.Module) -> torch.device: @@ -100,117 +102,6 @@ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]: return first_tuple[1].dtype -# Adapted from `transformers` (see modeling_utils.py) -def _determine_device_map(model: "ModelMixin", device_map, max_memory, torch_dtype): - if isinstance(device_map, str): - no_split_modules = model._get_no_split_modules(device_map) - device_map_kwargs = {"no_split_module_classes": no_split_modules} - - if device_map != "sequential": - max_memory = get_balanced_memory( - model, - dtype=torch_dtype, - low_zero=(device_map == "balanced_low_0"), - max_memory=max_memory, - **device_map_kwargs, - ) - else: - max_memory = get_max_memory(max_memory) - - device_map_kwargs["max_memory"] = max_memory - device_map = infer_auto_device_map(model, dtype=torch_dtype, **device_map_kwargs) - - return device_map - - -def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None): - """ - Reads a checkpoint file, returning properly formatted errors if they arise. - """ - try: - file_extension = os.path.basename(checkpoint_file).split(".")[-1] - if file_extension == SAFETENSORS_FILE_EXTENSION: - return safetensors.torch.load_file(checkpoint_file, device="cpu") - else: - weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {} - return torch.load( - checkpoint_file, - map_location="cpu", - **weights_only_kwarg, - ) - except Exception as e: - try: - with open(checkpoint_file) as f: - if f.read().startswith("version"): - raise OSError( - "You seem to have cloned a repository without having git-lfs installed. Please install " - "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " - "you cloned." - ) - else: - raise ValueError( - f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained " - "model. Make sure you have saved the model properly." - ) from e - except (UnicodeDecodeError, ValueError): - raise OSError( - f"Unable to load weights from checkpoint file for '{checkpoint_file}' " f"at '{checkpoint_file}'. 
" - ) - - -def load_model_dict_into_meta( - model, - state_dict: OrderedDict, - device: Optional[Union[str, torch.device]] = None, - dtype: Optional[Union[str, torch.dtype]] = None, - model_name_or_path: Optional[str] = None, -) -> List[str]: - device = device or torch.device("cpu") - dtype = dtype or torch.float32 - - accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys()) - - unexpected_keys = [] - empty_state_dict = model.state_dict() - for param_name, param in state_dict.items(): - if param_name not in empty_state_dict: - unexpected_keys.append(param_name) - continue - - if empty_state_dict[param_name].shape != param.shape: - model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else "" - raise ValueError( - f"Cannot load {model_name_or_path_str}because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example." - ) - - if accepts_dtype: - set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype) - else: - set_module_tensor_to_device(model, param_name, device, value=param) - return unexpected_keys - - -def _load_state_dict_into_model(model_to_load, state_dict: OrderedDict) -> List[str]: - # Convert old format to new format if needed from a PyTorch state_dict - # copy state_dict so _load_from_state_dict can modify it - state_dict = state_dict.copy() - error_msgs = [] - - # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants - # so we need to apply the function recursively. - def load(module: torch.nn.Module, prefix: str = ""): - args = (state_dict, prefix, {}, True, [], [], error_msgs) - module._load_from_state_dict(*args) - - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + ".") - - load(model_to_load) - - return error_msgs - - class ModelMixin(torch.nn.Module, PushToHubMixin): r""" Base class for all models. @@ -963,6 +854,15 @@ def _find_mismatched_keys( return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs + @classmethod + def _get_signature_keys(cls, obj): + parameters = inspect.signature(obj.__init__).parameters + required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} + optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) + expected_modules = set(required_parameters.keys()) - {"self"} + + return expected_modules, optional_parameters + # Adapted from `transformers` modeling_utils.py def _get_no_split_modules(self, device_map: str): """ diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index adda53a11481..00b55cd9c9d6 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -58,7 +58,7 @@ class ResnetBlockCondNorm2D(nn.Module): non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use. time_embedding_norm (`str`, *optional*, default to `"ada_group"` ): The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial". 
- kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see + kernel (`torch.Tensor`, optional, default to None): FIR filter, see [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`]. output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output. use_in_shortcut (`bool`, *optional*, default to `True`): @@ -146,7 +146,7 @@ def __init__( bias=conv_shortcut_bias, ) - def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -204,7 +204,7 @@ class ResnetBlock2D(nn.Module): time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config. By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a stronger conditioning with scale and shift. - kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see + kernel (`torch.Tensor`, optional, default to None): FIR filter, see [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`]. output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output. use_in_shortcut (`bool`, *optional*, default to `True`): @@ -232,7 +232,7 @@ def __init__( non_linearity: str = "swish", skip_time_act: bool = False, time_embedding_norm: str = "default", # default, scale_shift, - kernel: Optional[torch.FloatTensor] = None, + kernel: Optional[torch.Tensor] = None, output_scale_factor: float = 1.0, use_in_shortcut: Optional[bool] = None, up: bool = False, @@ -317,7 +317,7 @@ def __init__( bias=conv_shortcut_bias, ) - def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
deprecate("scale", "1.0.0", deprecation_message) @@ -605,7 +605,7 @@ def __init__( padding=0, ) - def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor) -> torch.Tensor: hidden_states = input_tensor hidden_states = self.norm1(hidden_states) @@ -685,8 +685,8 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, ): num_frames = image_only_indicator.shape[-1] diff --git a/src/diffusers/models/transformers/dual_transformer_2d.py b/src/diffusers/models/transformers/dual_transformer_2d.py index e2f1b8538ca0..edc8cbf78382 100644 --- a/src/diffusers/models/transformers/dual_transformer_2d.py +++ b/src/diffusers/models/transformers/dual_transformer_2d.py @@ -106,14 +106,13 @@ def forward( """ Args: hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. - When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input - hidden_states. + When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states. encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. timestep ( `torch.long`, *optional*): Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Optional attention mask to be applied in Attention. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under diff --git a/src/diffusers/models/transformers/prior_transformer.py b/src/diffusers/models/transformers/prior_transformer.py index 990eabe2c37a..8dbcfc64e09c 100644 --- a/src/diffusers/models/transformers/prior_transformer.py +++ b/src/diffusers/models/transformers/prior_transformer.py @@ -26,11 +26,11 @@ class PriorTransformerOutput(BaseOutput): The output of [`PriorTransformer`]. Args: - predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): + predicted_image_embedding (`torch.Tensor` of shape `(batch_size, embedding_dim)`): The predicted CLIP image embedding conditioned on the CLIP text embedding input. """ - predicted_image_embedding: torch.FloatTensor + predicted_image_embedding: torch.Tensor class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin): @@ -246,8 +246,8 @@ def forward( self, hidden_states, timestep: Union[torch.Tensor, float, int], - proj_embedding: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + proj_embedding: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, attention_mask: Optional[torch.BoolTensor] = None, return_dict: bool = True, ): @@ -255,13 +255,13 @@ def forward( The [`PriorTransformer`] forward method. Args: - hidden_states (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): + hidden_states (`torch.Tensor` of shape `(batch_size, embedding_dim)`): The currently predicted image embeddings. timestep (`torch.LongTensor`): Current denoising step. 
- proj_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): + proj_embedding (`torch.Tensor` of shape `(batch_size, embedding_dim)`): Projected embedding vector the denoising process is conditioned on. - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_embeddings, embedding_dim)`): + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, num_embeddings, embedding_dim)`): Hidden states of the text embeddings the denoising process is conditioned on. attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`): Text mask for the text embeddings. diff --git a/src/diffusers/models/transformers/t5_film_transformer.py b/src/diffusers/models/transformers/t5_film_transformer.py index bff98db021cf..1dea37a25910 100644 --- a/src/diffusers/models/transformers/t5_film_transformer.py +++ b/src/diffusers/models/transformers/t5_film_transformer.py @@ -86,7 +86,7 @@ def __init__( self.post_dropout = nn.Dropout(p=dropout_rate) self.spec_out = nn.Linear(d_model, input_dims, bias=False) - def encoder_decoder_mask(self, query_input: torch.FloatTensor, key_input: torch.FloatTensor) -> torch.FloatTensor: + def encoder_decoder_mask(self, query_input: torch.Tensor, key_input: torch.Tensor) -> torch.Tensor: mask = torch.mul(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) return mask.unsqueeze(-3) @@ -195,13 +195,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - conditioning_emb: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + conditioning_emb: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, encoder_decoder_position_bias=None, - ) -> Tuple[torch.FloatTensor]: + ) -> Tuple[torch.Tensor]: hidden_states = self.layer[0]( hidden_states, conditioning_emb=conditioning_emb, @@ -249,10 +249,10 @@ def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float) def forward( self, - hidden_states: torch.FloatTensor, - conditioning_emb: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + conditioning_emb: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: # pre_self_attention_layer_norm normed_hidden_states = self.layer_norm(hidden_states) @@ -292,10 +292,10 @@ def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float, def forward( self, - hidden_states: torch.FloatTensor, - key_value_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.attention( normed_hidden_states, @@ -328,9 +328,7 @@ def __init__(self, d_model: int, d_ff: int, dropout_rate: float, layer_norm_epsi self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) - def forward( - self, hidden_states: torch.FloatTensor, conditioning_emb: Optional[torch.FloatTensor] = None - ) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, conditioning_emb: Optional[torch.Tensor] = None) -> torch.Tensor: forwarded_states = 
self.layer_norm(hidden_states) if conditioning_emb is not None: forwarded_states = self.film(forwarded_states, conditioning_emb) @@ -361,7 +359,7 @@ def __init__(self, d_model: int, d_ff: int, dropout_rate: float): self.dropout = nn.Dropout(dropout_rate) self.act = NewGELUActivation() - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_gelu = self.act(self.wi_0(hidden_states)) hidden_linear = self.wi_1(hidden_states) hidden_states = hidden_gelu * hidden_linear @@ -390,7 +388,7 @@ def __init__(self, hidden_size: int, eps: float = 1e-6): self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for @@ -431,7 +429,7 @@ def __init__(self, in_features: int, out_features: int): super().__init__() self.scale_bias = nn.Linear(in_features, out_features * 2, bias=False) - def forward(self, x: torch.FloatTensor, conditioning_emb: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor, conditioning_emb: torch.Tensor) -> torch.Tensor: emb = self.scale_bias(conditioning_emb) scale, shift = torch.chunk(emb, 2, -1) x = x * (1 + scale) + shift diff --git a/src/diffusers/models/transformers/transformer_2d.py b/src/diffusers/models/transformers/transformer_2d.py index 6a2695b9e436..ef9e0de0b662 100644 --- a/src/diffusers/models/transformers/transformer_2d.py +++ b/src/diffusers/models/transformers/transformer_2d.py @@ -35,12 +35,12 @@ class Transformer2DModelOutput(BaseOutput): The output of [`Transformer2DModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete): + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete): The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability distributions for the unnoised latent pixels. """ - sample: torch.FloatTensor + sample: torch.Tensor class Transformer2DModel(ModelMixin, ConfigMixin): @@ -346,9 +346,9 @@ def forward( The [`Transformer2DModel`] forward method. Args: - hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous): + hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous): Input `hidden_states`. - encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*): + encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. 
timestep ( `torch.LongTensor`, *optional*): diff --git a/src/diffusers/models/transformers/transformer_temporal.py b/src/diffusers/models/transformers/transformer_temporal.py index c2d490f3d046..2e1bb041a207 100644 --- a/src/diffusers/models/transformers/transformer_temporal.py +++ b/src/diffusers/models/transformers/transformer_temporal.py @@ -31,11 +31,11 @@ class TransformerTemporalModelOutput(BaseOutput): The output of [`TransformerTemporalModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size x num_frames, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size x num_frames, num_channels, height, width)`): The hidden states output conditioned on `encoder_hidden_states` input. """ - sample: torch.FloatTensor + sample: torch.Tensor class TransformerTemporalModel(ModelMixin, ConfigMixin): @@ -120,7 +120,7 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, + hidden_states: torch.Tensor, encoder_hidden_states: Optional[torch.LongTensor] = None, timestep: Optional[torch.LongTensor] = None, class_labels: torch.LongTensor = None, @@ -132,7 +132,7 @@ def forward( The [`TransformerTemporal`] forward method. Args: - hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous): + hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous): Input hidden_states. encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to @@ -283,7 +283,7 @@ def forward( ): """ Args: - hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): + hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states. num_frames (`int`): The number of frames to be processed per batch. This is used to reshape the hidden states. diff --git a/src/diffusers/models/unets/unet_1d.py b/src/diffusers/models/unets/unet_1d.py index 59d70f67c959..d1538cdc61d9 100644 --- a/src/diffusers/models/unets/unet_1d.py +++ b/src/diffusers/models/unets/unet_1d.py @@ -31,11 +31,11 @@ class UNet1DOutput(BaseOutput): The output of [`UNet1DModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, sample_size)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, sample_size)`): The hidden states output from the last layer of the model. """ - sample: torch.FloatTensor + sample: torch.Tensor class UNet1DModel(ModelMixin, ConfigMixin): @@ -194,7 +194,7 @@ def __init__( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], return_dict: bool = True, ) -> Union[UNet1DOutput, Tuple]: @@ -202,9 +202,9 @@ def forward( The [`UNet1DModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple. 
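Note (not part of the patch): the `t5_film_transformer.py` hunk above only retypes the FiLM layer's signature. For orientation, a self-contained sketch of the same scale/shift modulation pattern; the `ToyFiLM` name is hypothetical, but the body mirrors the `scale_bias` / `torch.chunk` logic shown in that hunk:

    import torch
    from torch import nn

    class ToyFiLM(nn.Module):
        # One linear layer produces per-feature scale and shift from the conditioning embedding.
        def __init__(self, in_features: int, out_features: int):
            super().__init__()
            self.scale_bias = nn.Linear(in_features, out_features * 2, bias=False)

        def forward(self, x: torch.Tensor, conditioning_emb: torch.Tensor) -> torch.Tensor:
            emb = self.scale_bias(conditioning_emb)
            scale, shift = torch.chunk(emb, 2, dim=-1)
            return x * (1 + scale) + shift

    film = ToyFiLM(32, 64)
    print(film(torch.randn(2, 64), torch.randn(2, 32)).shape)  # torch.Size([2, 64])
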
diff --git a/src/diffusers/models/unets/unet_1d_blocks.py b/src/diffusers/models/unets/unet_1d_blocks.py index e3163cd1d5b3..3c7c1cbecee9 100644 --- a/src/diffusers/models/unets/unet_1d_blocks.py +++ b/src/diffusers/models/unets/unet_1d_blocks.py @@ -66,7 +66,7 @@ def __init__( if add_downsample: self.downsample = Downsample1D(out_channels, use_conv=True, padding=1) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: output_states = () hidden_states = self.resnets[0](hidden_states, temb) @@ -128,10 +128,10 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Optional[Tuple[torch.FloatTensor, ...]] = None, - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + res_hidden_states_tuple: Optional[Tuple[torch.Tensor, ...]] = None, + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if res_hidden_states_tuple is not None: res_hidden_states = res_hidden_states_tuple[-1] hidden_states = torch.cat((hidden_states, res_hidden_states), dim=1) @@ -161,7 +161,7 @@ def __init__(self, in_channels: int, out_channels: int, embed_dim: int): self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim) self.down2 = Downsample1D(out_channels // 4, use_conv=True) - def forward(self, x: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, x: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: x = self.res1(x, temb) x = self.down1(x) x = self.res2(x, temb) @@ -209,7 +209,7 @@ def __init__( if self.upsample and self.downsample: raise ValueError("Block cannot downsample and upsample") - def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) for resnet in self.resnets[1:]: hidden_states = resnet(hidden_states, temb) @@ -230,7 +230,7 @@ def __init__(self, num_groups_out: int, out_channels: int, embed_dim: int, act_f self.final_conv1d_act = get_activation(act_fn) self.final_conv1d_2 = nn.Conv1d(embed_dim, out_channels, 1) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.final_conv1d_1(hidden_states) hidden_states = rearrange_dims(hidden_states) hidden_states = self.final_conv1d_gn(hidden_states) @@ -251,7 +251,7 @@ def __init__(self, fc_dim: int, embed_dim: int, act_fn: str = "mish"): ] ) - def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view(hidden_states.shape[0], -1) hidden_states = torch.cat((hidden_states, temb), dim=-1) for layer in self.final_block: @@ -288,7 +288,7 @@ def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"): self.pad = kernel_1d.shape[0] // 2 - 1 self.register_buffer("kernel", kernel_1d) - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = F.pad(hidden_states, (self.pad,) * 2, 
self.pad_mode) weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]]) indices = torch.arange(hidden_states.shape[1], device=hidden_states.device) @@ -305,7 +305,7 @@ def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"): self.pad = kernel_1d.shape[0] // 2 - 1 self.register_buffer("kernel", kernel_1d) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode) weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]]) indices = torch.arange(hidden_states.shape[1], device=hidden_states.device) @@ -335,7 +335,7 @@ def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor: new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3) return new_projection - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = hidden_states batch, channel_dim, seq = hidden_states.shape @@ -390,7 +390,7 @@ def __init__(self, in_channels: int, mid_channels: int, out_channels: int, is_la self.group_norm_2 = nn.GroupNorm(1, out_channels) self.gelu_2 = nn.GELU() - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states hidden_states = self.conv_1(hidden_states) @@ -435,7 +435,7 @@ def __init__(self, mid_channels: int, in_channels: int, out_channels: Optional[i self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.down(hidden_states) for attn, resnet in zip(self.attentions, self.resnets): hidden_states = resnet(hidden_states) @@ -466,7 +466,7 @@ def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[i self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.down(hidden_states) for resnet, attn in zip(self.resnets, self.attentions): @@ -490,7 +490,7 @@ def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[i self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.down(hidden_states) for resnet in self.resnets: @@ -512,7 +512,7 @@ def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[i self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = 
torch.cat([hidden_states, temb], dim=1) for resnet in self.resnets: hidden_states = resnet(hidden_states) @@ -542,10 +542,10 @@ def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[i def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: res_hidden_states = res_hidden_states_tuple[-1] hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) @@ -574,10 +574,10 @@ def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[i def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: res_hidden_states = res_hidden_states_tuple[-1] hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) @@ -604,10 +604,10 @@ def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[i def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: res_hidden_states = res_hidden_states_tuple[-1] hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) diff --git a/src/diffusers/models/unets/unet_2d.py b/src/diffusers/models/unets/unet_2d.py index 5efb63822205..0f36afe3f931 100644 --- a/src/diffusers/models/unets/unet_2d.py +++ b/src/diffusers/models/unets/unet_2d.py @@ -30,11 +30,11 @@ class UNet2DOutput(BaseOutput): The output of [`UNet2DModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): The hidden states output from the last layer of the model. """ - sample: torch.FloatTensor + sample: torch.Tensor class UNet2DModel(ModelMixin, ConfigMixin): @@ -242,7 +242,7 @@ def __init__( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], class_labels: Optional[torch.Tensor] = None, return_dict: bool = True, @@ -251,10 +251,10 @@ def forward( The [`UNet2DModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - class_labels (`torch.FloatTensor`, *optional*, defaults to `None`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple. 
diff --git a/src/diffusers/models/unets/unet_2d_blocks.py b/src/diffusers/models/unets/unet_2d_blocks.py index ef75fad25e44..93a0a82cdcff 100644 --- a/src/diffusers/models/unets/unet_2d_blocks.py +++ b/src/diffusers/models/unets/unet_2d_blocks.py @@ -561,7 +561,7 @@ class AutoencoderTinyBlock(nn.Module): ` The activation function to use. Supported values are `"swish"`, `"mish"`, `"gelu"`, and `"relu"`. Returns: - `torch.FloatTensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to + `torch.Tensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to `out_channels`. """ @@ -582,7 +582,7 @@ def __init__(self, in_channels: int, out_channels: int, act_fn: str): ) self.fuse = nn.ReLU() - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: return self.fuse(self.conv(x) + self.skip(x)) @@ -612,8 +612,8 @@ class UNetMidBlock2D(nn.Module): output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor. Returns: - `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size, - in_channels, height, width)`. + `torch.Tensor`: The output of the last residual block, which is a tensor of shape `(batch_size, in_channels, + height, width)`. """ @@ -731,7 +731,7 @@ def __init__( self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: @@ -846,13 +846,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -986,13 +986,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. 
`scale` will be ignored.") @@ -1118,11 +1118,11 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1240,14 +1240,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - additional_residuals: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + encoder_attention_mask: Optional[torch.Tensor] = None, + additional_residuals: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1362,8 +1362,8 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None, *args, **kwargs + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1465,7 +1465,7 @@ def __init__( else: self.downsamplers = None - def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1567,7 +1567,7 @@ def __init__( else: self.downsamplers = None - def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. 
Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1666,12 +1666,12 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - skip_sample: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + skip_sample: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...], torch.Tensor]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1757,12 +1757,12 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - skip_sample: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + skip_sample: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...], torch.Tensor]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1850,8 +1850,8 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None, *args, **kwargs + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
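The deprecation messages repeated throughout these block hunks all describe the same migration: `scale` is no longer read by the block `forward` methods and should instead be supplied once at the pipeline call through `cross_attention_kwargs`, where the attention processors apply it as the LoRA scale. A hedged sketch of the recommended pattern, assuming a Stable Diffusion pipeline with LoRA weights already loaded (the LoRA repository id is a placeholder):

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.load_lora_weights("some-user/some-lora")  # placeholder LoRA repo

# Recommended: pass `scale` via cross_attention_kwargs at the pipeline call.
# Passing it down to the UNet blocks' forward() is deprecated and now ignored.
image = pipe(
    "a photo of an astronaut riding a horse",
    cross_attention_kwargs={"scale": 0.7},  # LoRA scale applied by the attention processors
).images[0]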
deprecate("scale", "1.0.0", deprecation_message) @@ -1986,13 +1986,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2097,8 +2097,8 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None, *args, **kwargs + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -2201,13 +2201,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2358,13 +2358,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
deprecate("scale", "1.0.0", deprecation_message) @@ -2481,15 +2481,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2616,13 +2616,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -2741,7 +2741,7 @@ def __init__( self.resolution_idx = resolution_idx - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: for resnet in self.resnets: hidden_states = resnet(hidden_states, temb=temb) @@ -2839,7 +2839,7 @@ def __init__( self.resolution_idx = resolution_idx - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: for resnet, attn in zip(self.resnets, self.attentions): hidden_states = resnet(hidden_states, temb=temb) hidden_states = attn(hidden_states, temb=temb) @@ -2947,13 +2947,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, skip_sample=None, *args, **kwargs, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
deprecate("scale", "1.0.0", deprecation_message) @@ -3059,13 +3059,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, skip_sample=None, *args, **kwargs, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -3166,13 +3166,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -3310,15 +3310,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -3428,13 +3428,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
deprecate("scale", "1.0.0", deprecation_message) @@ -3558,15 +3558,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: res_hidden_states_tuple = res_hidden_states_tuple[-1] if res_hidden_states_tuple is not None: hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1) @@ -3684,23 +3684,23 @@ def __init__( cross_attention_norm=cross_attention_norm, ) - def _to_3d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor: + def _to_3d(self, hidden_states: torch.Tensor, height: int, weight: int) -> torch.Tensor: return hidden_states.permute(0, 2, 3, 1).reshape(hidden_states.shape[0], height * weight, -1) - def _to_4d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor: + def _to_4d(self, hidden_states: torch.Tensor, height: int, weight: int) -> torch.Tensor: return hidden_states.permute(0, 2, 1).reshape(hidden_states.shape[0], -1, height, weight) def forward( self, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, # TODO: mark emb as non-optional (self.norm2 requires it). # requires assessing impact of change to positional param interface. - emb: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + emb: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py index 697730b359ff..ad45a43b5023 100644 --- a/src/diffusers/models/unets/unet_2d_condition.py +++ b/src/diffusers/models/unets/unet_2d_condition.py @@ -20,6 +20,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin +from ...loaders.single_file_model import FromOriginalModelMixin from ...utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers from ..activations import get_activation from ..attention_processor import ( @@ -59,14 +60,16 @@ class UNet2DConditionOutput(BaseOutput): The output of [`UNet2DConditionModel`]. 
Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ - sample: torch.FloatTensor = None + sample: torch.Tensor = None -class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin): +class UNet2DConditionModel( + ModelMixin, ConfigMixin, FromOriginalModelMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin +): r""" A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample shaped output. @@ -682,7 +685,7 @@ def _set_pos_net_if_use_gligen(self, attention_type: str, cross_attention_dim: i positive_len = 768 if isinstance(cross_attention_dim, int): positive_len = cross_attention_dim - elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list): + elif isinstance(cross_attention_dim, (list, tuple)): positive_len = cross_attention_dim[0] feature_type = "text-only" if attention_type == "gated" else "text-image" @@ -1039,7 +1042,7 @@ def process_encoder_hidden_states( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -1057,10 +1060,10 @@ def forward( The [`UNet2DConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. 
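The new `FromOriginalModelMixin` base added to `UNet2DConditionModel` above is what exposes single-file checkpoint loading on the model class itself. A minimal sketch of the intended usage, assuming the mixin provides `from_single_file` as it does for other diffusers models (the checkpoint URL is a placeholder, not from the patch):

import torch
from diffusers import UNet2DConditionModel

# Hypothetical single-file checkpoint in the original Stable Diffusion layout.
unet = UNet2DConditionModel.from_single_file(
    "https://example.com/path/to/model.safetensors",
    torch_dtype=torch.float16,
)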
diff --git a/src/diffusers/models/unets/unet_3d_blocks.py b/src/diffusers/models/unets/unet_3d_blocks.py index 75827258f64c..35a732bdb9ec 100644 --- a/src/diffusers/models/unets/unet_3d_blocks.py +++ b/src/diffusers/models/unets/unet_3d_blocks.py @@ -411,13 +411,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames) for attn, temp_attn, resnet, temp_conv in zip( @@ -544,13 +544,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, cross_attention_kwargs: Dict[str, Any] = None, - ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: # TODO(Patrick, William) - attention mask is not used output_states = () @@ -651,10 +651,10 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, num_frames: int = 1, - ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () for resnet, temp_conv in zip(self.resnets, self.temp_convs): @@ -769,15 +769,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, cross_attention_kwargs: Dict[str, Any] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: is_freeu_enabled = ( getattr(self, "s1", None) and getattr(self, "s2", None) @@ -891,12 +891,12 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: is_freeu_enabled = ( getattr(self, "s1", None) and getattr(self, "s2", None) @@ -1008,12 +1008,12 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, 
num_frames: int = 1, *args, **kwargs, - ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1174,14 +1174,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - additional_residuals: Optional[torch.FloatTensor] = None, + additional_residuals: Optional[torch.Tensor] = None, ): if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: @@ -1357,16 +1357,16 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1518,14 +1518,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size=None, num_frames: int = 1, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
deprecate("scale", "1.0.0", deprecation_message) @@ -1699,14 +1699,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1811,8 +1811,8 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - image_only_indicator: torch.FloatTensor, + hidden_states: torch.Tensor, + image_only_indicator: torch.Tensor, ): hidden_states = self.resnets[0]( hidden_states, @@ -1862,9 +1862,9 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - image_only_indicator: torch.FloatTensor, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + image_only_indicator: torch.Tensor, + ) -> torch.Tensor: for resnet in self.resnets: hidden_states = resnet( hidden_states, @@ -1935,11 +1935,11 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: hidden_states = self.resnets[0]( hidden_states, temb, @@ -2031,10 +2031,10 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () for resnet in self.resnets: if self.training and self.gradient_checkpointing: @@ -2141,11 +2141,11 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () blocks = list(zip(self.resnets, self.attentions)) @@ -2240,11 +2240,11 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] @@ -2349,12 +2349,12 @@ def 
__init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: for resnet, attn in zip(self.resnets, self.attentions): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py index 6c353c425911..b4879fe9639c 100644 --- a/src/diffusers/models/unets/unet_3d_condition.py +++ b/src/diffusers/models/unets/unet_3d_condition.py @@ -55,11 +55,11 @@ class UNet3DConditionOutput(BaseOutput): The output of [`UNet3DConditionModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ - sample: torch.FloatTensor + sample: torch.Tensor class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): @@ -560,7 +560,7 @@ def unload_lora(self): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -570,15 +570,15 @@ def forward( down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, mid_block_additional_residual: Optional[torch.Tensor] = None, return_dict: bool = True, - ) -> Union[UNet3DConditionOutput, Tuple[torch.FloatTensor]]: + ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]: r""" The [`UNet3DConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_channels, num_frames, height, width`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. 
diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index 0a5f71ed0029..dbfb4f80259d 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -81,8 +81,8 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + ) -> torch.Tensor: norm_hidden_states = self.norm1(hidden_states) attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None) hidden_states = attn_output + hidden_states @@ -514,7 +514,7 @@ def unfuse_qkv_projections(self): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], fps: torch.Tensor, image_latents: torch.Tensor, @@ -523,19 +523,19 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, - ) -> Union[UNet3DConditionOutput, Tuple[torch.FloatTensor]]: + ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]: r""" The [`I2VGenXLUNet`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition". - image_latents (`torch.FloatTensor`): Image encodings from the VAE. - image_embeddings (`torch.FloatTensor`): + image_latents (`torch.Tensor`): Image encodings from the VAE. + image_embeddings (`torch.Tensor`): Projection embeddings of the conditioning image computed with a vision encoder. - encoder_hidden_states (`torch.FloatTensor`): + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. 
cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under diff --git a/src/diffusers/models/unets/unet_kandinsky3.py b/src/diffusers/models/unets/unet_kandinsky3.py index b981c8e17eb9..ff8ce25fd223 100644 --- a/src/diffusers/models/unets/unet_kandinsky3.py +++ b/src/diffusers/models/unets/unet_kandinsky3.py @@ -31,7 +31,7 @@ @dataclass class Kandinsky3UNetOutput(BaseOutput): - sample: torch.FloatTensor = None + sample: torch.Tensor = None class Kandinsky3EncoderProj(nn.Module): diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index 81cc4b1f7ad4..1b62d16d5d77 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -15,6 +15,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F import torch.utils.checkpoint from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config @@ -27,6 +28,9 @@ AttentionProcessor, AttnAddedKVProcessor, AttnProcessor, + AttnProcessor2_0, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, ) from ..embeddings import TimestepEmbedding, Timesteps from ..modeling_utils import ModelMixin @@ -490,6 +494,36 @@ def from_unet2d( model.time_proj.load_state_dict(unet.time_proj.state_dict()) model.time_embedding.load_state_dict(unet.time_embedding.state_dict()) + if any( + isinstance(proc, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)) + for proc in unet.attn_processors.values() + ): + attn_procs = {} + for name, processor in unet.attn_processors.items(): + if name.endswith("attn1.processor"): + attn_processor_class = ( + AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor + ) + attn_procs[name] = attn_processor_class() + else: + attn_processor_class = ( + IPAdapterAttnProcessor2_0 + if hasattr(F, "scaled_dot_product_attention") + else IPAdapterAttnProcessor + ) + attn_procs[name] = attn_processor_class( + hidden_size=processor.hidden_size, + cross_attention_dim=processor.cross_attention_dim, + scale=processor.scale, + num_tokens=processor.num_tokens, + ) + for name, processor in model.attn_processors.items(): + if name not in attn_procs: + attn_procs[name] = processor.__class__() + model.set_attn_processor(attn_procs) + model.config.encoder_hid_dim_type = "ip_image_proj" + model.encoder_hid_proj = unet.encoder_hid_proj + for i, down_block in enumerate(unet.down_blocks): model.down_blocks[i].resnets.load_state_dict(down_block.resnets.state_dict()) if hasattr(model.down_blocks[i], "attentions"): @@ -786,7 +820,7 @@ def unfuse_qkv_projections(self): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, timestep_cond: Optional[torch.Tensor] = None, @@ -801,10 +835,10 @@ def forward( The [`UNetMotionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. 
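The `from_unet2d` change above (copying IP-Adapter attention processors and `encoder_hid_proj` from the source UNet) means a 2D UNet that already has an image-prompt adapter attached can be converted into a motion UNet without losing it. A hedged sketch of the flow; the model id is illustrative and the IP-Adapter is assumed to have been attached beforehand:

import torch
from diffusers import UNet2DConditionModel, UNetMotionModel

unet2d = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16
)
# Assume IP-Adapter processors were attached to `unet2d` beforehand (e.g. via a
# pipeline's load_ip_adapter). With this patch, from_unet2d carries them over
# instead of re-initializing plain attention processors.
motion_unet = UNetMotionModel.from_unet2d(unet2d)
print(motion_unet.config.encoder_hid_dim_type)  # "ip_image_proj" when adapters were present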
timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed diff --git a/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/src/diffusers/models/unets/unet_spatio_temporal_condition.py index 0f89df8c6bff..5613e3618d02 100644 --- a/src/diffusers/models/unets/unet_spatio_temporal_condition.py +++ b/src/diffusers/models/unets/unet_spatio_temporal_condition.py @@ -22,11 +22,11 @@ class UNetSpatioTemporalConditionOutput(BaseOutput): The output of [`UNetSpatioTemporalConditionModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_frames, num_channels, height, width)`): The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ - sample: torch.FloatTensor = None + sample: torch.Tensor = None class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): @@ -356,7 +356,7 @@ def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, added_time_ids: torch.Tensor, @@ -366,12 +366,12 @@ def forward( The [`UNetSpatioTemporalConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`. - added_time_ids: (`torch.FloatTensor`): + added_time_ids: (`torch.Tensor`): The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal embeddings and added to the time embeddings. 
return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/unets/unet_stable_cascade.py b/src/diffusers/models/unets/unet_stable_cascade.py index ff76415ecf0e..75a3dbc8edee 100644 --- a/src/diffusers/models/unets/unet_stable_cascade.py +++ b/src/diffusers/models/unets/unet_stable_cascade.py @@ -21,7 +21,7 @@ import torch.nn as nn from ...configuration_utils import ConfigMixin, register_to_config -from ...loaders.unet import FromOriginalUNetMixin +from ...loaders import FromOriginalModelMixin from ...utils import BaseOutput from ..attention_processor import Attention from ..modeling_utils import ModelMixin @@ -131,10 +131,10 @@ def forward(self, x): @dataclass class StableCascadeUNetOutput(BaseOutput): - sample: torch.FloatTensor = None + sample: torch.Tensor = None -class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalUNetMixin): +class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalModelMixin): _supports_gradient_checkpointing = True @register_to_config diff --git a/src/diffusers/models/upsampling.py b/src/diffusers/models/upsampling.py index af6e15db308b..572844d2de0a 100644 --- a/src/diffusers/models/upsampling.py +++ b/src/diffusers/models/upsampling.py @@ -138,9 +138,7 @@ def __init__( else: self.Conv2d_0 = conv - def forward( - self, hidden_states: torch.FloatTensor, output_size: Optional[int] = None, *args, **kwargs - ) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -217,12 +215,12 @@ def __init__( def _upsample_2d( self, - hidden_states: torch.FloatTensor, - weight: Optional[torch.FloatTensor] = None, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + weight: Optional[torch.Tensor] = None, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """Fused `upsample_2d()` followed by `Conv2d()`. Padding is performed only once at the beginning, not between the operations. The fused op is considerably more @@ -230,19 +228,19 @@ def _upsample_2d( arbitrary order. Args: - hidden_states (`torch.FloatTensor`): + hidden_states (`torch.Tensor`): Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - weight (`torch.FloatTensor`, *optional*): + weight (`torch.Tensor`, *optional*): Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling. factor (`int`, *optional*): Integer upsampling factor (default: 2). gain (`float`, *optional*): Scaling factor for signal magnitude (default: 1.0). Returns: - output (`torch.FloatTensor`): + output (`torch.Tensor`): Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same datatype as `hidden_states`. 
""" @@ -310,7 +308,7 @@ def _upsample_2d( return output - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if self.use_conv: height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel) height = height + self.Conv2d_0.bias.reshape(1, -1, 1, 1) @@ -401,11 +399,11 @@ def upfirdn2d_native( def upsample_2d( - hidden_states: torch.FloatTensor, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, -) -> torch.FloatTensor: +) -> torch.Tensor: r"""Upsample2D a batch of 2D images with the given filter. Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified @@ -413,9 +411,9 @@ def upsample_2d( a: multiple of the upsampling factor. Args: - hidden_states (`torch.FloatTensor`): + hidden_states (`torch.Tensor`): Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling. factor (`int`, *optional*, default to `2`): @@ -424,7 +422,7 @@ def upsample_2d( Scaling factor for signal magnitude (default: 1.0). Returns: - output (`torch.FloatTensor`): + output (`torch.Tensor`): Tensor of the shape `[N, C, H * factor, W * factor]` """ assert isinstance(factor, int) and factor >= 1 diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index e5184446cee7..cb32b1f40734 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -30,11 +30,11 @@ class VQEncoderOutput(BaseOutput): Output of VQModel encoding method. Args: - latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): The encoded output sample from the last layer of the model. 
""" - latents: torch.FloatTensor + latents: torch.Tensor class VQModel(ModelMixin, ConfigMixin): @@ -127,7 +127,7 @@ def __init__( ) @apply_forward_hook - def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput: h = self.encoder(x) h = self.quant_conv(h) @@ -138,31 +138,33 @@ def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOut @apply_forward_hook def decode( - self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None - ) -> Union[DecoderOutput, torch.FloatTensor]: + self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None + ) -> Union[DecoderOutput, torch.Tensor]: # also go through quantization layer if not force_not_quantize: - quant, _, _ = self.quantize(h) + quant, commit_loss, _ = self.quantize(h) elif self.config.lookup_from_codebook: quant = self.quantize.get_codebook_entry(h, shape) + commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype) else: quant = h + commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype) quant2 = self.post_quant_conv(quant) dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None) if not return_dict: - return (dec,) + return dec, commit_loss - return DecoderOutput(sample=dec) + return DecoderOutput(sample=dec, commit_loss=commit_loss) def forward( - self, sample: torch.FloatTensor, return_dict: bool = True - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor, ...]]: + self, sample: torch.Tensor, return_dict: bool = True + ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]: r""" The [`VQModel`] forward method. Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple. @@ -173,9 +175,8 @@ def forward( """ h = self.encode(sample).latents - dec = self.decode(h).sample + dec = self.decode(h) if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) + return dec.sample, dec.commit_loss + return dec diff --git a/src/diffusers/pipelines/amused/pipeline_amused.py b/src/diffusers/pipelines/amused/pipeline_amused.py index 994455ff29db..a8c24b0aeecc 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused.py +++ b/src/diffusers/pipelines/amused/pipeline_amused.py @@ -88,7 +88,7 @@ def __call__( negative_encoder_hidden_states: Optional[torch.Tensor] = None, output_type="pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, micro_conditioning_aesthetic_score: int = 6, @@ -122,16 +122,16 @@ def __call__( latents (`torch.IntTensor`, *optional*): Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image gneration. If not provided, the starting latents will be completely masked. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. A single vector from the pooled and projected final hidden states. 
- encoder_hidden_states (`torch.FloatTensor`, *optional*): + encoder_hidden_states (`torch.Tensor`, *optional*): Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + negative_encoder_hidden_states (`torch.Tensor`, *optional*): Analogous to `encoder_hidden_states` for the positive prompt. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. @@ -140,7 +140,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/amused/pipeline_amused_img2img.py b/src/diffusers/pipelines/amused/pipeline_amused_img2img.py index 1218e7a44c4d..c74275b414d4 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused_img2img.py +++ b/src/diffusers/pipelines/amused/pipeline_amused_img2img.py @@ -102,7 +102,7 @@ def __call__( negative_encoder_hidden_states: Optional[torch.Tensor] = None, output_type="pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, micro_conditioning_aesthetic_score: int = 6, @@ -115,7 +115,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a @@ -141,16 +141,16 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. A single vector from the pooled and projected final hidden states. 
- encoder_hidden_states (`torch.FloatTensor`, *optional*): + encoder_hidden_states (`torch.Tensor`, *optional*): Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + negative_encoder_hidden_states (`torch.Tensor`, *optional*): Analogous to `encoder_hidden_states` for the positive prompt. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. @@ -159,7 +159,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py b/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py index ab0a55cdd388..24801e0ef977 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py +++ b/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py @@ -119,7 +119,7 @@ def __call__( negative_encoder_hidden_states: Optional[torch.Tensor] = None, output_type="pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, micro_conditioning_aesthetic_score: int = 6, @@ -132,13 +132,13 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. 
If it's a numpy array or pytorch tensor, it should contain one @@ -165,16 +165,16 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. A single vector from the pooled and projected final hidden states. - encoder_hidden_states (`torch.FloatTensor`, *optional*): + encoder_hidden_states (`torch.Tensor`, *optional*): Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + negative_encoder_hidden_states (`torch.Tensor`, *optional*): Analogous to `encoder_hidden_states` for the positive prompt. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. @@ -183,7 +183,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
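Note on the `VQModel` hunk above: `decode` now surfaces the quantizer's commitment loss alongside the decoded sample, so `DecoderOutput` carries a `commit_loss` field, the tuple form changes from `(dec,)` to `(dec, commit_loss)`, and `forward` propagates both. A minimal sketch of how a caller sees the new return value once this change is in place; the default `VQModel()` configuration and the random input are illustrative only:

```python
import torch
from diffusers import VQModel

# Default, untrained model purely for illustration; the point is the changed return value.
vq = VQModel()
x = torch.randn(1, 3, 32, 32)

with torch.no_grad():
    latents = vq.encode(x).latents
    # Before this change, `decode(..., return_dict=False)` returned `(sample,)`.
    # After it, the tuple is `(sample, commit_loss)`, mirroring `DecoderOutput.commit_loss`.
    sample, commit_loss = vq.decode(latents, return_dict=False)

print(sample.shape, float(commit_loss))
```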
diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 3765db938cd5..175671ece5d7 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -15,11 +15,10 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Union -import numpy as np import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection -from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from ...models.lora import adjust_lora_scale_text_encoder @@ -41,6 +40,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor +from ...video_processor import VideoProcessor from ..free_init_utils import FreeInitMixin from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import AnimateDiffPipelineOutput @@ -65,27 +65,6 @@ """ -def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - if output_type == "np": - outputs = np.stack(outputs) - - elif output_type == "pt": - outputs = torch.stack(outputs) - - elif not output_type == "pil": - raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") - - return outputs - - class AnimateDiffPipeline( DiffusionPipeline, StableDiffusionMixin, @@ -159,7 +138,7 @@ def __init__( image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt def encode_prompt( @@ -169,8 +148,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -190,10 +169,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -584,11 +563,11 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -625,27 +604,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. @@ -836,7 +814,7 @@ def __call__( video = latents else: video_tensor = self.decode_latents(latents) - video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type) # 10. 
Offload all models self.maybe_free_model_hooks() diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py index f15cd5dbebf7..50ed54001e16 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py @@ -15,7 +15,6 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Tuple, Union -import numpy as np import torch from transformers import ( CLIPImageProcessor, @@ -25,7 +24,7 @@ CLIPVisionModelWithProjection, ) -from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput from ...loaders import ( FromSingleFileMixin, IPAdapterMixin, @@ -57,6 +56,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor +from ...video_processor import VideoProcessor from ..free_init_utils import FreeInitMixin from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import AnimateDiffPipelineOutput @@ -113,28 +113,6 @@ """ -# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid -def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - if output_type == "np": - outputs = np.stack(outputs) - - elif output_type == "pt": - outputs = torch.stack(outputs) - - elif not output_type == "pil": - raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") - - return outputs - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): """ @@ -156,6 +134,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -171,14 +150,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -189,6 +172,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -305,7 +298,7 @@ def __init__( ) self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor) self.default_sample_size = self.unet.config.sample_size @@ -319,10 +312,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -348,17 +341,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -791,7 +784,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -804,7 +797,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -865,6 +858,7 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, timesteps: List[int] = None, + sigmas: List[float] = None, denoising_end: Optional[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -872,13 +866,13 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -923,6 +917,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. denoising_end (`float`, *optional*): When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be completed before it is intentionally prematurely terminated. As a result, the returned sample will @@ -951,27 +949,27 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -1104,7 +1102,9 @@ def __call__( ) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) # 5. 
Prepare latent variables num_channels_latents = self.unet.config.in_channels @@ -1269,7 +1269,7 @@ def __call__( video = latents else: video_tensor = self.decode_latents(latents) - video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type) # cast back to fp16 if needed if needs_upcasting: diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 106fabba721b..2adc5cdf82e5 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -15,11 +15,10 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Union -import numpy as np import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection -from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from ...models.lora import adjust_lora_scale_text_encoder @@ -34,6 +33,7 @@ ) from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor +from ...video_processor import VideoProcessor from ..free_init_utils import FreeInitMixin from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from .pipeline_output import AnimateDiffPipelineOutput @@ -95,28 +95,6 @@ """ -# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid -def tensor2vid(video: torch.Tensor, processor, output_type="np"): - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - if output_type == "np": - outputs = np.stack(outputs) - - elif output_type == "pt": - outputs = torch.stack(outputs) - - elif not output_type == "pil": - raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") - - return outputs - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents def retrieve_latents( encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" @@ -137,6 +115,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -152,14 +131,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. 
+ sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -170,6 +153,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -249,7 +242,7 @@ def __init__( image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt def encode_prompt( @@ -259,8 +252,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -280,10 +273,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
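As in the SDXL variant earlier, `retrieve_timesteps` here gains a `sigmas` path and the pipeline's `__call__` exposes it, with `num_inference_steps`, `timesteps`, and `sigmas` being mutually exclusive ways to set the schedule. A rough sketch of building a custom sigma list a caller might pass; the Karras-style spacing and its constants are assumptions for illustration, not values taken from this diff:

```python
import numpy as np

# Illustrative only: one common way to construct a descending sigma schedule.
# sigma_min / sigma_max / rho below are assumed values for the sketch.
def karras_sigmas(num_steps: int, sigma_min: float = 0.03, sigma_max: float = 14.6, rho: float = 7.0):
    ramp = np.linspace(0, 1, num_steps)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return ((max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho).tolist()


sigmas = karras_sigmas(25)
# A caller would then pass exactly one of `num_inference_steps`, `timesteps`, or
# `sigmas` (e.g. `pipe(..., sigmas=sigmas)`); `retrieve_timesteps` forwards `sigmas`
# only if the scheduler's `set_timesteps` accepts that argument.
print(len(sigmas), round(sigmas[0], 3), round(sigmas[-1], 3))
```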
@@ -635,16 +628,7 @@ def prepare_latents( generator, latents=None, ): - # video must be a list of list of images - # the outer list denotes having multiple videos as input, whereas inner list means the frames of the video - # as a list of images - if video and not isinstance(video[0], list): - video = [video] if latents is None: - video = torch.cat( - [self.image_processor.preprocess(vid, height=height, width=width).unsqueeze(0) for vid in video], dim=0 - ) - video = video.to(device=device, dtype=dtype) num_frames = video.shape[1] else: num_frames = latents.shape[2] @@ -750,17 +734,18 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.5, strength: float = 0.8, negative_prompt: Optional[Union[str, List[str]]] = None, num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -783,6 +768,14 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality videos at the expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. strength (`float`, *optional*, defaults to 0.8): Higher strength leads to more differences between original video and generated video. guidance_scale (`float`, *optional*, defaults to 7.5): @@ -797,27 +790,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple. cross_attention_kwargs (`dict`, *optional*): @@ -912,11 +904,18 @@ def __call__( ) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt) # 5. Prepare latent variables + if latents is None: + video = self.video_processor.preprocess_video(video, height=height, width=width) + # Move the number of frames before the number of channels. + video = video.permute(0, 2, 1, 3, 4) + video = video.to(device=device, dtype=prompt_embeds.dtype) num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( video=video, @@ -997,7 +996,7 @@ def __call__( video = latents else: video_tensor = self.decode_latents(latents) - video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type) # 10. Offload all models self.maybe_free_model_hooks() diff --git a/src/diffusers/pipelines/animatediff/pipeline_output.py b/src/diffusers/pipelines/animatediff/pipeline_output.py index 97e7c87ad7f7..2417223cf95e 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_output.py +++ b/src/diffusers/pipelines/animatediff/pipeline_output.py @@ -13,7 +13,7 @@ class AnimateDiffPipelineOutput(BaseOutput): r""" Output class for AnimateDiff pipelines. 
- Args: + Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 78b730ea916c..105ca40f773f 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -103,8 +103,8 @@ def _encode_prompt( num_waveforms_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -122,10 +122,10 @@ def _encode_prompt( The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -360,11 +360,11 @@ def __call__( num_waveforms_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, output_type: Optional[str] = "np", @@ -394,21 +394,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py index 948caf97d27b..71310834351e 100644 --- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py @@ -64,7 +64,7 @@ class AudioLDM2ProjectionModelOutput(BaseOutput): """ Args: Class for AudioLDM2 projection layer's outputs. - hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states obtained by linearly projecting the hidden-states for each of the text encoders and subsequently concatenating them together. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -75,7 +75,7 @@ class AudioLDM2ProjectionModelOutput(BaseOutput): - 0 for tokens that are **masked**. """ - hidden_states: torch.FloatTensor + hidden_states: torch.Tensor attention_mask: Optional[torch.LongTensor] = None @@ -125,8 +125,8 @@ def __init__( def forward( self, - hidden_states: Optional[torch.FloatTensor] = None, - hidden_states_1: Optional[torch.FloatTensor] = None, + hidden_states: Optional[torch.Tensor] = None, + hidden_states_1: Optional[torch.Tensor] = None, attention_mask: Optional[torch.LongTensor] = None, attention_mask_1: Optional[torch.LongTensor] = None, ): @@ -680,7 +680,7 @@ def _set_gradient_checkpointing(self, module, value=False): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -696,10 +696,10 @@ def forward( The [`AudioLDM2UNet2DConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. encoder_attention_mask (`torch.Tensor`): A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If @@ -710,7 +710,7 @@ def forward( tuple. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. 
- encoder_hidden_states_1 (`torch.FloatTensor`, *optional*): + encoder_hidden_states_1 (`torch.Tensor`, *optional*): A second set of encoder hidden states with shape `(batch, sequence_length_2, feature_dim_2)`. Can be used to condition the model on a different set of embeddings to `encoder_hidden_states`. encoder_attention_mask_1 (`torch.Tensor`, *optional*): @@ -1091,14 +1091,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states_1: Optional[torch.FloatTensor] = None, - encoder_attention_mask_1: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states_1: Optional[torch.Tensor] = None, + encoder_attention_mask_1: Optional[torch.Tensor] = None, ): output_states = () num_layers = len(self.resnets) @@ -1270,15 +1270,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states_1: Optional[torch.FloatTensor] = None, - encoder_attention_mask_1: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states_1: Optional[torch.Tensor] = None, + encoder_attention_mask_1: Optional[torch.Tensor] = None, + ) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) num_attention_per_layer = len(self.attentions) // (len(self.resnets) - 1) @@ -1437,16 +1437,16 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states_1: Optional[torch.FloatTensor] = None, - encoder_attention_mask_1: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states_1: Optional[torch.Tensor] = None, + encoder_attention_mask_1: Optional[torch.Tensor] = None, ): num_layers = len(self.resnets) num_attention_per_layer = len(self.attentions) // num_layers diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py index a498831877c9..49440830d0c0 100644 --- 
a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py @@ -273,7 +273,7 @@ def generate_language_model( Generates a sequence of hidden-states from the language model, conditioned on the embedding inputs. Parameters: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): The sequence used as a prompt for the generation. max_new_tokens (`int`): Number of new tokens to generate. @@ -282,7 +282,7 @@ def generate_language_model( function of the model. Return: - `inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): The sequence of generated hidden-states. """ max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens @@ -311,10 +311,10 @@ def encode_prompt( do_classifier_free_guidance, transcription=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - generated_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + generated_prompt_embeds: Optional[torch.Tensor] = None, + negative_generated_prompt_embeds: Optional[torch.Tensor] = None, attention_mask: Optional[torch.LongTensor] = None, negative_attention_mask: Optional[torch.LongTensor] = None, max_new_tokens: Optional[int] = None, @@ -337,18 +337,18 @@ def encode_prompt( The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-computed text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be computed from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-computed negative text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from `negative_prompt` input argument. - generated_prompt_embeds (`torch.FloatTensor`, *optional*): + generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from `negative_prompt` input argument. @@ -361,11 +361,11 @@ def encode_prompt( max_new_tokens (`int`, *optional*, defaults to None): The number of new tokens to generate with the GPT2 language model. 
Returns: - prompt_embeds (`torch.FloatTensor`): + prompt_embeds (`torch.Tensor`): Text embeddings from the Flan T5 model. attention_mask (`torch.LongTensor`): Attention mask to be applied to the `prompt_embeds`. - generated_prompt_embeds (`torch.FloatTensor`): + generated_prompt_embeds (`torch.Tensor`): Text embeddings generated from the GPT2 langauge model. Example: @@ -821,16 +821,16 @@ def __call__( num_waveforms_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - generated_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + generated_prompt_embeds: Optional[torch.Tensor] = None, + negative_generated_prompt_embeds: Optional[torch.Tensor] = None, attention_mask: Optional[torch.LongTensor] = None, negative_attention_mask: Optional[torch.LongTensor] = None, max_new_tokens: Optional[int] = None, return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, output_type: Optional[str] = "np", @@ -865,21 +865,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for spectrogram generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - generated_prompt_embeds (`torch.FloatTensor`, *optional*): + generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from `negative_prompt` input argument. @@ -897,7 +897,7 @@ def __call__( plain tuple. 
callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py b/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py index d71a1481034a..d92a07669059 100644 --- a/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py +++ b/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py @@ -298,7 +298,7 @@ def preprocess( return encoded_outputs # Follows diffusers.VaeImageProcessor.postprocess - def postprocess(self, sample: torch.FloatTensor, output_type: str = "pil"): + def postprocess(self, sample: torch.Tensor, output_type: str = "pil"): if output_type not in ["pt", "np", "pil"]: raise ValueError( f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']" diff --git a/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py b/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py index c8869ad9db23..1be4761a9987 100644 --- a/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py +++ b/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py @@ -117,7 +117,7 @@ def __init__(self, config: Blip2VisionConfig): self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: batch_size = pixel_values.shape[0] target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] @@ -376,7 +376,7 @@ def __init__(self, config: Blip2VisionConfig): @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig) def forward( self, - pixel_values: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -524,15 +524,15 @@ def forward( return_dict=None, ): r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. 
- past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key diff --git a/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py b/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py index c6772fc88807..d29dddf64b01 100644 --- a/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +++ b/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py @@ -186,7 +186,7 @@ def forward( ctx_begin_pos: list, input_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: if ctx_embeddings is None: ctx_len = 0 diff --git a/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py b/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py index 6206f3e7a136..c1b0141f8536 100644 --- a/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +++ b/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py @@ -192,7 +192,7 @@ def __call__( reference_image: PIL.Image.Image, source_subject_category: List[str], target_subject_category: List[str], - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 7.5, height: int = 512, width: int = 512, @@ -218,7 +218,7 @@ def __call__( The source subject category. target_subject_category (`List[str]`): The target subject category. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by random sampling. diff --git a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py index befac79c63f3..b0c11362ffe1 100644 --- a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py +++ b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py @@ -105,7 +105,7 @@ def prepare_latents(self, batch_size, num_channels, height, width, dtype, device return latents # Follows diffusers.VaeImageProcessor.postprocess - def postprocess_image(self, sample: torch.FloatTensor, output_type: str = "pil"): + def postprocess_image(self, sample: torch.Tensor, output_type: str = "pil"): if output_type not in ["pt", "np", "pil"]: raise ValueError( f"output_type={output_type} is not supported. 
Make sure to choose one of ['pt', 'np', or 'pil']" @@ -173,10 +173,10 @@ def __call__( num_inference_steps: int = 1, timesteps: List[int] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -195,7 +195,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -205,7 +205,7 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/controlnet/multicontrolnet.py b/src/diffusers/pipelines/controlnet/multicontrolnet.py index 7d284f2d26d3..e3c5ec6eed03 100644 --- a/src/diffusers/pipelines/controlnet/multicontrolnet.py +++ b/src/diffusers/pipelines/controlnet/multicontrolnet.py @@ -31,7 +31,7 @@ def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetMod def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, controlnet_cond: List[torch.tensor], @@ -100,20 +100,16 @@ def save_pretrained( variant (`str`, *optional*): If specified, weights are saved in the format pytorch_model..bin. 
""" - idx = 0 - model_path_to_save = save_directory - for controlnet in self.nets: + for idx, controlnet in enumerate(self.nets): + suffix = "" if idx == 0 else f"_{idx}" controlnet.save_pretrained( - model_path_to_save, + save_directory + suffix, is_main_process=is_main_process, save_function=save_function, safe_serialization=safe_serialization, variant=variant, ) - idx += 1 - model_path_to_save = model_path_to_save + f"_{idx}" - @classmethod def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs): r""" diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index f099d54e57cd..cf979c352cfb 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -22,6 +22,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel @@ -97,6 +98,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -112,14 +114,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -130,6 +136,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -246,8 +262,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -279,8 +295,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -300,10 +316,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -831,7 +847,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -844,7 +860,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -892,16 +908,17 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -910,7 +927,9 @@ def __call__( control_guidance_start: Union[float, List[float]] = 0.0, control_guidance_end: Union[float, List[float]] = 1.0, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -920,14 +939,14 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single ControlNet, each will be paired with each prompt in the `prompt` list. This also applies to multiple ControlNets, where a list of image lists can be passed to batch for each prompt and each ControlNet. 
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -941,6 +960,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. @@ -955,18 +978,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -978,7 +1001,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -999,11 +1022,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. 
The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during inference with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -1035,6 +1058,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet # align format for control guidance @@ -1162,7 +1188,9 @@ def __call__( assert False # 5. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) self._num_timesteps = len(timesteps) # 6. Prepare latent variables diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py index 0a55e0c09d27..5e4069ae604f 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py @@ -241,7 +241,7 @@ def __call__( condtioning_image: PIL.Image.Image, source_subject_category: List[str], target_subject_category: List[str], - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 7.5, height: int = 512, width: int = 512, @@ -269,7 +269,7 @@ def __call__( The source subject category. target_subject_category (`List[str]`): The target subject category. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by random sampling.
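The `retrieve_timesteps` changes above make `timesteps` and `sigmas` mutually exclusive and let a custom sigma schedule determine the step count. A minimal sketch of that behaviour, assuming a scheduler whose `set_timesteps` accepts a `sigmas` argument (`EulerDiscreteScheduler` is used purely for illustration, and the sigma values are placeholders):

```python
# Sketch only: exercises the timesteps/sigmas handling added to retrieve_timesteps.
from diffusers import EulerDiscreteScheduler
from diffusers.pipelines.controlnet.pipeline_controlnet import retrieve_timesteps

scheduler = EulerDiscreteScheduler()  # default 1000 training timesteps

# Default path: the scheduler builds its own schedule from `num_inference_steps`.
timesteps, num_steps = retrieve_timesteps(scheduler, num_inference_steps=10, device="cpu")

# Custom path: a hand-picked, descending sigma schedule (placeholder values);
# the returned step count is derived from the schedule itself.
custom_sigmas = [14.6, 7.0, 3.5, 1.7, 0.8, 0.4]
timesteps, num_steps = retrieve_timesteps(scheduler, device="cpu", sigmas=custom_sigmas)

# Passing both custom schedules is rejected by the new guard clause.
try:
    retrieve_timesteps(scheduler, device="cpu", timesteps=[999, 500, 1], sigmas=custom_sigmas)
except ValueError as err:
    print(err)
```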
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 022f30d819d8..e6f1a06bddc2 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -21,6 +21,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel @@ -239,8 +240,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -272,8 +273,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -293,10 +294,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -904,11 +905,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -917,7 +918,9 @@ def __call__( control_guidance_start: Union[float, List[float]] = 0.0, control_guidance_end: Union[float, List[float]] = 1.0, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -927,18 +930,18 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image to be used as the starting point for the image generation process. Can also accept image latents as `image`, and if passing latents directly they are not encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. 
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -966,18 +969,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -1004,11 +1007,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the @@ -1040,6 +1043,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet # align format for control guidance diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 47dbb26eb3e4..d29e3ac8f90e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -23,6 +23,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel @@ -364,8 +365,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -397,8 +398,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -418,10 +419,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
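Continuing the img2img sketch above, the plain-`Callable` form of `callback_on_step_end` follows the protocol described in these docstrings: it receives the pipeline, the step index, the timestep, and a dict containing the requested tensors, and it hands that dict back to the denoising loop.

```python
# Sketch only: a step-end callback that inspects the tensors it requested.
def log_latent_stats(pipeline, step, timestep, callback_kwargs):
    latents = callback_kwargs["latents"]  # present because "latents" is requested below
    print(f"step {step:3d}  t={int(timestep):4d}  latent std={latents.std().item():.3f}")
    return callback_kwargs  # the (possibly modified) dict is fed back into the loop


result = pipe(
    "a watercolor landscape",
    image=init_image,
    control_image=canny_image,
    callback_on_step_end=log_latent_stats,
    callback_on_step_end_tensor_inputs=["latents"],
)
```

When a `PipelineCallback` or `MultiPipelineCallbacks` object is passed instead, the new `isinstance` branch takes the tensor list from the object's own `tensor_inputs`, so the explicit `callback_on_step_end_tensor_inputs` argument is ignored.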
@@ -1121,11 +1122,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1134,7 +1135,9 @@ def __call__( control_guidance_start: Union[float, List[float]] = 0.0, control_guidance_end: Union[float, List[float]] = 1.0, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -1144,14 +1147,14 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, NumPy array or tensor representing an image batch to be used as the starting point. For both NumPy array and PyTorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a NumPy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, NumPy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a @@ -1159,14 +1162,14 @@ def __call__( color channel (L) instead of 3, so the expected shape for PyTorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for NumPy array, it would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, - `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, + `List[List[torch.Tensor]]`, or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. 
If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -1201,18 +1204,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -1239,11 +1242,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. 
with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -1275,6 +1278,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet # align format for control guidance diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index b9c4e3c0032c..966196b4469c 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -27,6 +27,7 @@ CLIPVisionModelWithProjection, ) +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import ( FromSingleFileMixin, @@ -197,8 +198,26 @@ class StableDiffusionXLControlNetInpaintPipeline( """ model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] - _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "add_neg_time_ids", + "mask", + "masked_image_latents", + ] def __init__( self, @@ -208,7 +227,7 @@ def __init__( tokenizer: CLIPTokenizer, tokenizer_2: CLIPTokenizer, unet: UNet2DConditionModel, - controlnet: ControlNetModel, + controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], scheduler: KarrasDiffusionSchedulers, requires_aesthetics_score: bool = False, force_zeros_for_empty_prompt: bool = True, @@ -261,10 +280,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -290,17 +309,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1157,13 +1176,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1178,7 +1197,9 @@ def __call__( aesthetic_score: float = 6.0, negative_aesthetic_score: float = 2.5, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -1250,23 +1271,23 @@ def __call__( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1278,7 +1299,7 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -1317,11 +1338,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the @@ -1351,6 +1372,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet # align format for control guidance @@ -1730,7 +1754,7 @@ def denoising_value_valid(dnv): down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) - if ip_adapter_image is not None: + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: added_cond_kwargs["image_embeds"] = image_embeds if num_channels_unet == 9: diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 2307b856ad63..f6992ac08c31 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -30,6 +30,7 @@ from diffusers.utils.import_utils import is_invisible_watermark_available +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import ( FromSingleFileMixin, @@ -114,6 +115,66 @@ """ +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. 
Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + class StableDiffusionXLControlNetPipeline( DiffusionPipeline, StableDiffusionMixin, @@ -175,7 +236,15 @@ class StableDiffusionXLControlNetPipeline( "feature_extractor", "image_encoder", ] - _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "negative_add_time_ids", + ] def __init__( self, @@ -233,10 +302,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -262,17 +331,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -876,7 +945,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -889,7 +958,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -941,6 +1010,8 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, denoising_end: Optional[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -948,13 +1019,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -969,7 +1040,9 @@ def __call__( negative_crops_coords_top_left: Tuple[int, int] = (0, 0), negative_target_size: Optional[Tuple[int, int]] = None, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -982,14 +1055,14 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. 
If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. Anything below 512 pixels won't work well for [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) @@ -1001,6 +1074,14 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. denoising_end (`float`, *optional*): When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be completed before it is intentionally prematurely terminated. As a result, the returned sample will @@ -1025,25 +1106,25 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. 
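The `timesteps` and `sigmas` arguments documented above are resolved through the `retrieve_timesteps` helper that this patch copies into the SDXL ControlNet pipeline: whichever of `timesteps` or `sigmas` is supplied is forwarded to `scheduler.set_timesteps`, passing both raises a `ValueError`, and otherwise `num_inference_steps` is used. A minimal usage sketch, assuming a scheduler whose `set_timesteps` accepts `sigmas` (e.g. `EulerDiscreteScheduler` in recent releases); the model ids, control image path, and sigma values below are illustrative, not taken from the patch.

```python
import numpy as np
import torch
from diffusers import ControlNetModel, EulerDiscreteScheduler, StableDiffusionXLControlNetPipeline
from PIL import Image

controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)

canny_map = Image.open("canny_edges.png").convert("RGB")  # precomputed Canny edge map (placeholder path)

# Illustrative descending sigma schedule ending at 0.0; the number of denoising steps is
# then derived from the schedule length by `retrieve_timesteps`, so `num_inference_steps`
# is not passed here.
custom_sigmas = np.linspace(14.6, 0.7, 10).tolist() + [0.0]

image = pipe(
    prompt="a bird perched on a branch, photorealistic",
    image=canny_map,
    sigmas=custom_sigmas,
).images[0]
```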
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -1099,11 +1180,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -1133,6 +1214,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet # align format for control guidance @@ -1265,8 +1349,9 @@ def __call__( assert False # 5. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) self._num_timesteps = len(timesteps) # 6. 
Prepare latent variables @@ -1451,6 +1536,12 @@ def __call__( latents = callback_outputs.pop("latents", latents) prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids) # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index dfd3cc239b36..0059fb1a729f 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -30,6 +30,7 @@ from diffusers.utils.import_utils import is_invisible_watermark_available +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import ( FromSingleFileMixin, @@ -227,7 +228,15 @@ class StableDiffusionXLControlNetImg2ImgPipeline( "feature_extractor", "image_encoder", ] - _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "add_neg_time_ids", + ] def __init__( self, @@ -287,10 +296,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -316,17 +325,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1082,13 +1091,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1105,7 +1114,9 @@ def __call__( aesthetic_score: float = 6.0, negative_aesthetic_score: float = 2.5, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -1119,18 +1130,18 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image will be used as the starting point for the image generation process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. 
The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also + be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in + init, images must be passed as a list such that each element of the list can be correctly batched for + input to a single controlnet. height (`int`, *optional*, defaults to the size of control_image): The height in pixels of the generated image. Anything below 512 pixels won't work well for [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) @@ -1169,26 +1180,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. 
If not @@ -1254,11 +1265,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -1288,6 +1299,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet # align format for control guidance @@ -1578,6 +1592,12 @@ def __call__( latents = callback_outputs.pop("latents", latents) prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids) # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index 622bac8c5f97..3675a99ba67f 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -21,6 +21,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel @@ -188,8 +189,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: 
Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -221,8 +222,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -242,10 +243,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -638,9 +639,9 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -648,7 +649,9 @@ def __call__( control_guidance_start: float = 0.0, control_guidance_end: float = 1.0, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], ): r""" @@ -657,14 +660,14 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. 
If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -686,14 +689,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -715,11 +718,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -734,6 +737,9 @@ def __call__( "not-safe-for-work" (nsfw) content. 
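For the plain-callable form of `callback_on_step_end` described in this docstring, the call signature stays `(pipeline, step, timestep, callback_kwargs)`: only the tensors named in `callback_on_step_end_tensor_inputs` are handed to the callback, and the returned dict is written back into the denoising loop. A minimal sketch (pipeline setup omitted; the prompt and variable names are illustrative).

```python
def log_latent_norm(pipeline, step, timestep, callback_kwargs):
    # Only tensors requested via `callback_on_step_end_tensor_inputs` show up here.
    latents = callback_kwargs["latents"]
    print(f"step {step:3d}  t={int(timestep):4d}  |latents| = {latents.norm().item():.2f}")
    # Returning the dict (optionally with modified tensors) feeds values back to the loop.
    return callback_kwargs

# Illustrative call; `pipe` is any of the pipelines touched in this patch and
# `control_image` is a conditioning image prepared beforehand.
# image = pipe(
#     prompt="a lighthouse at dusk",
#     image=control_image,
#     callback_on_step_end=log_latent_norm,
#     callback_on_step_end_tensor_inputs=["latents"],
# ).images[0]
```

When a `PipelineCallback` or `MultiPipelineCallbacks` instance is passed instead of a plain function, the `isinstance` check added in these hunks takes the tensor list from the callback's own `tensor_inputs` attribute, so `callback_on_step_end_tensor_inputs` does not need to be repeated at call time.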
""" + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + unet = self.unet._orig_mod if is_compiled_module(self.unet) else self.unet # 1. Check inputs. Raise error if not correct diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 3ab535a054bd..24640a6067c0 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -28,6 +28,7 @@ from diffusers.utils.import_utils import is_invisible_watermark_available +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel @@ -157,7 +158,15 @@ class StableDiffusionXLControlNetXSPipeline( "text_encoder_2", "feature_extractor", ] - _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "negative_add_time_ids", + ] def __init__( self, @@ -213,10 +222,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -242,17 +251,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -721,11 +730,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -739,7 +748,9 @@ def __call__( negative_crops_coords_top_left: Tuple[int, int] = (0, 0), negative_target_size: Optional[Tuple[int, int]] = None, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], ): r""" @@ -751,14 +762,14 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. 
Anything below 512 pixels won't work well for [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) @@ -787,20 +798,20 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. @@ -851,11 +862,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -869,6 +880,9 @@ def __call__( returned, otherwise a `tuple` is returned containing the output images. 
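The widened `_callback_tensor_inputs` list in the SDXL ControlNet and ControlNet-XS pipelines means a step-end callback can now also request the SDXL conditioning tensors; the denoising loop pops any returned `add_text_embeds`, `add_time_ids`, `negative_pooled_prompt_embeds`, and `negative_add_time_ids` / `add_neg_time_ids` back into its state. A hedged sketch of a callback that inspects those tensors; the commented call and prompt are illustrative.

```python
def inspect_sdxl_conditioning(pipeline, step, timestep, callback_kwargs):
    add_text_embeds = callback_kwargs["add_text_embeds"]  # pooled text embeddings
    add_time_ids = callback_kwargs["add_time_ids"]        # size/crop micro-conditioning ids
    if step == 0:
        print("pooled embeds:", tuple(add_text_embeds.shape), "time ids:", tuple(add_time_ids.shape))
    # Any tensor returned here replaces the pipeline's copy for the remaining steps.
    return callback_kwargs

# image = pipe(
#     prompt="an isometric render of a castle",
#     image=control_image,
#     callback_on_step_end=inspect_sdxl_conditioning,
#     callback_on_step_end_tensor_inputs=["latents", "add_text_embeds", "add_time_ids"],
# ).images[0]
```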
""" + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + unet = self.unet._orig_mod if is_compiled_module(self.unet) else self.unet # 1. Check inputs. Raise error if not correct diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py index 5bd396b20fad..1d438bcf877d 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -164,8 +164,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -184,10 +184,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -549,11 +549,11 @@ def __call__( width: Optional[int] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -593,10 +593,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -607,7 +607,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py index 50e2cda25a48..c5d9eed3ca9c 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -188,8 +188,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -208,10 +208,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -421,7 +421,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -665,11 +665,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -681,7 +681,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. 
- image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.7): @@ -714,10 +714,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -728,7 +728,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index 89eb97a087f8..cb7e9ef6f347 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -340,8 +340,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -360,10 +360,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
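These DeepFloyd IF pipelines still document the legacy `callback` / `callback_steps` interface (distinct from `callback_on_step_end`): a function receiving `(step, timestep, latents)` that is invoked every `callback_steps` steps and returns nothing. A minimal sketch for the img2img variant; the checkpoint id, input image path, and prompt are illustrative.

```python
import torch
from PIL import Image
from diffusers import IFImg2ImgPipeline

def preview(step: int, timestep: int, latents: torch.Tensor) -> None:
    # With this patch the latents argument is annotated as `torch.Tensor`;
    # useful for progress logging or saving intermediate previews.
    print(f"step {step}: latents {tuple(latents.shape)}, dtype={latents.dtype}")

pipe = IFImg2ImgPipeline.from_pretrained(
    "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

init_image = Image.open("photo.jpg").convert("RGB")  # placeholder input image

image = pipe(
    image=init_image,
    prompt="anime style",
    strength=0.7,       # default value per the docstring above
    callback=preview,
    callback_steps=5,   # invoke `preview` every 5 denoising steps
).images[0]
```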
@@ -576,7 +576,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -607,7 +607,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`original_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -735,7 +735,7 @@ def prepare_intermediate_images( @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], original_image: Union[ PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] ] = None, @@ -748,11 +748,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 250, @@ -762,10 +762,10 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - original_image (`torch.FloatTensor` or `PIL.Image.Image`): + original_image (`torch.Tensor` or `PIL.Image.Image`): The original image that `image` was varied from. strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -800,10 +800,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -814,7 +814,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. 
The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py index aabe1107fc9e..cb592aa5675e 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -192,8 +192,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -212,10 +212,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -428,7 +428,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -459,7 +459,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`mask_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" f" {type(check_image_type)}" ) @@ -760,11 +760,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -776,7 +776,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. mask_image (`PIL.Image.Image`): @@ -814,10 +814,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -828,7 +828,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index 1798e0dec7eb..aa70eb7b40fe 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -342,8 +342,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -362,10 +362,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -579,7 +579,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -610,7 +610,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`original_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -643,7 +643,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`mask_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" f" {type(check_image_type)}" ) @@ -823,7 +823,7 @@ def prepare_intermediate_images( @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], original_image: Union[ PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] ] = None, @@ -839,11 +839,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 0, @@ -853,10 +853,10 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - original_image (`torch.FloatTensor` or `PIL.Image.Image`): + original_image (`torch.Tensor` or `PIL.Image.Image`): The original image that `image` was varied from. mask_image (`PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be @@ -896,10 +896,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -910,7 +910,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index 36ed34cba98e..fd38a8724308 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -298,8 +298,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -318,10 +318,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -537,7 +537,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -608,7 +608,7 @@ def __call__( prompt: Union[str, List[str]] = None, height: int = None, width: int = None, - image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor] = None, + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor] = None, num_inference_steps: int = 50, timesteps: List[int] = None, guidance_scale: float = 4.0, @@ -616,11 +616,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 250, @@ -637,7 +637,7 @@ def __call__( The height in pixels of the generated image. width (`int`, *optional*, defaults to None): The width in pixels of the generated image. - image (`PIL.Image.Image`, `np.ndarray`, `torch.FloatTensor`): + image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`): The image to be upscaled. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the @@ -663,10 +663,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -677,7 +677,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py b/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py index f73ef15d7de7..f69f905b56c5 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py @@ -13,27 +13,27 @@ class TransformationModelOutput(ModelOutput): Base class for text model's outputs that also contains a pooling of the last hidden states. Args: - text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + text_embeds (`torch.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): The text embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + last_hidden_state (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one + for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - projection_state: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None + projection_state: Optional[torch.Tensor] = None + last_hidden_state: torch.Tensor = None + hidden_states: Optional[Tuple[torch.Tensor]] = None + attentions: Optional[Tuple[torch.Tensor]] = None class RobertaSeriesConfig(XLMRobertaConfig): diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py index 92de940332c4..11d81b13ea1b 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -79,6 +79,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -94,14 +95,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -112,6 +117,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -263,8 +278,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -295,8 +310,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -316,10 +331,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -622,7 +637,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -673,14 +688,15 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -722,14 +738,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. @@ -848,7 +864,9 @@ def __call__( image_embeds = torch.cat([negative_image_embeds, image_embeds]) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) # 5. Prepare latent variables num_channels_latents = self.unet.config.in_channels diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py index 48b3b96483d5..145579da0cb7 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -119,6 +119,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -134,14 +135,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -152,6 +157,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -303,8 +318,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -335,8 +350,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -356,10 +371,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -706,7 +721,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -753,13 +768,14 @@ def __call__( strength: float = 0.8, num_inference_steps: Optional[int] = 50, timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: Optional[float] = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -775,7 +791,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a @@ -808,10 +824,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. @@ -919,7 +935,9 @@ def __call__( image = self.image_processor.preprocess(image) # 5. 
set timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) diff --git a/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py b/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py index 4314b78ca5e8..6bd175000fe6 100644 --- a/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +++ b/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py @@ -114,9 +114,9 @@ def __call__( The call function to the pipeline for generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): The original image to inpaint on. - mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + mask_image (`torch.Tensor` or `PIL.Image.Image`): The mask_image where 0.0 define which part of the original image to inpaint. num_inference_steps (`int`, *optional*, defaults to 1000): The number of denoising steps. More denoising steps usually lead to a higher quality image at the diff --git a/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 475da0b6d188..b8ac8e1416bf 100644 --- a/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -134,7 +134,7 @@ def __call__( num_inference_steps: int = 100, return_dict: bool = True, output_type: str = "np", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ) -> Union[AudioPipelineOutput, Tuple]: if (callback_steps is None) or ( @@ -161,7 +161,7 @@ def __call__( The output format of the generated audio. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
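The new `sigmas` path added to `retrieve_timesteps` above can be driven from the pipeline call, since `sigmas` is now also a `__call__` argument. A rough sketch, assuming a scheduler whose `set_timesteps` accepts `sigmas` (for example `EulerDiscreteScheduler` in recent releases); the checkpoint and sigma values are illustrative:

import torch
from diffusers import AltDiffusionPipeline, EulerDiscreteScheduler

pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9", torch_dtype=torch.float16).to("cuda")
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)

# A hand-picked sigma schedule (illustrative values). Per the updated docstring,
# pass either `sigmas` or `timesteps`, not both; `retrieve_timesteps` raises otherwise.
sigmas = [14.6, 9.0, 5.5, 3.0, 1.5, 0.7, 0.3, 0.1, 0.0]
# image = pipe("a painting of a squirrel eating a burger", sigmas=sigmas).images[0]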
diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py index 0581effef2fe..edcc0fed6818 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py @@ -255,8 +255,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -288,8 +288,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -309,10 +309,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -638,10 +638,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -652,7 +652,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can also accept image latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): @@ -678,10 +678,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -691,7 +691,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py index c7dff9eeefca..cd29665fdb53 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py @@ -48,7 +48,7 @@ def preprocess_image(image, batch_size): def preprocess_mask(mask, batch_size, scale_factor=8): - if not isinstance(mask, torch.FloatTensor): + if not isinstance(mask, torch.Tensor): mask = mask.convert("L") w, h = mask.size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 @@ -225,8 +225,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -258,8 +258,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -279,10 +279,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -557,8 +557,8 @@ def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, device, def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -567,11 +567,11 @@ def __call__( add_predicted_noise: Optional[bool] = False, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -583,10 +583,10 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. - mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + mask_image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If mask is a tensor, the @@ -620,10 +620,10 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -635,7 +635,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
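The `isinstance(..., torch.FloatTensor)` to `isinstance(..., torch.Tensor)` changes in this file's mask and image preprocessing are behavioural, not just cosmetic: `torch.FloatTensor` only matches CPU float32 tensors, so fp16 or CUDA inputs previously fell through to the PIL branch. A small illustrative check:

import torch

mask = torch.rand(1, 1, 64, 64).half()

print(isinstance(mask, torch.FloatTensor))  # False: legacy type only matches CPU float32
print(isinstance(mask, torch.Tensor))       # True: the broadened check accepts any tensor

# With the old check, such a mask would have been routed to the PIL code path
# and hit `mask.convert("L")`, failing with an AttributeError.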
@@ -693,7 +693,7 @@ def __call__( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) # 4. Preprocess image and mask - if not isinstance(image, torch.FloatTensor): + if not isinstance(image, torch.Tensor): image = preprocess_image(image, batch_size) mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor) diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py index f44a1ca74ee4..901816391dcc 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py @@ -163,8 +163,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -196,8 +196,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -217,10 +217,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -620,12 +620,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -657,14 +657,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -674,7 +674,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py index 9421531d273e..74ecaa995b7d 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py @@ -154,8 +154,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -187,8 +187,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -208,10 +208,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -492,12 +492,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, debug: bool = False, @@ -537,14 +537,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -554,7 +554,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py index 5f744578810b..d1f2c5de9728 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py @@ -60,14 +60,14 @@ class Pix2PixInversionPipelineOutput(BaseOutput, TextualInversionLoaderMixin): Output class for Stable Diffusion pipelines. Args: - latents (`torch.FloatTensor`) + latents (`torch.Tensor`) inverted latents tensor images (`List[PIL.Image.Image]` or `np.ndarray`) List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, num_channels)`. 
PIL images or numpy array present the denoised images of the diffusion pipeline. """ - latents: torch.FloatTensor + latents: torch.Tensor images: Union[List[PIL.Image.Image], np.ndarray] @@ -377,8 +377,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -410,8 +410,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -431,10 +431,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -707,7 +707,7 @@ def construct_direction(self, embs_source: torch.Tensor, embs_target: torch.Tens return (embs_target.mean(0) - embs_source.mean(0)).unsqueeze(0) @torch.no_grad() - def get_embeds(self, prompt: List[str], batch_size: int = 16) -> torch.FloatTensor: + def get_embeds(self, prompt: List[str], batch_size: int = 16) -> torch.Tensor: num_prompts = len(prompt) embeds = [] for i in range(0, num_prompts, batch_size): @@ -827,13 +827,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, cross_attention_guidance_amount: float = 0.1, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -876,14 +876,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -897,7 +897,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -1112,12 +1112,12 @@ def invert( num_inference_steps: int = 50, guidance_scale: float = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, cross_attention_guidance_amount: float = 0.1, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, lambda_auto_corr: float = 20.0, @@ -1132,7 +1132,7 @@ def invert( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch which will be used for conditioning. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. num_inference_steps (`int`, *optional*, defaults to 50): @@ -1147,11 +1147,11 @@ def invert( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, text embeddings will be generated from `prompt` input argument. cross_attention_guidance_amount (`float`, defaults to 0.1): @@ -1164,7 +1164,7 @@ def invert( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py index c84caa1fad88..e1ba3879f568 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py @@ -817,7 +817,7 @@ def __init__( positive_len = 768 if isinstance(cross_attention_dim, int): positive_len = cross_attention_dim - elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list): + elif isinstance(cross_attention_dim, (list, tuple)): positive_len = cross_attention_dim[0] feature_type = "text-only" if attention_type == "gated" else "text-image" @@ -1048,7 +1048,7 @@ def unload_lora(self): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -1066,10 +1066,10 @@ def forward( The [`UNetFlatConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. 
@@ -1590,8 +1590,8 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () for resnet in self.resnets: @@ -1719,14 +1719,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - additional_residuals: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + encoder_attention_mask: Optional[torch.Tensor] = None, + additional_residuals: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () blocks = list(zip(self.resnets, self.attentions)) @@ -1837,13 +1837,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1994,15 +1994,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2104,8 +2104,8 @@ class UNetMidBlockFlat(nn.Module): output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor. Returns: - `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size, - in_channels, height, width)`. + `torch.Tensor`: The output of the last residual block, which is a tensor of shape `(batch_size, in_channels, + height, width)`. 
""" @@ -2223,7 +2223,7 @@ def __init__( self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: @@ -2339,13 +2339,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2480,13 +2480,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. 
`scale` will be ignored.") diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py index 4455d20df213..c8dc18e2e8ac 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py @@ -81,7 +81,7 @@ def __init__( @torch.no_grad() def image_variation( self, - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -90,10 +90,10 @@ def image_variation( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -123,7 +123,7 @@ def image_variation( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -134,7 +134,7 @@ def image_variation( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -202,10 +202,10 @@ def text_to_image( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -235,7 +235,7 @@ def text_to_image( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -246,7 +246,7 @@ def text_to_image( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. 
The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -311,10 +311,10 @@ def dual_guided( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -344,7 +344,7 @@ def dual_guided( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -355,7 +355,7 @@ def dual_guided( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index b1117044cf18..2212651fbb5b 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -395,10 +395,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -429,7 +429,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. 
@@ -439,7 +439,7 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index 59aa370ec2f6..62d3e83a4790 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -197,7 +197,7 @@ def check_inputs(self, image, height, width, callback_steps): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -247,10 +247,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -281,7 +281,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -292,7 +292,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
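A note on the `torch.FloatTensor` -> `torch.Tensor` substitution that runs through these hunks: `torch.FloatTensor` denotes only float32 CPU tensors, so `isinstance` checks against it (such as the `preprocess_image` guard at the top of this section) silently fail for fp16 or CUDA inputs, and the old annotations promised a dtype the pipelines never actually required. The snippet below is a minimal illustration of that behaviour on a recent PyTorch release; it is not part of the diff and the variable names are made up.

import torch

image = torch.rand(1, 3, 64, 64, dtype=torch.float16)  # e.g. an fp16 image batch

# Legacy check: only matches float32 tensors on the CPU.
print(isinstance(image, torch.FloatTensor))  # False, so this tensor would be treated as a non-tensor input

# Replacement check: matches any dtype and any device.
print(isinstance(image, torch.Tensor))  # True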
diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index 0c76e5837b99..de4c2ac9b7f4 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -333,10 +333,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -367,7 +367,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -378,7 +378,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py b/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py index 0c55d04e67de..8dee000df05f 100644 --- a/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +++ b/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py @@ -169,10 +169,10 @@ def __call__( truncation_rate: float = 1.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ) -> Union[ImagePipelineOutput, Tuple]: """ @@ -196,7 +196,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor` of shape (batch), *optional*): + latents (`torch.Tensor` of shape (batch), *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Must be valid embedding indices.If not provided, a latents tensor will be generated of completely masked latent pixels. 
@@ -206,7 +206,7 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -301,7 +301,7 @@ def __call__( return ImagePipelineOutput(images=image) - def truncate(self, log_p_x_0: torch.FloatTensor, truncation_rate: float) -> torch.FloatTensor: + def truncate(self, log_p_x_0: torch.Tensor, truncation_rate: float) -> torch.Tensor: """ Truncates `log_p_x_0` such that for each column vector, the total cumulative probability is `truncation_rate` The lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to diff --git a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py index a6b9499f5542..f528b60e6ed7 100644 --- a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +++ b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py @@ -31,6 +31,7 @@ replace_example_docstring, ) from ...utils.torch_utils import randn_tensor +from ...video_processor import VideoProcessor from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin @@ -70,34 +71,12 @@ """ -# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid -def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - if output_type == "np": - outputs = np.stack(outputs) - - elif output_type == "pt": - outputs = torch.stack(outputs) - - elif not output_type == "pil": - raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") - - return outputs - - @dataclass class I2VGenXLPipelineOutput(BaseOutput): r""" Output class for image-to-video pipeline. - Args: + Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised @@ -156,7 +135,7 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) # `do_resize=False` as we do custom resizing. - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False) + self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False) @property def guidance_scale(self): @@ -175,8 +154,8 @@ def encode_prompt( device, num_videos_per_prompt, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clip_skip: Optional[int] = None, ): r""" @@ -195,10 +174,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -342,8 +321,8 @@ def _encode_image(self, image, device, num_videos_per_prompt): dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): - image = self.image_processor.pil_to_numpy(image) - image = self.image_processor.numpy_to_pt(image) + image = self.video_processor.pil_to_numpy(image) + image = self.video_processor.numpy_to_pt(image) # Normalize the image with CLIP training stats. image = self.feature_extractor( @@ -455,7 +434,7 @@ def check_inputs( and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -534,9 +513,9 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, decode_chunk_size: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -548,7 +527,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): Image or images to guide image generation. If you provide a tensor, it needs to be compatible with [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -580,14 +559,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -657,7 +636,7 @@ def __call__( # 3.2.2 Image latents. resized_image = _center_crop_wide(image, (width, height)) - image = self.image_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype) + image = self.video_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype) image_latents = self.prepare_image_latents( image, device=device, @@ -737,7 +716,7 @@ def __call__( video = latents else: video_tensor = self.decode_latents(latents, decode_chunk_size=decode_chunk_size) - video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type) # 9. Offload all models self.maybe_free_model_hooks() diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 34b5a47c2541..b2041e101564 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -233,8 +233,8 @@ def _encode_prompt( def __call__( self, prompt: Union[str, List[str]], - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, @@ -242,9 +242,9 @@ def __call__( guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -254,9 +254,9 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -279,7 +279,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -288,7 +288,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py index cbe66a63f4c8..fe9909770376 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py @@ -226,9 +226,9 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -268,7 +268,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -277,7 +277,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
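The `callback` parameter retyped throughout these hunks keeps the same calling convention; only the annotation of the third argument changes. A small, hypothetical usage sketch (the callback body and the commented pipeline call are illustrative only):

import torch

def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Invoked every `callback_steps` denoising steps with the current latents.
    print(f"step={step} timestep={timestep} dtype={latents.dtype} shape={tuple(latents.shape)}")

# Hypothetical call; any pipeline above that accepts `callback`/`callback_steps` is used the same way:
# images = pipeline(prompt="a photo of a cat", callback=log_latents, callback_steps=5).images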
@@ -436,7 +436,7 @@ def set_progress_bar_config(self, **kwargs): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -447,9 +447,9 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -459,7 +459,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -499,7 +499,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -508,7 +508,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
@@ -677,8 +677,8 @@ def set_progress_bar_config(self, **kwargs): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -688,9 +688,9 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -700,7 +700,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -739,7 +739,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -748,7 +748,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
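Returning to the i2vgen_xl hunks earlier in this diff: the pipeline-local `tensor2vid` helper is deleted and its work is delegated to `VideoProcessor.postprocess_video`. As a rough sketch of what that call encapsulates (per-sample permute from `(channels, frames, H, W)` to `(frames, channels, H, W)`, per-frame post-processing, then stacking), assuming a processor object exposing a `VaeImageProcessor`-style `postprocess` method:

import numpy as np
import torch

def postprocess_video_sketch(video: torch.Tensor, processor, output_type: str = "np"):
    # video: (batch, channels, num_frames, height, width), as produced by decode_latents
    outputs = []
    for batch_idx in range(video.shape[0]):
        batch_vid = video[batch_idx].permute(1, 0, 2, 3)  # -> (frames, channels, H, W)
        outputs.append(processor.postprocess(batch_vid, output_type))
    if output_type == "np":
        return np.stack(outputs)
    if output_type == "pt":
        return torch.stack(outputs)
    if output_type != "pil":
        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
    return outputs

In the updated pipeline this collapses to the single call `self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)`, with the processor built in `__init__` as `VideoProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False)`.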
diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 4d091e7d7a7c..ef5241fee5d2 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -266,10 +266,10 @@ def _encode_prompt( # add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32) alphas = 1.0 - betas alphas_cumprod = torch.cumprod(alphas, dim=0) @@ -295,9 +295,9 @@ def add_noise( def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - image_embeds: torch.FloatTensor, - negative_image_embeds: torch.FloatTensor, + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + image_embeds: torch.Tensor, + negative_image_embeds: torch.Tensor, negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, @@ -307,7 +307,7 @@ def __call__( num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -317,12 +317,12 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`): + image (`torch.Tensor`, `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -356,7 +356,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
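The `add_noise` override whose signature is retyped in the img2img hunk above deliberately keeps its own linear beta schedule (`torch.linspace(0.0001, 0.02, 1000)`) rather than the scheduler's, because this pipeline noises and samples with different schedules. For orientation, here is a minimal CPU-only sketch of the standard forward-diffusion step it implements, x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps; it is an illustration, not the exact library code:

import torch

def add_noise_sketch(original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
    # Linear beta schedule used only for adding noise, as in the hunk above.
    betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32)
    alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

    sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
    sqrt_one_minus_alpha_prod = (1.0 - alphas_cumprod[timesteps]) ** 0.5

    # Broadcast the per-timestep scalars over the sample dimensions.
    while sqrt_alpha_prod.dim() < original_samples.dim():
        sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

    return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise

# Example (illustrative shapes): x0 = torch.randn(2, 4, 64, 64); eps = torch.randn_like(x0)
# noisy = add_noise_sketch(x0, eps, torch.tensor([10, 500]))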
diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index d8d9e96e6fcd..778b6e314c0d 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -398,10 +398,10 @@ def _encode_prompt( def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - image_embeds: torch.FloatTensor, - negative_image_embeds: torch.FloatTensor, + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray], + image_embeds: torch.Tensor, + negative_image_embeds: torch.Tensor, negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, @@ -409,9 +409,9 @@ def __call__( guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -421,10 +421,10 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image` or `np.ndarray`): + image (`torch.Tensor`, `PIL.Image.Image` or `np.ndarray`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - mask_image (`PIL.Image.Image`,`torch.FloatTensor` or `np.ndarray`): + mask_image (`PIL.Image.Image`,`torch.Tensor` or `np.ndarray`): `Image`, or a tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while black pixels will be preserved. You can pass a pytorch tensor as mask only if the image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, so the @@ -432,9 +432,9 @@ def __call__( image or numpy array, mask should also be a either PIL image or numpy array. If it is a PIL image, it will be converted to a single channel (luminance) before use. If it is a nummpy array, the expected shape is `(H, W)`. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -457,7 +457,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -466,7 +466,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 7d9be4570f6e..b5152d71cb6b 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -115,14 +115,14 @@ class KandinskyPriorPipelineOutput(BaseOutput): Output class for KandinskyPriorPipeline. Args: - image_embeds (`torch.FloatTensor`) + image_embeds (`torch.Tensor`) clip image embeddings for text prompt negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) clip image embeddings for unconditional tokens """ - image_embeds: Union[torch.FloatTensor, np.ndarray] - negative_image_embeds: Union[torch.FloatTensor, np.ndarray] + image_embeds: Union[torch.Tensor, np.ndarray] + negative_image_embeds: Union[torch.Tensor, np.ndarray] class KandinskyPriorPipeline(DiffusionPipeline): @@ -173,12 +173,12 @@ def __init__( @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, - images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]], weights: List[float], num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, negative_prior_prompt: Optional[str] = None, negative_prompt: str = "", guidance_scale: float = 4.0, @@ -188,7 +188,7 @@ def interpolate( Function invoked when using the prior pipeline for interpolation. Args: - images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): list of weights for each condition in `images_and_prompts` @@ -200,7 +200,7 @@ def interpolate( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
@@ -403,7 +403,7 @@ def __call__( num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, output_type: Optional[str] = "pt", return_dict: bool = True, @@ -425,7 +425,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py index 4b977af0d6f7..471db61556f5 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py @@ -123,15 +123,15 @@ def num_timesteps(self): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -142,9 +142,9 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. @@ -164,7 +164,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
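The annotation change running through these hunks is easy to verify in plain PyTorch: `torch.FloatTensor` is the legacy type for float32 CPU tensors only, so the old hints did not describe the fp16 or CUDA tensors these pipelines routinely receive, while `torch.Tensor` covers every dtype and device. A minimal standalone sketch (no diffusers required):

import torch

fp32_cpu = torch.randn(2, 4)                       # float32 on CPU
fp16_cpu = torch.randn(2, 4, dtype=torch.float16)  # float16 on CPU

# torch.FloatTensor only matches float32 CPU tensors.
print(isinstance(fp32_cpu, torch.FloatTensor))  # True
print(isinstance(fp16_cpu, torch.FloatTensor))  # False: half precision is not a FloatTensor

# torch.Tensor matches every dtype and device, which is what the pipelines actually accept.
print(isinstance(fp32_cpu, torch.Tensor))       # True
print(isinstance(fp16_cpu, torch.Tensor))       # True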
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py index 06d94d2cb79f..9db767681b04 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py @@ -213,9 +213,9 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -259,7 +259,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -442,7 +442,7 @@ def set_progress_bar_config(self, **kwargs): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -453,9 +453,9 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -469,7 +469,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -509,7 +509,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -518,7 +518,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -681,8 +681,8 @@ def set_progress_bar_config(self, **kwargs): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -692,7 +692,7 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -707,7 +707,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -746,7 +746,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
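For the combined pipeline touched above, a short usage sketch may help. The checkpoint name and prompt are illustrative, and the call only uses arguments that appear in the signature shown in the diff (`negative_prompt`, `num_inference_steps`, `prior_guidance_scale`, `generator`):

import torch
from diffusers import AutoPipelineForText2Image

# Illustrative checkpoint; AutoPipelineForText2Image resolves it to the Kandinsky 2.2 combined pipeline.
pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
).to("cuda")

generator = torch.Generator(device="cuda").manual_seed(0)
image = pipe(
    prompt="a portrait of an astronaut riding a horse, 4k photo",
    negative_prompt="low quality, bad quality",
    num_inference_steps=25,
    prior_guidance_scale=1.0,
    generator=generator,
).images[0]
image.save("kandinsky22_text2img.png")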
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index de87dd3c345d..0130c3951b38 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -151,18 +151,18 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - hint: torch.FloatTensor, + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], + hint: torch.Tensor, height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -172,11 +172,11 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - hint (`torch.FloatTensor`): + hint (`torch.Tensor`): The controlnet condition. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -199,7 +199,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -208,7 +208,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
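The `callback` hook whose signature changes above now receives the in-progress latents as a `torch.Tensor`. A hedged sketch of wiring it into the ControlNet decoder just shown; the checkpoint names are the commonly used community ones, and the random `hint` is only a stand-in for a real depth map of shape `(batch, 3, H, W)`:

import torch
from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline

def log_latents(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Invoked every `callback_steps` denoising steps with the current latents.
    print(f"step={step} t={timestep} latents shape={tuple(latents.shape)} dtype={latents.dtype}")

prior = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
).to("cuda")
pipe = KandinskyV22ControlnetPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
).to("cuda")

image_embeds, negative_image_embeds = prior("a robot walking in a forest, 4k photo").to_tuple()
hint = torch.rand(1, 3, 512, 512, dtype=torch.float16, device="cuda")  # stand-in depth hint

image = pipe(
    image_embeds=image_embeds,
    negative_image_embeds=negative_image_embeds,
    hint=hint,
    height=512,
    width=512,
    callback=log_latents,
    callback_steps=5,
).images[0]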
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py index c3ac7bcf6099..12be1534c642 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -206,10 +206,10 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - hint: torch.FloatTensor, + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], + hint: torch.Tensor, height: int = 512, width: int = 512, num_inference_steps: int = 100, @@ -218,7 +218,7 @@ def __call__( num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -226,9 +226,9 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -238,9 +238,9 @@ def __call__( denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - hint (`torch.FloatTensor`): + hint (`torch.Tensor`): The controlnet condition. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. @@ -265,7 +265,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index 3fdae934adfe..899273a1a736 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -190,9 +190,9 @@ def num_timesteps(self): @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], height: int = 512, width: int = 512, num_inference_steps: int = 100, @@ -210,9 +210,9 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -222,7 +222,7 @@ def __call__( denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. 
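The image-to-image decoder above accepts either a `PIL.Image.Image` or a `torch.Tensor` batch for `image`, together with prior embeddings passed as plain `torch.Tensor`s. A sketch of the two-stage flow; the grey placeholder image stands in for a real photo and the checkpoints are the usual community ones:

import torch
from PIL import Image
from diffusers import KandinskyV22PriorPipeline, KandinskyV22Img2ImgPipeline

prior = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
).to("cuda")
pipe = KandinskyV22Img2ImgPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
).to("cuda")

init_image = Image.new("RGB", (512, 512), color=(110, 140, 180))  # stand-in for a real input photo
image_embeds, negative_image_embeds = prior("a fantasy landscape, cinematic lighting").to_tuple()

image = pipe(
    image=init_image,
    image_embeds=image_embeds,
    negative_image_embeds=negative_image_embeds,
    strength=0.3,            # lower strength keeps more of the input image
    num_inference_steps=50,
    height=512,
    width=512,
).images[0]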
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 2fb8731f8a7f..b5ba7a0011a1 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -294,17 +294,17 @@ def num_timesteps(self): @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -315,7 +315,7 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. image (`PIL.Image.Image`): `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will @@ -325,7 +325,7 @@ def __call__( black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. @@ -345,7 +345,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
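Per the inpainting docstring above, the mask is single-channel: white (1) marks pixels to repaint, black (0) marks pixels to keep, and a NumPy mask is expected in `(H, W)` layout. A small sketch of building such a mask; the pipeline call itself is omitted and would pass it as `mask_image` next to `image`, `image_embeds`, and `negative_image_embeds`:

import numpy as np

# 512x512 single-channel mask in (H, W) layout:
#   1.0 (white) -> region the inpainting pipeline will repaint
#   0.0 (black) -> region preserved from the input image
mask = np.zeros((512, 512), dtype=np.float32)
mask[:256, :] = 1.0  # repaint the top half of the image

# A torch.Tensor mask works too, as long as the input image is also a tensor and the
# mask keeps one channel, e.g. shape (B, H, W, 1) as noted in the docstring above.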
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 1455e20ab5d9..f2134b22b40b 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -132,12 +132,12 @@ def __init__( @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, - images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]], weights: List[float], num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, negative_prior_prompt: Optional[str] = None, negative_prompt: str = "", guidance_scale: float = 4.0, @@ -147,7 +147,7 @@ def interpolate( Function invoked when using the prior pipeline for interpolation. Args: - images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): list of weights for each condition in `images_and_prompts` @@ -159,7 +159,7 @@ def interpolate( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -376,7 +376,7 @@ def __call__( num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, output_type: Optional[str] = "pt", # pt only return_dict: bool = True, @@ -400,7 +400,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
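`interpolate` above mixes text prompts and images (or precomputed `torch.Tensor` embeddings) according to `weights` and returns blended image embeddings that any of the decoders can consume. A hedged sketch with placeholder images; the weights here are chosen to sum to 1:

import torch
from PIL import Image
from diffusers import KandinskyV22PriorPipeline

prior = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
).to("cuda")

img1 = Image.new("RGB", (512, 512), color=(200, 60, 60))  # stand-ins for real images
img2 = Image.new("RGB", (512, 512), color=(60, 60, 200))

images_and_prompts = ["a cat", img1, img2]
weights = [0.4, 0.3, 0.3]

out = prior.interpolate(images_and_prompts, weights, num_inference_steps=25)
image_embeds = out.image_embeds                    # torch.Tensor, ready for a Kandinsky 2.2 decoder
negative_image_embeds = out.negative_image_embeds  # torch.Tensor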
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 26442735e80d..ec6509bb3cb5 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -156,12 +156,12 @@ def get_timesteps(self, num_inference_steps, strength, device): @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, - images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]], weights: List[float], num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, negative_prior_prompt: Optional[str] = None, negative_prompt: str = "", guidance_scale: float = 4.0, @@ -171,7 +171,7 @@ def interpolate( Function invoked when using the prior pipeline for interpolation. Args: - images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): list of weights for each condition in `images_and_prompts` @@ -183,7 +183,7 @@ def interpolate( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -418,7 +418,7 @@ def __call__( Conceptually, indicates how much to transform the reference `emb`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. - emb (`torch.FloatTensor`): + emb (`torch.Tensor`): The image embedding. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py index 85d6418d07cf..d7ff59e001e7 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py @@ -87,11 +87,11 @@ def encode_prompt( num_images_per_prompt=1, device=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, _cut_context=False, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. 
@@ -109,16 +109,16 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. """ if prompt is not None and negative_prompt is not None: @@ -334,10 +334,10 @@ def __call__( height: Optional[int] = 1024, width: Optional[int] = 1024, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, latents=None, @@ -380,16 +380,16 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between @@ -398,7 +398,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. 
callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py index 16a57b6b8c3f..df46756a17ef 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py @@ -112,11 +112,11 @@ def encode_prompt( num_images_per_prompt=1, device=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, _cut_context=False, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -134,16 +134,16 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. 
""" if prompt is not None and negative_prompt is not None: @@ -403,17 +403,17 @@ def num_timesteps(self): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None, strength: float = 0.3, num_inference_steps: int = 25, guidance_scale: float = 3.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -427,7 +427,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): @@ -454,16 +454,16 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. 
Choose between diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index fce694d1d0bd..11e3781aaf7c 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -63,6 +63,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -78,14 +79,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -96,6 +101,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -222,8 +237,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -243,10 +258,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -550,7 +565,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -563,7 +578,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -613,7 +628,7 @@ def check_inputs( prompt: Union[str, List[str]], strength: float, callback_steps: int, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image=None, ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, @@ -694,10 +709,10 @@ def __call__( guidance_scale: float = 8.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -739,16 +754,16 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index a69d49f1ffbb..7f3495258402 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -67,6 +67,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -82,14 +83,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -100,6 +105,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -206,8 +221,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -227,10 +242,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -497,7 +512,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -510,7 +525,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -550,7 +565,7 @@ def check_inputs( height: int, width: int, callback_steps: int, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image=None, ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, @@ -631,10 +646,10 @@ def __call__( guidance_scale: float = 8.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -676,16 +691,16 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index f39cbc839652..f6f3531a8835 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -74,7 +74,7 @@ def __call__( guidance_scale: Optional[float] = 1.0, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, **kwargs, @@ -98,7 +98,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -465,17 +465,17 @@ def __init__(self, config: LDMBertConfig): def forward( self, - hidden_states: torch.FloatTensor, - attention_mask: torch.FloatTensor, - layer_head_mask: torch.FloatTensor, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size + hidden_states (`torch.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + layer_head_mask (`torch.Tensor`): mask for attention heads in a given layer of size `(encoder_attention_heads,)`. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under @@ -587,7 +587,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -615,7 +615,7 @@ def forward( - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. 
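The `sigmas` argument added to `retrieve_timesteps` in the latent-consistency pipelines above makes three schedule sources mutually exclusive: custom `timesteps`, custom `sigmas`, or a plain `num_inference_steps`. A standalone paraphrase of that precedence (not the diffusers helper itself, just its dispatch logic):

from typing import List, Optional

def pick_schedule_source(
    num_inference_steps: Optional[int] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
) -> str:
    # Mirrors the validation in `retrieve_timesteps`: `timesteps` and `sigmas` cannot both be set,
    # and either one overrides the scheduler's default spacing derived from `num_inference_steps`.
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed.")
    if timesteps is not None:
        return "scheduler.set_timesteps(timesteps=...), if the scheduler supports custom timesteps"
    if sigmas is not None:
        return "scheduler.set_timesteps(sigmas=...), if the scheduler supports custom sigmas"
    return f"scheduler.set_timesteps({num_inference_steps})"

print(pick_schedule_source(num_inference_steps=4))
print(pick_schedule_source(sigmas=[0.7, 0.5, 0.3, 0.1]))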
diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py index 619be13a8f36..9bba5d7719a9 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py @@ -502,8 +502,8 @@ def encode_prompt( enable_edit_guidance, negative_prompt=None, editing_prompt=None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - editing_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + editing_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -523,10 +523,10 @@ def encode_prompt( less than `1`). editing_prompt (`str` or `List[str]`, *optional*): Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead. - editing_prompt_embeds (`torch.FloatTensor`, *optional*): + editing_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -704,13 +704,13 @@ def __call__( return_dict: bool = True, editing_prompt: Optional[Union[str, List[str]]] = None, editing_prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, reverse_editing_direction: Optional[Union[bool, List[bool]]] = False, edit_guidance_scale: Optional[Union[float, List[float]]] = 5, edit_warmup_steps: Optional[Union[int, List[int]]] = 0, edit_cooldown_steps: Optional[Union[int, List[int]]] = None, edit_threshold: Optional[Union[float, List[float]]] = 0.9, - user_mask: Optional[torch.FloatTensor] = None, + user_mask: Optional[torch.Tensor] = None, sem_guidance: Optional[List[torch.Tensor]] = None, use_cross_attn_mask: bool = False, use_intersect_mask: bool = True, @@ -748,7 +748,7 @@ def __call__( editing_prompt_embeds (`torch.Tensor>`, *optional*): Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be specified via `reverse_editing_direction`. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): @@ -765,7 +765,7 @@ def __call__( Masking threshold of guidance. Threshold should be proportional to the image region that is modified. 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). - user_mask (`torch.FloatTensor`, *optional*): + user_mask (`torch.Tensor`, *optional*): User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit masks do not meet user preferences. 
sem_guidance (`List[torch.Tensor]`, *optional*): @@ -1216,7 +1216,7 @@ def invert( Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead. - Args: + Args: image (`PipelineImageInput`): Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect ratio. diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py index 5ea7c2c14551..b8b46ce4fff1 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py @@ -409,14 +409,14 @@ def encode_prompt( num_images_per_prompt: int = 1, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, enable_edit_guidance: bool = True, editing_prompt: Optional[str] = None, - editing_prompt_embeds: Optional[torch.FloatTensor] = None, - editing_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + editing_prompt_embeds: Optional[torch.Tensor] = None, + editing_pooled_prompt_embeds: Optional[torch.Tensor] = None, ) -> object: r""" Encodes the prompt into text encoder hidden states. @@ -432,11 +432,11 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -450,11 +450,11 @@ def encode_prompt( editing_prompt (`str` or `List[str]`, *optional*): Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass `editing_prompt_embeds` instead. - editing_prompt_embeds (`torch.FloatTensor`, *optional*): + editing_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input argument. - editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + editing_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt` input argument. 
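Aside on the annotation sweep in the `encode_prompt` hunks above: `torch.FloatTensor` names the legacy CPU float32 tensor class, while the pre-computed embeddings these arguments accept are typically half-precision CUDA tensors, so `torch.Tensor` is the accurate annotation. A hedged usage sketch follows, assuming a standard Stable Diffusion checkpoint and the public `encode_prompt` method; the model id and prompts are illustrative, not part of this diff. It pre-computes embeddings once and reuses them via `prompt_embeds`/`negative_prompt_embeds`.

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16  # illustrative checkpoint
).to("cuda")

# Encode the text once; the results are fp16 CUDA tensors, i.e. torch.Tensor
# values that the old torch.FloatTensor annotation did not describe.
prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
    "a watercolor painting of a lighthouse",
    device="cuda",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
    negative_prompt="blurry, low quality",
)

# Reuse the cached embeddings across calls instead of re-encoding the prompt.
image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds).images[0]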
@@ -713,7 +713,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -726,7 +726,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -804,8 +804,8 @@ def __call__( denoising_end: Optional[float] = None, negative_prompt: Optional[Union[str, List[str]]] = None, negative_prompt_2: Optional[Union[str, List[str]]] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -824,7 +824,7 @@ def __call__( sem_guidance: Optional[List[torch.Tensor]] = None, use_cross_attn_mask: bool = False, use_intersect_mask: bool = False, - user_mask: Optional[torch.FloatTensor] = None, + user_mask: Optional[torch.Tensor] = None, attn_store_steps: Optional[List[int]] = [], store_averaged_over_steps: bool = True, clip_skip: Optional[int] = None, @@ -851,11 +851,11 @@ def __call__( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -869,7 +869,7 @@ def __call__( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -1449,7 +1449,7 @@ def invert( Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead. - Args: + Args: image (`PipelineImageInput`): Input for the image(s) that are to be edited. 
Multiple input images have to default to the same aspect ratio. diff --git a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py index 2bd828f0df24..728635da6d4d 100644 --- a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py +++ b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py @@ -120,8 +120,8 @@ def _encode_prompt( num_waveforms_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -139,10 +139,10 @@ def _encode_prompt( The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -427,11 +427,11 @@ def __call__( num_waveforms_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, output_type: Optional[str] = "np", @@ -465,21 +465,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index 263d507bbc75..b225fd71edf8 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -266,7 +266,7 @@ def check_inputs(self, image, height, width, callback_steps): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -393,9 +393,9 @@ def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free @torch.no_grad() def __call__( self, - example_image: Union[torch.FloatTensor, PIL.Image.Image], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + example_image: Union[torch.Tensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -404,22 +404,22 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" The call function to the pipeline for generation. Args: - example_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + example_image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): An example image to guide image generation. - image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): `Image` or tensor representing an image batch to be inpainted (parts of the image are masked out with `mask_image` and repainted according to `prompt`). - mask_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + mask_image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): `Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted, while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. 
If it's a tensor, it should contain one color channel (L) instead of 3, so the @@ -445,7 +445,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -456,7 +456,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index acd8659d2a7c..26569319bc21 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection -from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from ...models.lora import adjust_lora_scale_text_encoder @@ -43,6 +43,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor +from ...video_processor import VideoProcessor from ..free_init_utils import FreeInitMixin from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin @@ -89,28 +90,6 @@ ] -# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid -def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - if output_type == "np": - outputs = np.stack(outputs) - - elif output_type == "pt": - outputs = torch.stack(outputs) - - elif not output_type == "pil": - raise ValueError(f"{output_type} does not exist. 
Please choose one of ['np', 'pt', 'pil']") - - return outputs - - def prepare_mask_coef_by_statistics(num_frames: int, cond_frame: int, motion_scale: int): assert num_frames > 0, "video_length should be greater than 0" @@ -218,7 +197,7 @@ def __init__( image_encoder=image_encoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt def encode_prompt( @@ -228,8 +207,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -249,10 +228,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -621,7 +600,7 @@ def prepare_masked_condition( ) _, _, _, scaled_height, scaled_width = shape - image = self.image_processor.preprocess(image) + image = self.video_processor.preprocess(image) image = image.to(device, dtype) if isinstance(generator, list): @@ -701,11 +680,11 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, motion_scale: int = 0, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -746,20 +725,20 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -770,8 +749,7 @@ def __call__( added. Must be between 0 and 8. Set between 0-2 to only increase the amount of motion. Set between 3-5 to create looping motion. Set between 6-8 to perform motion with image style transfer. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. @@ -959,7 +937,7 @@ def __call__( video = latents else: video_tensor = self.decode_latents(latents) - video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type) # 10. Offload all models self.maybe_free_model_hooks() diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py index f7d9785043d1..941de2b47a66 100644 --- a/src/diffusers/pipelines/pipeline_loading_utils.py +++ b/src/diffusers/pipelines/pipeline_loading_utils.py @@ -609,6 +609,7 @@ def load_sub_model( ): """Helper method to load the module `name` from `library_name` and `class_name`""" # retrieve class candidates + class_obj, class_candidates = get_class_obj_and_candidates( library_name, class_name, diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py index 22f0c507e5ee..6d3f5c1e274d 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -175,6 +175,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -190,14 +191,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -208,6 +213,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -281,10 +296,10 @@ def encode_prompt( negative_prompt: str = "", num_images_per_prompt: int = 1, device: Optional[torch.device] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, clean_caption: bool = False, max_sequence_length: int = 120, **kwargs, @@ -305,10 +320,10 @@ def encode_prompt( number of images that should be generated per prompt device: (`torch.device`, *optional*): torch device to place the resulting embeddings on - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the "" string. 
clean_caption (`bool`, defaults to `False`): @@ -351,7 +366,7 @@ def encode_prompt( ): removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" + "The following part of your input was truncated because T5 can only handle sequences up to" f" {max_length} tokens: {removed_text}" ) @@ -672,20 +687,21 @@ def __call__( negative_prompt: str = "", num_inference_steps: int = 20, timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: float = 4.5, num_images_per_prompt: Optional[int] = 1, height: Optional[int] = None, width: Optional[int] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, use_resolution_binning: bool = True, @@ -707,8 +723,13 @@ def __call__( The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps are used. Must be in descending order. + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 4.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -727,18 +748,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, text embeddings will be generated from `prompt` input argument. - prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings. + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_attention_mask (`torch.FloatTensor`, *optional*): + negative_prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for negative text embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between @@ -747,7 +768,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -837,7 +858,9 @@ def __call__( prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) # 5. Prepare latents. latent_channels = self.transformer.config.in_channels diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py index e3dd3ddc0404..1db7e5d9ab8a 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py @@ -23,7 +23,7 @@ from ...image_processor import PixArtImageProcessor from ...models import AutoencoderKL, Transformer2DModel -from ...schedulers import DPMSolverMultistepScheduler +from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( BACKENDS_MAPPING, deprecate, @@ -119,6 +119,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -134,14 +135,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. 
+ sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -152,6 +157,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -188,7 +203,7 @@ def __init__( text_encoder: T5EncoderModel, vae: AutoencoderKL, transformer: Transformer2DModel, - scheduler: DPMSolverMultistepScheduler, + scheduler: KarrasDiffusionSchedulers, ): super().__init__() @@ -199,7 +214,7 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt + # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt with 120->300 def encode_prompt( self, prompt: Union[str, List[str]], @@ -207,12 +222,12 @@ def encode_prompt( negative_prompt: str = "", num_images_per_prompt: int = 1, device: Optional[torch.device] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, clean_caption: bool = False, - max_sequence_length: int = 120, + max_sequence_length: int = 300, **kwargs, ): r""" @@ -231,15 +246,15 @@ def encode_prompt( number of images that should be generated per prompt device: (`torch.device`, *optional*): torch device to place the resulting embeddings on - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. 
For PixArt-Alpha, it's should be the embeddings of the "" string. clean_caption (`bool`, defaults to `False`): If `True`, the function will preprocess and clean the provided caption before encoding. - max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt. + max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt. """ if "mask_feature" in kwargs: @@ -277,7 +292,7 @@ def encode_prompt( ): removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" + "The following part of your input was truncated because T5 can only handle sequences up to" f" {max_length} tokens: {removed_text}" ) @@ -599,20 +614,21 @@ def __call__( negative_prompt: str = "", num_inference_steps: int = 20, timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: float = 4.5, num_images_per_prompt: Optional[int] = 1, height: Optional[int] = None, width: Optional[int] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, use_resolution_binning: bool = True, @@ -634,8 +650,13 @@ def __call__( The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps are used. Must be in descending order. + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 4.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -654,18 +675,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings. + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_attention_mask (`torch.FloatTensor`, *optional*): + negative_prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for negative text embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between @@ -674,7 +695,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -686,7 +707,7 @@ def __call__( If set to `True`, the requested height and width are first mapped to the closest resolutions using `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to the requested resolution. Useful for generating non-square images. - max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`. + max_sequence_length (`int` defaults to 300): Maximum sequence length to use with the `prompt`. Examples: @@ -763,7 +784,9 @@ def __call__( prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) # 5. Prepare latents. 
latent_channels = self.transformer.config.in_channels diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index fe83a860aeac..e068387b6162 100644 --- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -224,10 +224,10 @@ def __call__( num_images_per_prompt: int = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, editing_prompt: Optional[Union[str, List[str]]] = None, editing_prompt_embeddings: Optional[torch.Tensor] = None, @@ -268,7 +268,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -279,7 +279,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 9c29ebafc639..254a5996885f 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -70,7 +70,7 @@ class ShapEPipelineOutput(BaseOutput): Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`]. Args: - images (`torch.FloatTensor`) + images (`torch.Tensor`) A list of images for 3D rendering. """ @@ -188,7 +188,7 @@ def __call__( num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, frame_size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent, mesh @@ -210,7 +210,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents tensor is generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 700ca5db6f07..7cc145e4c3e2 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -70,7 +70,7 @@ class ShapEPipelineOutput(BaseOutput): Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`]. Args: - images (`torch.FloatTensor`) + images (`torch.Tensor`) A list of images for 3D rendering. """ @@ -169,7 +169,7 @@ def __call__( num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, frame_size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent, mesh @@ -179,7 +179,7 @@ def __call__( The call function to the pipeline for generation. Args: - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can also accept image latents as image, but if passing latents directly it is not encoded again. num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -190,7 +190,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py index 047c6f7dd863..9d9f9d9b2ab1 100644 --- a/src/diffusers/pipelines/shap_e/renderer.py +++ b/src/diffusers/pipelines/shap_e/renderer.py @@ -844,7 +844,7 @@ def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1], math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). - args: + Args: rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples: number of ts to sample. 
prev_model_outputs: model outputs from the previous rendering step, including diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py index 9db92cbdd181..fd644a5dfdba 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py @@ -129,10 +129,10 @@ def encode_prompt( do_classifier_free_guidance, prompt=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, ): if prompt_embeds is None: # get prompt text embeddings @@ -285,18 +285,18 @@ def num_timesteps(self): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeddings: Union[torch.Tensor, List[torch.Tensor]], prompt: Union[str, List[str]] = None, num_inference_steps: int = 10, guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -306,7 +306,7 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embedding (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embedding (`torch.Tensor` or `List[torch.Tensor]`): Image Embeddings either extracted from an image or generated by a Prior Model. prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. @@ -322,17 +322,17 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `decoder_guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + negative_prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input argument. @@ -341,7 +341,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py index d27e727231c9..6724b60cc424 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py @@ -71,6 +71,7 @@ class StableCascadeCombinedPipeline(DiffusionPipeline): """ _load_connected_pipes = True + _optional_components = ["prior_feature_extractor", "prior_image_encoder"] def __init__( self, @@ -161,13 +162,13 @@ def __call__( num_inference_steps: int = 12, decoder_guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -186,17 +187,17 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + negative_prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -229,7 +230,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py index ce17294a257d..1e9e33a42030 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py @@ -54,19 +54,19 @@ class StableCascadePriorPipelineOutput(BaseOutput): Output class for WuerstchenPriorPipeline. Args: - image_embeddings (`torch.FloatTensor` or `np.ndarray`) + image_embeddings (`torch.Tensor` or `np.ndarray`) Prior image embeddings for text prompt - prompt_embeds (`torch.FloatTensor`): + prompt_embeds (`torch.Tensor`): Text embeddings for the prompt. - negative_prompt_embeds (`torch.FloatTensor`): + negative_prompt_embeds (`torch.Tensor`): Text embeddings for the negative prompt. 
""" - image_embeddings: Union[torch.FloatTensor, np.ndarray] - prompt_embeds: Union[torch.FloatTensor, np.ndarray] - prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray] - negative_prompt_embeds: Union[torch.FloatTensor, np.ndarray] - negative_prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray] + image_embeddings: Union[torch.Tensor, np.ndarray] + prompt_embeds: Union[torch.Tensor, np.ndarray] + prompt_embeds_pooled: Union[torch.Tensor, np.ndarray] + negative_prompt_embeds: Union[torch.Tensor, np.ndarray] + negative_prompt_embeds_pooled: Union[torch.Tensor, np.ndarray] class StableCascadePriorPipeline(DiffusionPipeline): @@ -150,10 +150,10 @@ def encode_prompt( do_classifier_free_guidance, prompt=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, ): if prompt_embeds is None: # get prompt text embeddings @@ -374,14 +374,14 @@ def __call__( timesteps: List[float] = None, guidance_scale: float = 4.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pt", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -409,21 +409,21 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `decoder_guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
- negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + negative_prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If not provided, image embeddings will be generated from `image` input argument if existing. num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -431,7 +431,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py index 311347dccaa8..2e34dcb83c01 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py @@ -288,7 +288,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`): + image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.Tensor`): `Image`, or tensor representing an image batch which will be upscaled. * num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -329,7 +329,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. 
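Editor's note — a minimal usage sketch (not part of the patch) of why the annotations above move from `torch.FloatTensor` to `torch.Tensor`: `torch.FloatTensor` is the legacy alias for a float32 CPU tensor, while the pre-generated `latents`, `prompt_embeds`, and `image_embeds` these pipelines accept are routinely fp16 CUDA tensors. The checkpoint id and latent shape below are assumptions for illustration only.

    import torch
    from diffusers import StableDiffusionPipeline

    # assumed checkpoint; any SD 1.x checkpoint with a 4-channel UNet behaves the same way
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to("cuda")

    # an fp16 CUDA tensor is a valid `latents` input, but it is not a `torch.FloatTensor`
    latents = torch.randn(1, 4, 64, 64, dtype=torch.float16, device="cuda")
    image = pipe("a photo of an astronaut riding a horse", latents=latents).images[0]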
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py index bee6ea7b1132..cd9ec57fb879 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py @@ -197,7 +197,7 @@ def check_inputs( ) # verify batch size of prompt and image are same if image is a list or tensor or numpy array - if isinstance(image, list) or isinstance(image, np.ndarray): + if isinstance(image, (list, np.ndarray)): if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -395,7 +395,7 @@ def __call__( [`schedulers.DDIMScheduler`], will be ignored for others. generator (`np.random.RandomState`, *optional*): A np.random.RandomState to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index d8da3b0c4bee..c92fe583781b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import inspect from typing import Any, Callable, Dict, List, Optional, Union import warnings @@ -23,6 +22,7 @@ from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin @@ -79,6 +79,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -94,14 +95,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. 
Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -112,6 +117,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -264,8 +279,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -296,8 +311,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -317,10 +332,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -681,7 +696,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -694,7 +709,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. 
Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -749,22 +764,25 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -785,6 +803,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. @@ -799,18 +821,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -830,11 +852,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -866,6 +888,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor @@ -934,7 +959,9 @@ def __call__( ) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) # 5. 
Prepare latent variables num_channels_latents = self.unet.config.in_channels diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 1f822971568f..d29e519974b4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -156,8 +156,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -189,8 +189,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -210,10 +210,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -609,7 +609,7 @@ def __call__( self, prompt: Union[str, List[str]] = None, image: PipelineImageInput = None, - depth_map: Optional[torch.FloatTensor] = None, + depth_map: Optional[torch.Tensor] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -617,8 +617,8 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -633,10 +633,10 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can accept image latents as `image` only if `depth_map` is not `None`. 
- depth_map (`torch.FloatTensor`, *optional*): + depth_map (`torch.Tensor`, *optional*): Depth prediction to be used as additional conditioning for the image generation process. If not defined, it automatically predicts the depth with `self.depth_estimator`. strength (`float`, *optional*, defaults to 0.8): @@ -662,10 +662,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index c300c7a2f3f4..93a8bd160318 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -207,7 +207,7 @@ def check_inputs(self, image, height, width, callback_steps): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -248,7 +248,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype @torch.no_grad() def __call__( self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], + image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor], height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -256,17 +256,17 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" The call function to the pipeline for generation. Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): Image or images to guide image generation. If you provide a tensor, it needs to be compatible with [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). 
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -287,7 +287,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -298,7 +298,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 1b31c099b177..d806f230a06a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -21,6 +21,7 @@ from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin @@ -115,6 +116,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -130,14 +132,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -148,6 +154,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -300,8 +316,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -333,8 +349,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -354,10 +370,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -769,7 +785,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -782,7 +798,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -833,20 +849,23 @@ def __call__( strength: float = 0.8, num_inference_steps: Optional[int] = 50, timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: Optional[float] = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: int = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -856,7 +875,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a @@ -875,6 +894,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. @@ -889,14 +912,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -912,11 +935,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -947,6 +970,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, @@ -1009,7 +1035,9 @@ def __call__( image = self.image_processor.preprocess(image) # 5. 
set timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 636ef5a562ff..e82928a195b2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -21,6 +21,7 @@ from packaging import version from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...configuration_utils import FrozenDict from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin @@ -179,6 +180,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -194,14 +196,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -212,6 +218,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -373,8 +389,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -406,8 +422,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -427,10 +443,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -917,7 +933,7 @@ def get_timesteps(self, num_inference_steps, strength, device): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -930,7 +946,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -978,28 +994,31 @@ def __call__( prompt: Union[str, List[str]] = None, image: PipelineImageInput = None, mask_image: PipelineImageInput = None, - masked_image_latents: torch.FloatTensor = None, + masked_image_latents: torch.Tensor = None, height: Optional[int] = None, width: Optional[int] = None, padding_mask_crop: Optional[int] = None, strength: float = 1.0, num_inference_steps: int = 50, timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: int = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -1009,14 +1028,14 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one @@ -1047,6 +1066,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. 
+ sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. @@ -1061,18 +1084,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -1088,11 +1111,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the @@ -1152,6 +1175,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor @@ -1221,7 +1247,9 @@ def __call__( ) # 4. set timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) timesteps, num_inference_steps = self.get_timesteps( num_inference_steps=num_inference_steps, strength=strength, device=device ) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 0bf5a92a4fcc..35166313aeac 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -13,13 +13,14 @@ # limitations under the License. import inspect -from typing import Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL.Image import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel @@ -168,15 +169,18 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], + cross_attention_kwargs: Optional[Dict[str, Any]] = None, **kwargs, ): r""" @@ -185,7 +189,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
- image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept image latents as `image`, but if passing latents directly it is not encoded again. num_inference_steps (`int`, *optional*, defaults to 100): @@ -210,14 +214,14 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): @@ -227,15 +231,18 @@ def __call__( return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during inference with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that, if specified, is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
Examples: @@ -290,6 +297,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + # 0. Check inputs self.check_inputs( prompt, @@ -409,6 +419,7 @@ def __call__( t, encoder_hidden_states=prompt_embeds, added_cond_kwargs=added_cond_kwargs, + cross_attention_kwargs=cross_attention_kwargs, return_dict=False, )[0] @@ -471,8 +482,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -490,10 +501,10 @@ def _encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 918dffe5199d..4d033133e5ec 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -221,7 +221,7 @@ def check_inputs(self, prompt, image, callback_steps): ) # verify batch size of prompt and image are same if image is a list or tensor - if isinstance(image, list) or isinstance(image, torch.Tensor): + if isinstance(image, (list, torch.Tensor)): if isinstance(prompt, str): batch_size = 1 else: @@ -267,10 +267,10 @@ def __call__( guidance_scale: float = 9.0, negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -279,7 +279,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide image upscaling. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be upscaled. 
If it's a tensor, it can be either a latent output from a Stable Diffusion model or an image tensor in the range `[-1, 1]`. It is considered a `latent` if `image.shape[1]` is `4`; otherwise, it is considered to be an image representation and @@ -299,7 +299,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -310,7 +310,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 2d04cf41d9b5..07aafe8821b8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -176,8 +176,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -209,8 +209,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -230,10 +230,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
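The recurring change in the hunks above, `torch.FloatTensor` -> `torch.Tensor`, only widens type hints: a `FloatTensor` is simply a float32 CPU `Tensor`, so callers that already pass float32 embeddings or latents keep satisfying the new annotations. A minimal sketch, not part of the diff (the embedding shape is illustrative only):

```python
import torch

# A float32 tensor of a typical SD v1 text-embedding shape (batch, tokens, dim);
# the shape here is illustrative, not mandated by the diff.
prompt_embeds = torch.randn(1, 77, 768)

# It satisfies the widened annotation...
assert isinstance(prompt_embeds, torch.Tensor)
# ...and is exactly what the old `torch.FloatTensor` hint meant (float32 on CPU).
assert prompt_embeds.dtype == torch.float32
```
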
@@ -468,7 +468,7 @@ def check_inputs( ) # verify batch size of prompt and image are same if image is a list or tensor or numpy array - if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray): + if isinstance(image, (list, np.ndarray, torch.Tensor)): if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -542,12 +542,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: int = None, @@ -558,7 +558,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be upscaled. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -577,14 +577,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -594,7 +594,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. 
If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index 02ddc65c7111..1839c4511060 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -257,8 +257,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -290,8 +290,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -311,10 +311,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -588,7 +588,7 @@ def noise_image_embeddings( self, image_embeds: torch.Tensor, noise_level: int, - noise: Optional[torch.FloatTensor] = None, + noise: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, ): """ @@ -644,19 +644,19 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 0, # prior args prior_num_inference_steps: int = 25, prior_guidance_scale: float = 4.0, - prior_latents: Optional[torch.FloatTensor] = None, + prior_latents: Optional[torch.Tensor] = None, clip_skip: Optional[int] = None, ): """ @@ -686,14 +686,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -702,7 +702,7 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -718,7 +718,7 @@ def __call__( prior_guidance_scale (`float`, *optional*, defaults to 4.0): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - prior_latents (`torch.FloatTensor`, *optional*): + prior_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image embedding generation in the prior denoising process. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 134ec39effc5..13915b39f034 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -166,8 +166,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -254,8 +254,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -275,10 +275,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. 
If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -537,7 +537,7 @@ def check_inputs( and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -569,7 +569,7 @@ def noise_image_embeddings( self, image_embeds: torch.Tensor, noise_level: int, - noise: Optional[torch.FloatTensor] = None, + noise: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, ): """ @@ -615,7 +615,7 @@ def noise_image_embeddings( @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, prompt: Union[str, List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, @@ -625,16 +625,16 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 0, - image_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.Tensor] = None, clip_skip: Optional[int] = None, ): r""" @@ -644,7 +644,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, either `prompt_embeds` will be used or prompt is initialized to `""`. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image` or tensor representing an image batch. The image is encoded to its CLIP embedding which the `unet` is conditioned on. The image is _not_ encoded by the `vae` and then used as the latents in the denoising process like it is in the standard Stable Diffusion text-guided image variation process. @@ -669,14 +669,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -685,7 +685,7 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -695,7 +695,7 @@ def __call__( noise_level (`int`, *optional*, defaults to `0`): The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in the final un-noised images. See [`StableUnCLIPPipeline.noise_image_embeddings`] for more details. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated CLIP embeddings to condition the `unet` on. These latents are not used in the denoising process. If you want to provide pre-generated latents, pass them to `__call__` as `latents`. 
clip_skip (`int`, *optional*): diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py index 6a16b06d2056..0fe76f7a09cd 100644 --- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -101,7 +101,7 @@ def forward(self, clip_input, images): #return images, False @torch.no_grad() - def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor): + def forward_onnx(self, clip_input: torch.Tensor, images: torch.Tensor): pooled_output = self.vision_model(clip_input)[1] # pooled_output image_embeds = self.visual_projection(pooled_output) diff --git a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py index 2adcb0a8c0a1..347d573e1fb7 100644 --- a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py @@ -254,8 +254,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -287,8 +287,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -308,10 +308,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
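Several hunks in this section also collapse chained `isinstance` checks into the tuple form (`check_inputs` in the upscale pipelines, `preprocess_mask` in the DiffEdit pipeline below). The two spellings are equivalent; a small sketch, not from the diff, makes that concrete:

```python
import numpy as np
import torch

for candidate in [torch.zeros(1, 3, 64, 64), np.zeros((64, 64, 3)), [1, 2, 3], "not an image"]:
    chained = (
        isinstance(candidate, list)
        or isinstance(candidate, torch.Tensor)
        or isinstance(candidate, np.ndarray)
    )
    tupled = isinstance(candidate, (list, np.ndarray, torch.Tensor))
    # Both spellings agree for every input; the tuple form is shorter and reads better.
    assert chained == tupled
```
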
@@ -746,12 +746,12 @@ def __call__( num_images_per_prompt: int = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, max_iter_to_alter: int = 25, @@ -789,14 +789,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -806,7 +806,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py index e89d7f77e8b1..967d525c7397 100644 --- a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py @@ -53,7 +53,7 @@ class DiffEditInversionPipelineOutput(BaseOutput): Output class for Stable Diffusion pipelines. Args: - latents (`torch.FloatTensor`) + latents (`torch.Tensor`) inverted latents tensor images (`List[PIL.Image.Image]` or `np.ndarray`) List of denoised PIL images of length `num_timesteps * batch_size` or numpy array of shape `(num_timesteps, @@ -61,7 +61,7 @@ class DiffEditInversionPipelineOutput(BaseOutput): diffusion pipeline. 
""" - latents: torch.FloatTensor + latents: torch.Tensor images: Union[List[PIL.Image.Image], np.ndarray] @@ -185,7 +185,7 @@ def preprocess(image): def preprocess_mask(mask, batch_size: int = 1): if not isinstance(mask, torch.Tensor): # preprocess mask - if isinstance(mask, PIL.Image.Image) or isinstance(mask, np.ndarray): + if isinstance(mask, (PIL.Image.Image, np.ndarray)): mask = [mask] if isinstance(mask, list): @@ -381,8 +381,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -414,8 +414,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -435,10 +435,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -831,15 +831,15 @@ def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep @replace_example_docstring(EXAMPLE_DOC_STRING) def generate_mask( self, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, target_prompt: Optional[Union[str, List[str]]] = None, target_negative_prompt: Optional[Union[str, List[str]]] = None, - target_prompt_embeds: Optional[torch.FloatTensor] = None, - target_negative_prompt_embeds: Optional[torch.FloatTensor] = None, + target_prompt_embeds: Optional[torch.Tensor] = None, + target_negative_prompt_embeds: Optional[torch.Tensor] = None, source_prompt: Optional[Union[str, List[str]]] = None, source_negative_prompt: Optional[Union[str, List[str]]] = None, - source_prompt_embeds: Optional[torch.FloatTensor] = None, - source_negative_prompt_embeds: Optional[torch.FloatTensor] = None, + source_prompt_embeds: Optional[torch.Tensor] = None, + source_negative_prompt_embeds: Optional[torch.Tensor] = None, num_maps_per_mask: Optional[int] = 10, mask_encode_strength: Optional[float] = 0.5, mask_thresholding_ratio: Optional[float] = 3.0, @@ -861,10 +861,10 @@ def generate_mask( target_negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). 
- target_prompt_embeds (`torch.FloatTensor`, *optional*): + target_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - target_negative_prompt_embeds (`torch.FloatTensor`, *optional*): + target_negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. source_prompt (`str` or `List[str]`, *optional*): @@ -873,11 +873,11 @@ def generate_mask( source_negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide semantic mask generation away from using DiffEdit. If not defined, you need to pass `source_negative_prompt_embeds` or `source_image` instead. - source_prompt_embeds (`torch.FloatTensor`, *optional*): + source_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings to guide the semantic mask generation. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from `source_prompt` input argument. - source_negative_prompt_embeds (`torch.FloatTensor`, *optional*): + source_negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings to negatively guide the semantic mask generation. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from `source_negative_prompt` input argument. @@ -1051,18 +1051,18 @@ def generate_mask( def invert( self, prompt: Optional[Union[str, List[str]]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, num_inference_steps: int = 50, inpaint_strength: float = 0.8, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, decode_latents: bool = False, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, lambda_auto_corr: float = 20.0, @@ -1095,10 +1095,10 @@ def invert( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. decode_latents (`bool`, *optional*, defaults to `False`): @@ -1111,7 +1111,7 @@ def invert( plain tuple. 
callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -1289,8 +1289,8 @@ def invert( def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, - image_latents: Union[torch.FloatTensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, + image_latents: Union[torch.Tensor, PIL.Image.Image] = None, inpaint_strength: Optional[float] = 0.8, num_inference_steps: int = 50, guidance_scale: float = 7.5, @@ -1298,12 +1298,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: int = None, @@ -1319,7 +1319,7 @@ def __call__( repainted, while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, 1, H, W)`. - image_latents (`PIL.Image.Image` or `torch.FloatTensor`): + image_latents (`PIL.Image.Image` or `torch.Tensor`): Partially noised image latents from the inversion process to be used as inputs for image generation. inpaint_strength (`float`, *optional*, defaults to 0.8): Indicates extent to inpaint the masked area. Must be between 0 and 1. When `inpaint_strength` is 1, the @@ -1343,14 +1343,14 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
output_type (`str`, *optional*, defaults to `"pil"`): @@ -1360,7 +1360,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py index 94043b7285c9..4b96e317dcc3 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py @@ -180,8 +180,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -213,8 +213,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -234,10 +234,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -541,12 +541,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -592,14 +592,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -609,7 +609,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py index c20e940b4db6..963f29f21f2e 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py @@ -238,8 +238,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -259,10 +259,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. 
If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -705,12 +705,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, gligen_normalize_constant: float = 28.7, @@ -764,14 +764,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -781,7 +781,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
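The widened `callback_on_step_end` parameter introduced in the inpaint and instruct-pix2pix hunks earlier still accepts a plain callable; `PipelineCallback` and `MultiPipelineCallbacks` objects are additionally recognized, in which case the pipeline reads `tensor_inputs` from the callback object itself. A hedged usage sketch with a plain callable follows; the model id, image URLs, and logging body are illustrative and not part of the diff:

```python
import torch
from diffusers import StableDiffusionInpaintPipeline
from diffusers.utils import load_image

def log_latent_stats(pipeline, step, timestep, callback_kwargs):
    # callback_kwargs holds the tensors requested via callback_on_step_end_tensor_inputs.
    latents = callback_kwargs["latents"]
    print(f"step {step:03d} | t={int(timestep)} | latent std {latents.std().item():.4f}")
    return callback_kwargs  # hand the (possibly modified) tensors back to the pipeline

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16  # illustrative checkpoint
).to("cuda")

init_image = load_image("https://example.com/bench.png")       # placeholder URL
mask_image = load_image("https://example.com/bench_mask.png")  # placeholder URL

result = pipe(
    prompt="a red park bench",
    image=init_image,
    mask_image=mask_image,
    callback_on_step_end=log_latent_stats,
    callback_on_step_end_tensor_inputs=["latents"],
).images[0]
```

The new `sigmas` argument added next to `timesteps` in the inpaint `__call__` follows the same opt-in pattern: it is forwarded to `retrieve_timesteps` and only takes effect with schedulers whose `set_timesteps` accepts a `sigmas` keyword.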
diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py index e2096be7e894..067ae5f6f172 100755 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -154,8 +154,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -187,8 +187,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -208,10 +208,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -469,12 +469,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, use_karras_sigmas: Optional[bool] = False, noise_sampler_seed: Optional[int] = None, @@ -512,14 +512,14 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -531,7 +531,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py index 3cfda4064d13..f7ca0c7c4daa 100644 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py @@ -207,10 +207,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -236,17 +236,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. 
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -584,11 +584,11 @@ def __call__( negative_prompt_2: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, original_size: Optional[Tuple[int, int]] = None, @@ -642,21 +642,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
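Note: the four embedding tensors documented above can be pre-computed once with `encode_prompt` and fed back into `__call__`, e.g. to avoid re-encoding the same text across calls or to plug in prompt weighting. A rough sketch, assuming the keyword interface shown in the hunk above, an SDXL base checkpoint, and the pipeline's usual `set_scheduler` sampler selector (none of which are introduced by this diff):

import torch
from diffusers import StableDiffusionXLKDiffusionPipeline

pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipe.set_scheduler("sample_dpmpp_2m")  # pick a k-diffusion sampler (assumed helper)

# Encode the text once...
(
    prompt_embeds,
    negative_prompt_embeds,
    pooled_prompt_embeds,
    negative_pooled_prompt_embeds,
) = pipe.encode_prompt(
    prompt="a watercolor painting of a lighthouse",
    negative_prompt="blurry, low quality",
    device=pipe.device,
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
)

# ...then reuse the tensors instead of passing `prompt` again.
image = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
).images[0]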
diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index 880d6dcf401a..a100a38b04ea 100644 --- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -80,6 +80,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -95,14 +96,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -113,6 +118,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -239,8 +254,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -272,8 +287,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -293,10 +308,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -651,7 +666,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -664,7 +679,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -719,16 +734,17 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 49, timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -751,6 +767,14 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 5.0): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. @@ -765,19 +789,19 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. 
It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -888,7 +912,9 @@ def __call__( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) # 5. Prepare latent variables num_channels_latents = self.unet.config.in_channels diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index 514c8643ba36..2b80d2856869 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -80,6 +80,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -95,14 +96,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -113,6 +118,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -211,8 +226,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -244,8 +259,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -265,10 +280,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -662,7 +677,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -675,7 +690,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -775,11 +790,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -824,19 +839,19 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. 
If not diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index 63b8c6108ac4..cd59cf51869d 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -516,11 +516,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, sld_guidance_scale: Optional[float] = 1000, sld_warmup_steps: Optional[int] = 10, @@ -555,7 +555,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -568,7 +568,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
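Note: for the `retrieve_timesteps` helper extended in the ldm3d and panorama hunks above, exactly one of `num_inference_steps`, `timesteps`, or `sigmas` selects the schedule; combining `timesteps` and `sigmas` triggers the new ValueError. A small sketch, assuming a scheduler whose `set_timesteps` accepts a `sigmas` argument (`EulerDiscreteScheduler` with its default config is used purely as an illustration, and the sigma values are arbitrary):

import torch
from diffusers import EulerDiscreteScheduler
from diffusers.pipelines.stable_diffusion_panorama.pipeline_stable_diffusion_panorama import (
    retrieve_timesteps,
)

scheduler = EulerDiscreteScheduler()

# Default path: let the scheduler build its own timestep spacing.
timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=30, device="cpu")

# New path added in this diff: hand the scheduler an explicit, decreasing sigma schedule.
custom_sigmas = [14.6, 9.0, 5.0, 2.5, 1.0, 0.4, 0.1, 0.0]
timesteps, num_inference_steps = retrieve_timesteps(scheduler, device="cpu", sigmas=custom_sigmas)

# retrieve_timesteps(scheduler, timesteps=[...], sigmas=[...]) would raise the new ValueError.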
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py b/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py index 549747e97146..338e4c65c500 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py @@ -85,7 +85,7 @@ def forward(self, clip_input, images): return images, has_nsfw_concepts @torch.no_grad() - def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor): + def forward_onnx(self, clip_input: torch.Tensor, images: torch.Tensor): pooled_output = self.vision_model(clip_input)[1] # pooled_output image_embeds = self.visual_projection(pooled_output) diff --git a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py index cb29ce386f32..04c62efb46be 100644 --- a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py @@ -169,8 +169,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -202,8 +202,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -223,10 +223,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
@@ -570,14 +570,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -611,19 +611,19 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -633,7 +633,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. 
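Note: the `ip_adapter_image_embeds` argument documented above expects one tensor per loaded IP-Adapter. A hedged illustration of the documented shape; the embedding width and the negative-first concatenation are assumptions for the sketch rather than guarantees of this diff (the pipelines' `prepare_ip_adapter_image_embeds` helper remains the authoritative way to build these tensors):

import torch

# One entry per IP-Adapter, each `(batch_size, num_images, emb_dim)` per the docstring.
batch_size, num_images, emb_dim = 1, 1, 1024  # illustrative sizes
image_embeds = torch.randn(batch_size, num_images, emb_dim)
negative_image_embeds = torch.zeros_like(image_embeds)  # stands in for the unconditional embedding

# With classifier-free guidance enabled the negative embedding must be included as well;
# stacking it in front of the positive one is the layout assumed here.
ip_adapter_image_embeds = [torch.cat([negative_image_embeds, image_embeds], dim=0)]

# pipe(prompt, ip_adapter_image_embeds=ip_adapter_image_embeds, ...)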
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 5af29b5719ce..2568150fa5f2 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -24,6 +24,7 @@ CLIPVisionModelWithProjection, ) +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import ( FromSingleFileMixin, @@ -107,6 +108,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -122,14 +124,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -140,6 +146,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -267,10 +283,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -296,17 +312,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -747,7 +763,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -760,7 +776,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -820,6 +836,7 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, timesteps: List[int] = None, + sigmas: List[float] = None, denoising_end: Optional[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -827,13 +844,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -845,7 +862,9 @@ def __call__( negative_crops_coords_top_left: Tuple[int, int] = (0, 0), negative_target_size: Optional[Tuple[int, int]] = None, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -876,6 +895,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. denoising_end (`float`, *optional*): When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be completed before it is intentionally prematurely terminated. As a result, the returned sample will @@ -904,26 +927,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -972,11 +995,11 @@ def __call__( as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -1006,6 +1029,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + # 0. Default height and width to unet height = height or self.default_sample_size * self.vae_scale_factor width = width or self.default_sample_size * self.vae_scale_factor @@ -1075,7 +1101,9 @@ def __call__( ) # 4. 
Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) # 5. Prepare latent variables num_channels_latents = self.unet.config.in_channels diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index b98ea279c1a2..838489dca778 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -25,6 +25,7 @@ CLIPVisionModelWithProjection, ) +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import ( FromSingleFileMixin, @@ -124,6 +125,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -139,14 +141,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -157,6 +163,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -288,10 +304,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -317,17 +333,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -887,7 +903,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -900,7 +916,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -964,6 +980,7 @@ def __call__( strength: float = 0.3, num_inference_steps: int = 50, timesteps: List[int] = None, + sigmas: List[float] = None, denoising_start: Optional[float] = None, denoising_end: Optional[float] = None, guidance_scale: float = 5.0, @@ -972,13 +989,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -992,7 +1009,9 @@ def __call__( aesthetic_score: float = 6.0, negative_aesthetic_score: float = 2.5, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -1006,7 +1025,7 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): The image(s) to modify with the pipeline. strength (`float`, *optional*, defaults to 0.3): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -1022,6 +1041,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. denoising_start (`float`, *optional*): When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and @@ -1058,26 +1081,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
- latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -1137,11 +1160,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the @@ -1171,6 +1194,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, @@ -1237,7 +1263,9 @@ def __call__( def denoising_value_valid(dnv): return isinstance(dnv, float) and 0 < dnv < 1 - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) timesteps, num_inference_steps = self.get_timesteps( num_inference_steps, strength, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 26c1484e9ae2..631e309993b1 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -26,6 +26,7 @@ CLIPVisionModelWithProjection, ) +from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import ( FromSingleFileMixin, @@ -269,6 +270,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -284,14 +286,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -302,6 +308,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. 
Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -516,10 +532,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -545,17 +561,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1117,7 +1133,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -1130,7 +1146,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
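With `sigmas` threaded through `retrieve_timesteps`, a custom noise schedule can be handed straight to any scheduler whose `set_timesteps` accepts a `sigmas` keyword, and combining it with custom `timesteps` now fails fast. A rough sketch of the new behaviour in isolation; it assumes `EulerDiscreteScheduler` accepts custom sigmas in this version, and the sigma values are made up for illustration only:

from diffusers import EulerDiscreteScheduler
from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint import (
    retrieve_timesteps,
)

scheduler = EulerDiscreteScheduler()

# Descending schedule ending at 0.0 (illustrative values, not a recommended schedule).
sigmas = [14.6, 7.0, 3.5, 1.7, 0.9, 0.4, 0.15, 0.05, 0.0]

# If the scheduler's set_timesteps has no `sigmas` argument, retrieve_timesteps raises
# the "does not support custom sigmas schedules" ValueError added in this diff.
timesteps, num_inference_steps = retrieve_timesteps(scheduler, device="cpu", sigmas=sigmas)
# num_inference_steps is recomputed from the schedule rather than taken from the caller.

# Passing both custom schedules at once is rejected by the new guard:
try:
    retrieve_timesteps(scheduler, timesteps=[999, 500, 1], sigmas=sigmas)
except ValueError as err:
    print(err)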
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -1192,13 +1208,14 @@ def __call__( prompt_2: Optional[Union[str, List[str]]] = None, image: PipelineImageInput = None, mask_image: PipelineImageInput = None, - masked_image_latents: torch.FloatTensor = None, + masked_image_latents: torch.Tensor = None, height: Optional[int] = None, width: Optional[int] = None, padding_mask_crop: Optional[int] = None, strength: float = 0.9999, num_inference_steps: int = 50, timesteps: List[int] = None, + sigmas: List[float] = None, denoising_start: Optional[float] = None, denoising_end: Optional[float] = None, guidance_scale: float = 7.5, @@ -1207,13 +1224,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1227,7 +1244,9 @@ def __call__( aesthetic_score: float = 6.0, negative_aesthetic_score: float = 2.5, clip_skip: Optional[int] = None, - callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], **kwargs, ): @@ -1281,6 +1300,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. denoising_start (`float`, *optional*): When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and @@ -1309,22 +1332,22 @@ def __call__( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -1337,7 +1360,7 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -1391,11 +1414,11 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the @@ -1425,6 +1448,9 @@ def __call__( "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", ) + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor @@ -1498,7 +1524,9 @@ def __call__( def denoising_value_valid(dnv): return isinstance(dnv, float) and 0 < dnv < 1 - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) timesteps, num_inference_steps = self.get_timesteps( num_inference_steps, strength, diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index d9380020b329..3cdef82303f7 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -222,10 +222,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -250,17 +250,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -622,14 +622,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -647,7 +647,7 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): The image(s) to modify with the pipeline. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -689,21 +689,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -715,7 +715,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py index f457cdbdb1eb..70d06bb6320d 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py @@ -21,7 +21,7 @@ def __init__(self): self.encoder.set_watermark("bits", self.watermark) - def apply_watermark(self, images: torch.FloatTensor): + def apply_watermark(self, images: torch.Tensor): # can't encode images that are smaller than 256 if images.shape[-1] < 256: return images diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index da6832cebd4d..b14fdd4f8de3 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -21,11 +21,12 @@ import torch from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection -from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...image_processor import PipelineImageInput from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel from ...schedulers import EulerDiscreteScheduler from ...utils import BaseOutput, logging, replace_example_docstring from ...utils.torch_utils import is_compiled_module, randn_tensor +from ...video_processor import VideoProcessor from ..pipeline_utils import DiffusionPipeline @@ -61,26 +62,64 @@ def _append_dims(x, target_dims): return x[(...,) + (None,) * dims_to_append] -# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid -def tensor2vid(video: torch.Tensor, processor: VaeImageProcessor, output_type: str = "np"): - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - if output_type == "np": - outputs = np.stack(outputs) - - elif output_type == "pt": - outputs = torch.stack(outputs) - - elif not output_type == "pil": - raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. 
Any kwargs will be supplied to `scheduler.set_timesteps`. - return outputs + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps @dataclass @@ -89,12 +128,12 @@ class StableVideoDiffusionPipelineOutput(BaseOutput): Output class for Stable Video Diffusion pipeline. Args: - frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]): + frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.Tensor`]): List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size, num_frames, height, width, num_channels)`. 
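The `torch.FloatTensor` annotations being replaced throughout this diff only describe float32 CPU tensors, while these pipelines routinely run in fp16/bf16 and on CUDA; `torch.Tensor` is the type that actually matches what flows through them. A quick illustration of the distinction (plain PyTorch, independent of diffusers):

import torch

x = torch.randn(4, 4)                        # float32 on CPU
y = torch.randn(4, 4, dtype=torch.float16)   # half precision

print(isinstance(x, torch.FloatTensor))  # True: FloatTensor matches float32 CPU tensors only
print(isinstance(y, torch.FloatTensor))  # False: half tensors are not FloatTensor
print(isinstance(x, torch.Tensor), isinstance(y, torch.Tensor))  # True True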
""" - frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor] + frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor] class StableVideoDiffusionPipeline(DiffusionPipeline): @@ -139,7 +178,7 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor) def _encode_image( self, @@ -147,12 +186,12 @@ def _encode_image( device: Union[str, torch.device], num_videos_per_prompt: int, do_classifier_free_guidance: bool, - ) -> torch.FloatTensor: + ) -> torch.Tensor: dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): - image = self.image_processor.pil_to_numpy(image) - image = self.image_processor.numpy_to_pt(image) + image = self.video_processor.pil_to_numpy(image) + image = self.video_processor.numpy_to_pt(image) # We normalize the image before resizing to match with the original implementation. # Then we unnormalize it after resizing. @@ -240,7 +279,7 @@ def _get_add_time_ids( return add_time_ids - def decode_latents(self, latents: torch.FloatTensor, num_frames: int, decode_chunk_size: int = 14): + def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14): # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width] latents = latents.flatten(0, 1) @@ -276,7 +315,7 @@ def check_inputs(self, image, height, width): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -293,7 +332,7 @@ def prepare_latents( dtype: torch.dtype, device: Union[str, torch.device], generator: torch.Generator, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ): shape = ( batch_size, @@ -338,11 +377,12 @@ def num_timesteps(self): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], + image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor], height: int = 576, width: int = 1024, num_frames: Optional[int] = None, num_inference_steps: int = 25, + sigmas: Optional[List[float]] = None, min_guidance_scale: float = 1.0, max_guidance_scale: float = 3.0, fps: int = 7, @@ -351,7 +391,7 @@ def __call__( decode_chunk_size: Optional[int] = None, num_videos_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], @@ -361,7 +401,7 @@ def __call__( The call function to the pipeline for generation. Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`. 
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -374,6 +414,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 25): The number of denoising steps. More denoising steps usually lead to a higher quality video at the expense of slower inference. This parameter is modulated by `strength`. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. min_guidance_scale (`float`, *optional*, defaults to 1.0): The minimum guidance scale. Used for the classifier free guidance with first frame. max_guidance_scale (`float`, *optional*, defaults to 3.0): @@ -396,7 +440,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -421,8 +465,8 @@ def __call__( Returns: [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is - returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) - is returned. + returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.Tensor`) is + returned. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor @@ -455,7 +499,7 @@ def __call__( fps = fps - 1 # 4. Encode input image using VAE - image = self.image_processor.preprocess(image, height=height, width=width).to(device) + image = self.video_processor.preprocess(image, height=height, width=width).to(device) noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype) image = image + noise_aug_strength * noise @@ -492,8 +536,7 @@ def __call__( added_time_ids = added_time_ids.to(device) # 6. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, sigmas) # 7. 
Prepare latent variables num_channels_latents = self.unet.config.in_channels @@ -562,7 +605,7 @@ def __call__( if needs_upcasting: self.vae.to(dtype=torch.float16) frames = self.decode_latents(latents, num_frames, decode_chunk_size) - frames = tensor2vid(frames, self.image_processor, output_type=output_type) + frames = self.video_processor.postprocess_video(video=frames, output_type=output_type) else: frames = latents diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index fd5b1bb454b7..ab4ab87d2aff 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -124,6 +124,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -139,14 +140,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -157,6 +162,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -256,8 +271,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -289,8 +304,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -310,10 +325,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -620,7 +635,7 @@ def _default_height_width(self, height, width, image): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -633,7 +648,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. 
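For reference, `get_guidance_scale_embedding` (whose return annotation changes here) builds a sinusoidal embedding of the guidance scale, used when the UNet is conditioned through `time_cond_proj_dim`. A standalone sketch of the computation it performs, matching the body shown in this diff (`w * 1000.0` followed by sin/cos features); the guidance value and embedding size are arbitrary example inputs:

import torch

w = torch.tensor([7.5])            # one guidance scale per sample
embedding_dim = 8

half_dim = embedding_dim // 2
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim) * -emb)
emb = (w * 1000.0)[:, None] * emb[None, :]
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
print(emb.shape)                   # torch.Size([1, 8]) == (len(w), embedding_dim)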
""" assert len(w.shape) == 1 w = w * 1000.0 @@ -669,17 +684,18 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, timesteps: List[int] = None, + sigmas: List[float] = None, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, adapter_conditioning_scale: Union[float, List[float]] = 1.0, @@ -692,9 +708,9 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -707,6 +723,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -725,14 +745,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. 
- prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -744,7 +764,7 @@ def __call__( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -816,7 +836,9 @@ def __call__( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) # 5. Prepare latent variables num_channels_latents = self.unet.config.in_channels diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index fdf938b248e6..2aa2415a47bb 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -140,6 +140,7 @@ def retrieve_timesteps( num_inference_steps: Optional[int] = None, device: Optional[Union[str, torch.device]] = None, timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, **kwargs, ): """ @@ -155,14 +156,18 @@ def retrieve_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. Returns: `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps. """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") if timesteps is not None: accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if not accepts_timesteps: @@ -173,6 +178,16 @@ def retrieve_timesteps( scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) else: scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) timesteps = scheduler.timesteps @@ -281,10 +296,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -310,17 +325,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
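Since the adapter pipelines now forward `sigmas` the same way, one practical way to build a custom schedule is to subsample the scheduler's own sigma table instead of inventing values by hand. A rough sketch under the assumption of an already-loaded `StableDiffusionXLAdapterPipeline` (`pipe`, `prompt`, and `control_image` are placeholders) whose Euler-family scheduler exposes `scheduler.sigmas` after `set_timesteps`; the stride of 5 is arbitrary:

# Populate the scheduler's default 50-step schedule, then thin it out.
pipe.scheduler.set_timesteps(50)
full = pipe.scheduler.sigmas.tolist()      # descending, with a trailing 0.0

coarse = full[::5]                         # keep every 5th sigma for a cheaper run
if coarse[-1] != 0.0:
    coarse.append(0.0)

# image = pipe(prompt, image=control_image, sigmas=coarse).images[0]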
@@ -791,7 +806,7 @@ def _default_height_width(self, height, width, image): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -804,7 +819,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -841,6 +856,7 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, timesteps: List[int] = None, + sigmas: List[float] = None, denoising_end: Optional[float] = None, guidance_scale: float = 5.0, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -848,16 +864,16 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -881,9 +897,9 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. Anything below 512 pixels won't work well for @@ -900,6 +916,10 @@ def __call__( Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument in their `set_timesteps` method. 
If not defined, the default behavior when `num_inference_steps` is passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. denoising_end (`float`, *optional*): When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be completed before it is intentionally prematurely terminated. As a result, the returned sample will @@ -928,26 +948,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -960,7 +980,7 @@ def __call__( instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. 
If not specified, the callback will be called at every step. @@ -1101,7 +1121,9 @@ def __call__( ) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) # 5. Prepare latent variables num_channels_latents = self.unet.config.in_channels diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py index 2dae5b4ead69..040bf0efba84 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py @@ -15,7 +15,7 @@ class TextToVideoSDPipelineOutput(BaseOutput): """ Output class for text-to-video pipelines. - Args: + Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 6c33836e60da..69828fd208fb 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -15,11 +15,9 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Union -import numpy as np import torch from transformers import CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet3DConditionModel from ...models.lora import adjust_lora_scale_text_encoder @@ -33,6 +31,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor +from ...video_processor import VideoProcessor from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . import TextToVideoSDPipelineOutput @@ -59,28 +58,6 @@ """ -# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid -def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - if output_type == "np": - outputs = np.stack(outputs) - - elif output_type == "pt": - outputs = torch.stack(outputs) - - elif not output_type == "pil": - raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") - - return outputs - - class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-video generation. 
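The removed module-level `tensor2vid` helpers are superseded by `VideoProcessor.postprocess_video`, which performs the same per-batch permute-and-postprocess in one call. A minimal sketch with a random tensor standing in for the decoded video a pipeline would produce (shape and value range follow the old helper's expectations: `(batch, channels, frames, height, width)` in `[-1, 1]`; the `vae_scale_factor` of 8 is an assumption for the example):

import torch
from diffusers.video_processor import VideoProcessor

video_processor = VideoProcessor(do_resize=False, vae_scale_factor=8)

# Stand-in for a decoded video tensor.
decoded = torch.rand(1, 3, 4, 64, 64) * 2.0 - 1.0

frames_np = video_processor.postprocess_video(video=decoded, output_type="np")
print(frames_np.shape)    # (1, 4, 64, 64, 3): batch, frames, height, width, channels
frames_pil = video_processor.postprocess_video(video=decoded, output_type="pil")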
@@ -127,7 +104,7 @@ def __init__( scheduler=scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( @@ -137,8 +114,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -170,8 +147,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -191,10 +168,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -465,12 +442,12 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "np", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -505,25 +482,25 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. 
Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): - The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`. + The output format of the generated video. Choose between `torch.Tensor` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -652,7 +629,7 @@ def __call__( video = latents else: video_tensor = self.decode_latents(latents) - video = tensor2vid(video_tensor, self.image_processor, output_type) + video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type) # 9. Offload all models self.maybe_free_model_hooks() diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index 3901946afe46..d903974071e5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -16,11 +16,9 @@ from typing import Any, Callable, Dict, List, Optional, Union import numpy as np -import PIL.Image import torch from transformers import CLIPTextModel, CLIPTokenizer -from ...image_processor import VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet3DConditionModel from ...models.lora import adjust_lora_scale_text_encoder @@ -34,6 +32,7 @@ unscale_lora_layers, ) from ...utils.torch_utils import randn_tensor +from ...video_processor import VideoProcessor from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin from . 
import TextToVideoSDPipelineOutput @@ -94,69 +93,6 @@ def retrieve_latents( raise AttributeError("Could not access latents of provided encoder_output") -# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid -def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - if output_type == "np": - outputs = np.stack(outputs) - - elif output_type == "pt": - outputs = torch.stack(outputs) - - elif not output_type == "pil": - raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") - - return outputs - - -def preprocess_video(video): - supported_formats = (np.ndarray, torch.Tensor, PIL.Image.Image) - - if isinstance(video, supported_formats): - video = [video] - elif not (isinstance(video, list) and all(isinstance(i, supported_formats) for i in video)): - raise ValueError( - f"Input is in incorrect format: {[type(i) for i in video]}. Currently, we only support {', '.join(supported_formats)}" - ) - - if isinstance(video[0], PIL.Image.Image): - video = [np.array(frame) for frame in video] - - if isinstance(video[0], np.ndarray): - video = np.concatenate(video, axis=0) if video[0].ndim == 5 else np.stack(video, axis=0) - - if video.dtype == np.uint8: - video = np.array(video).astype(np.float32) / 255.0 - - if video.ndim == 4: - video = video[None, ...] - - video = torch.from_numpy(video.transpose(0, 4, 1, 2, 3)) - - elif isinstance(video[0], torch.Tensor): - video = torch.cat(video, axis=0) if video[0].ndim == 5 else torch.stack(video, axis=0) - - # don't need any preprocess if the video is latents - channel = video.shape[1] - if channel == 4: - return video - - # move channels before num_frames - video = video.permute(0, 2, 1, 3, 4) - - # normalize video - video = 2.0 * video - 1.0 - - return video - - class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided video-to-video generation. 
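For reference, `VideoProcessor.postprocess_video` now centralizes what the removed per-pipeline `tensor2vid` helpers did. The sketch below is reconstructed from the deleted helper to show the intended behavior; it is not the actual `VideoProcessor` implementation:

import numpy as np
import torch

def postprocess_video_sketch(video: torch.Tensor, processor, output_type: str = "np"):
    # video: (batch, channels, frames, height, width) tensor decoded from the VAE latents.
    batch_size, channels, num_frames, height, width = video.shape
    outputs = []
    for batch_idx in range(batch_size):
        # Move frames into the batch dimension so the image post-processing runs per frame.
        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
        outputs.append(processor.postprocess(batch_vid, output_type))
    if output_type == "np":
        return np.stack(outputs)
    if output_type == "pt":
        return torch.stack(outputs)
    if output_type == "pil":
        return outputs
    raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")

The pipelines in the surrounding hunks reach the same result through self.video_processor.postprocess_video(video=video_tensor, output_type=output_type).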
@@ -203,7 +139,7 @@ def __init__( scheduler=scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( @@ -213,8 +149,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -246,8 +182,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -267,10 +203,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -563,19 +499,19 @@ def prepare_latents(self, video, timestep, batch_size, dtype, device, generator= def __call__( self, prompt: Union[str, List[str]] = None, - video: Union[List[np.ndarray], torch.FloatTensor] = None, + video: Union[List[np.ndarray], torch.Tensor] = None, strength: float = 0.6, num_inference_steps: int = 50, guidance_scale: float = 15.0, negative_prompt: Optional[Union[str, List[str]]] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "np", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -586,7 +522,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
- video (`List[np.ndarray]` or `torch.FloatTensor`): + video (`List[np.ndarray]` or `torch.Tensor`): `video` frames or tensor representing a video batch to be used as the starting point for the process. Can also accept video latents as `image`, if passing latents directly, it will not be encoded again. strength (`float`, *optional*, defaults to 0.8): @@ -610,25 +546,25 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): - The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`. + The output format of the generated video. Choose between `torch.Tensor` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -687,7 +623,7 @@ def __call__( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) # 4. Preprocess video - video = preprocess_video(video) + video = self.video_processor.preprocess_video(video) # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -749,7 +685,7 @@ def __call__( video = latents else: video_tensor = self.decode_latents(latents) - video = tensor2vid(video_tensor, self.image_processor, output_type) + video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type) # 10. 
Offload all models self.maybe_free_model_hooks() diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index dddd65079625..5ba671359292 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -392,7 +392,7 @@ def backward_loop( `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -529,12 +529,12 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, motion_field_strength_x: float = 12, motion_field_strength_y: float = 12, output_type: Optional[str] = "tensor", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, t0: int = 44, t1: int = 47, @@ -569,7 +569,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -581,7 +581,7 @@ def __call__( a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -795,8 +795,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -816,10 +816,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py index 2dbd928b916e..2679fd1ea5d5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py @@ -581,10 +581,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -610,17 +610,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -861,7 +861,7 @@ def backward_loop( `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. 
callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -933,16 +933,16 @@ def __call__( eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, frame_ids: Optional[List[int]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + latents: Optional[torch.Tensor] = None, motion_field_strength_x: float = 12, motion_field_strength_y: float = 12, output_type: Optional[str] = "tensor", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -1002,21 +1002,21 @@ def __call__( frame_ids (`List[int]`, *optional*): Indexes of the frames that are being generated. This is used when generating longer videos chunk-by-chunk. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -1034,7 +1034,7 @@ def __call__( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. 
callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip.py b/src/diffusers/pipelines/unclip/pipeline_unclip.py index 117527e60ab9..9268a97aa117 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip.py @@ -217,9 +217,9 @@ def __call__( decoder_num_inference_steps: int = 25, super_res_num_inference_steps: int = 7, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prior_latents: Optional[torch.FloatTensor] = None, - decoder_latents: Optional[torch.FloatTensor] = None, - super_res_latents: Optional[torch.FloatTensor] = None, + prior_latents: Optional[torch.Tensor] = None, + decoder_latents: Optional[torch.Tensor] = None, + super_res_latents: Optional[torch.Tensor] = None, text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, text_attention_mask: Optional[torch.Tensor] = None, prior_guidance_scale: float = 4.0, @@ -250,11 +250,11 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prior_latents (`torch.FloatTensor` of shape (batch size, embeddings dimension), *optional*): + prior_latents (`torch.Tensor` of shape (batch size, embeddings dimension), *optional*): Pre-generated noisy latents to be used as inputs for the prior. - decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. - super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. prior_guidance_scale (`float`, *optional*, defaults to 4.0): A higher guidance scale value encourages the model to generate images closely linked to the text diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py index 6c646a7df362..2a0e7e90e4d2 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -199,13 +199,13 @@ def _encode_image(self, image, device, num_images_per_prompt, image_embeddings: @torch.no_grad() def __call__( self, - image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor]] = None, + image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor]] = None, num_images_per_prompt: int = 1, decoder_num_inference_steps: int = 25, super_res_num_inference_steps: int = 7, generator: Optional[torch.Generator] = None, - decoder_latents: Optional[torch.FloatTensor] = None, - super_res_latents: Optional[torch.FloatTensor] = None, + decoder_latents: Optional[torch.Tensor] = None, + super_res_latents: Optional[torch.Tensor] = None, image_embeddings: Optional[torch.Tensor] = None, decoder_guidance_scale: float = 8.0, output_type: Optional[str] = "pil", @@ -215,7 +215,7 @@ def __call__( The call function to the pipeline for generation. 
Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): `Image` or tensor representing an image batch to be used as the starting point. If you provide a tensor, it needs to be compatible with the [`CLIPImageProcessor`] [configuration](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). @@ -231,9 +231,9 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. - super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. decoder_guidance_scale (`float`, *optional*, defaults to 4.0): A higher guidance scale value encourages the model to generate images closely linked to the text diff --git a/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py b/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py index bf0a4eb475c0..75e5d43678d5 100644 --- a/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py +++ b/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py @@ -220,7 +220,7 @@ def generate_beam( input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): Tokenizer indices of input sequence tokens in the vocabulary. One of `input_ids` and `input_embeds` must be supplied. - input_embeds (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + input_embeds (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): An embedded representation to directly pass to the transformer as a prefix for beam search. One of `input_ids` and `input_embeds` must be supplied. device: diff --git a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py index 6579e272a3bf..abc51edf6df1 100644 --- a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py +++ b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py @@ -739,8 +739,7 @@ def forward( """ Args: hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. - When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input - hidden_states + When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. 
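The `callback(step: int, timestep: int, latents: torch.Tensor)` signature documented for the pipelines in this diff is identical everywhere; a small hedged sketch of wiring one up with the text-to-video-zero pipeline (the checkpoint id follows that pipeline's usual Stable Diffusion base and is an assumption here):

import torch
from diffusers import TextToVideoZeroPipeline

def log_progress(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Invoked every `callback_steps` denoising steps with the current latents.
    print(f"step={step} timestep={int(timestep)} latents={tuple(latents.shape)}")

# Checkpoint id is an illustrative assumption.
pipe = TextToVideoZeroPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
result = pipe(prompt="a panda surfing", callback=log_progress, callback_steps=5)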
@@ -1038,9 +1037,9 @@ def no_weight_decay(self): def forward( self, - latent_image_embeds: torch.FloatTensor, - image_embeds: torch.FloatTensor, - prompt_embeds: torch.FloatTensor, + latent_image_embeds: torch.Tensor, + image_embeds: torch.Tensor, + prompt_embeds: torch.Tensor, timestep_img: Union[torch.Tensor, float, int], timestep_text: Union[torch.Tensor, float, int], data_type: Optional[Union[torch.Tensor, float, int]] = 1, @@ -1049,11 +1048,11 @@ def forward( ): """ Args: - latent_image_embeds (`torch.FloatTensor` of shape `(batch size, latent channels, height, width)`): + latent_image_embeds (`torch.Tensor` of shape `(batch size, latent channels, height, width)`): Latent image representation from the VAE encoder. - image_embeds (`torch.FloatTensor` of shape `(batch size, 1, clip_img_dim)`): + image_embeds (`torch.Tensor` of shape `(batch size, 1, clip_img_dim)`): CLIP-embedded image representation (unsqueezed in the first dimension). - prompt_embeds (`torch.FloatTensor` of shape `(batch size, seq_len, text_dim)`): + prompt_embeds (`torch.Tensor` of shape `(batch size, seq_len, text_dim)`): CLIP-embedded text representation. timestep_img (`torch.long` or `float` or `int`): Current denoising step for the image. diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 5d61b1054e1c..44a03ef56eec 100644 --- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -304,7 +304,7 @@ def _infer_batch_size( if isinstance(image, PIL.Image.Image): batch_size = 1 else: - # Image must be available and type either PIL.Image.Image or torch.FloatTensor. + # Image must be available and type either PIL.Image.Image or torch.Tensor. # Not currently supporting something like image_embeds. batch_size = image.shape[0] multiplier = num_prompts_per_image @@ -353,8 +353,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -386,8 +386,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -407,10 +407,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. 
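Several of the `encode_prompt` methods touched here mirror `StableDiffusionPipeline.encode_prompt`, which is what makes the pre-generated `prompt_embeds` / `negative_prompt_embeds` arguments described above useful. A hedged sketch of that pattern, with the model id as an illustrative assumption:

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Pre-compute the torch.Tensor embeddings once, then tweak or reuse them across calls.
prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
    prompt="an astronaut riding a horse",
    device=pipe.device,
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
    negative_prompt="low quality, blurry",
)

# With embeddings supplied, the `prompt` argument is omitted entirely.
image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds).images[0]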
@@ -1080,7 +1080,7 @@ def check_inputs( def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, + image: Optional[Union[torch.Tensor, PIL.Image.Image]] = None, height: Optional[int] = None, width: Optional[int] = None, data_type: Optional[int] = 1, @@ -1091,15 +1091,15 @@ def __call__( num_prompts_per_image: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_latents: Optional[torch.FloatTensor] = None, - vae_latents: Optional[torch.FloatTensor] = None, - clip_latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_latents: Optional[torch.Tensor] = None, + vae_latents: Optional[torch.Tensor] = None, + clip_latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -1109,7 +1109,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. Required for text-conditioned image generation (`text2img`) mode. - image (`torch.FloatTensor` or `PIL.Image.Image`, *optional*): + image (`torch.Tensor` or `PIL.Image.Image`, *optional*): `Image` or tensor representing an image batch. Required for image-conditioned text generation (`img2text`) mode. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -1144,29 +1144,29 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for joint image-text generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. This assumes a full set of VAE, CLIP, and text latents, if supplied, overrides the value of `prompt_latents`, `vae_latents`, and `clip_latents`. - prompt_latents (`torch.FloatTensor`, *optional*): + prompt_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for text generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - vae_latents (`torch.FloatTensor`, *optional*): + vae_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. 
- clip_latents (`torch.FloatTensor`, *optional*): + clip_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. Used in text-conditioned image generation (`text2img`) mode. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are be generated from the `negative_prompt` input argument. Used in text-conditioned image generation (`text2img`) mode. @@ -1176,7 +1176,7 @@ def __call__( Whether or not to return a [`~pipelines.ImageTextPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py index 3b21dfb5f1f0..b2cf8cbc978c 100644 --- a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +++ b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py @@ -130,7 +130,7 @@ def __init__( ) @apply_forward_hook - def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput: h = self.in_block(x) h = self.down_blocks(h) @@ -141,8 +141,8 @@ def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOut @apply_forward_hook def decode( - self, h: torch.FloatTensor, force_not_quantize: bool = True, return_dict: bool = True - ) -> Union[DecoderOutput, torch.FloatTensor]: + self, h: torch.Tensor, force_not_quantize: bool = True, return_dict: bool = True + ) -> Union[DecoderOutput, torch.Tensor]: if not force_not_quantize: quant, _, _ = self.vquantizer(h) else: @@ -155,10 +155,10 @@ def decode( return DecoderOutput(sample=dec) - def forward(self, sample: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + def forward(self, sample: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
""" diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py index e4277d58a006..b08421415b23 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py @@ -209,7 +209,7 @@ def num_timesteps(self): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeddings: Union[torch.Tensor, List[torch.Tensor]], prompt: Union[str, List[str]] = None, num_inference_steps: int = 12, timesteps: Optional[List[float]] = None, @@ -217,7 +217,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -228,7 +228,7 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embedding (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embedding (`torch.Tensor` or `List[torch.Tensor]`): Image Embeddings either extracted from an image or generated by a Prior Model. prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. @@ -252,7 +252,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py index da6b5e0258bb..7819c8c0a0ef 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py @@ -154,11 +154,11 @@ def __call__( decoder_timesteps: Optional[List[float]] = None, decoder_guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -176,10 +176,10 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings for the prior. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -218,7 +218,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py index 4640f7696766..4dddd18c30d5 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -54,12 +54,12 @@ class WuerstchenPriorPipelineOutput(BaseOutput): Output class for WuerstchenPriorPipeline. Args: - image_embeddings (`torch.FloatTensor` or `np.ndarray`) + image_embeddings (`torch.Tensor` or `np.ndarray`) Prior image embeddings for text prompt """ - image_embeddings: Union[torch.FloatTensor, np.ndarray] + image_embeddings: Union[torch.Tensor, np.ndarray] class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): @@ -136,8 +136,8 @@ def encode_prompt( do_classifier_free_guidance, prompt=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -288,11 +288,11 @@ def __call__( timesteps: List[float] = None, guidance_scale: float = 8.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pt", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -324,10 +324,10 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `decoder_guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -336,7 +336,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py index 720d8ea25e29..da162ed14419 100644 --- a/src/diffusers/schedulers/__init__.py +++ b/src/diffusers/schedulers/__init__.py @@ -68,7 +68,7 @@ _import_structure["scheduling_tcd"] = ["TCDScheduler"] _import_structure["scheduling_unclip"] = ["UnCLIPScheduler"] _import_structure["scheduling_unipc_multistep"] = ["UniPCMultistepScheduler"] - _import_structure["scheduling_utils"] = ["KarrasDiffusionSchedulers", "SchedulerMixin"] + _import_structure["scheduling_utils"] = ["AysSchedules", "KarrasDiffusionSchedulers", "SchedulerMixin"] _import_structure["scheduling_vq_diffusion"] = ["VQDiffusionScheduler"] try: @@ -163,7 +163,7 @@ from .scheduling_tcd import TCDScheduler from .scheduling_unclip import UnCLIPScheduler from .scheduling_unipc_multistep import UniPCMultistepScheduler - from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + from .scheduling_utils import AysSchedules, KarrasDiffusionSchedulers, SchedulerMixin from .scheduling_vq_diffusion import VQDiffusionScheduler try: diff --git a/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py b/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py index d776d989a143..f5f9bd256c2e 100644 --- a/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py +++ b/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py @@ -31,19 +31,19 @@ class KarrasVeOutput(BaseOutput): Output class for the scheduler's step function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - derivative (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + derivative (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Derivative of predicted original image sample (x_0). - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample (x_{0}) based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. 
""" - prev_sample: torch.FloatTensor - derivative: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + derivative: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None class KarrasVeScheduler(SchedulerMixin, ConfigMixin): @@ -94,21 +94,21 @@ def __init__( # setable values self.num_inference_steps: int = None self.timesteps: np.IntTensor = None - self.schedule: torch.FloatTensor = None # sigma(t_i) + self.schedule: torch.Tensor = None # sigma(t_i) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -136,14 +136,14 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.schedule = torch.tensor(schedule, dtype=torch.float32, device=device) def add_noise_to_input( - self, sample: torch.FloatTensor, sigma: float, generator: Optional[torch.Generator] = None - ) -> Tuple[torch.FloatTensor, float]: + self, sample: torch.Tensor, sigma: float, generator: Optional[torch.Generator] = None + ) -> Tuple[torch.Tensor, float]: """ Explicit Langevin-like "churn" step of adding noise to the sample according to a `gamma_i ≥ 0` to reach a higher noise level `sigma_hat = sigma_i + gamma_i*sigma_i`. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. sigma (`float`): generator (`torch.Generator`, *optional*): @@ -163,10 +163,10 @@ def add_noise_to_input( def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, sigma_hat: float, sigma_prev: float, - sample_hat: torch.FloatTensor, + sample_hat: torch.Tensor, return_dict: bool = True, ) -> Union[KarrasVeOutput, Tuple]: """ @@ -174,11 +174,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. sigma_hat (`float`): sigma_prev (`float`): - sample_hat (`torch.FloatTensor`): + sample_hat (`torch.Tensor`): return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`. @@ -202,25 +202,25 @@ def step( def step_correct( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, sigma_hat: float, sigma_prev: float, - sample_hat: torch.FloatTensor, - sample_prev: torch.FloatTensor, - derivative: torch.FloatTensor, + sample_hat: torch.Tensor, + sample_prev: torch.Tensor, + derivative: torch.Tensor, return_dict: bool = True, ) -> Union[KarrasVeOutput, Tuple]: """ Corrects the predicted sample based on the `model_output` of the network. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. 
sigma_hat (`float`): TODO sigma_prev (`float`): TODO - sample_hat (`torch.FloatTensor`): TODO - sample_prev (`torch.FloatTensor`): TODO - derivative (`torch.FloatTensor`): TODO + sample_hat (`torch.Tensor`): TODO + sample_prev (`torch.Tensor`): TODO + derivative (`torch.Tensor`): TODO return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`. diff --git a/src/diffusers/schedulers/scheduling_amused.py b/src/diffusers/schedulers/scheduling_amused.py index 51fbe6a4dc7d..238b8d869171 100644 --- a/src/diffusers/schedulers/scheduling_amused.py +++ b/src/diffusers/schedulers/scheduling_amused.py @@ -29,16 +29,16 @@ class AmusedSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: torch.FloatTensor = None + prev_sample: torch.Tensor + pred_original_sample: torch.Tensor = None class AmusedScheduler(SchedulerMixin, ConfigMixin): @@ -70,7 +70,7 @@ def set_timesteps( def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: torch.long, sample: torch.LongTensor, starting_mask_ratio: int = 1, diff --git a/src/diffusers/schedulers/scheduling_consistency_decoder.py b/src/diffusers/schedulers/scheduling_consistency_decoder.py index 37efb7dc7cc0..d7af018b284a 100644 --- a/src/diffusers/schedulers/scheduling_consistency_decoder.py +++ b/src/diffusers/schedulers/scheduling_consistency_decoder.py @@ -61,12 +61,12 @@ class ConsistencyDecoderSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. """ - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor class ConsistencyDecoderScheduler(SchedulerMixin, ConfigMixin): @@ -113,28 +113,28 @@ def set_timesteps( def init_noise_sigma(self): return self.sqrt_one_minus_alphas_cumprod[self.timesteps[0]] - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample * self.c_in[timestep] def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[ConsistencyDecoderSchedulerOutput, Tuple]: @@ -143,11 +143,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`float`): The current timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 3abf52f0afd7..653171638ccf 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -33,12 +33,12 @@ class CMStochasticIterativeSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. """ - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): @@ -126,20 +126,18 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Scales the consistency model input by `(sigma**2 + sigma_data**2) ** 0.5`. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ # Get sigma corresponding to timestep @@ -278,7 +276,7 @@ def get_scalings_for_boundary_condition(self, sigma): Args: - sigma (`torch.FloatTensor`): + sigma (`torch.Tensor`): The current sigma in the Karras sigma schedule. Returns: @@ -319,9 +317,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[CMStochasticIterativeSchedulerOutput, Tuple]: @@ -330,11 +328,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`float`): The current timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. 
generator (`torch.Generator`, *optional*): A random number generator. @@ -349,11 +347,7 @@ def step( otherwise a tuple is returned where the first element is the sample tensor. """ - if ( - isinstance(timestep, int) - or isinstance(timestep, torch.IntTensor) - or isinstance(timestep, torch.LongTensor) - ): + if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)): raise ValueError( ( "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" @@ -417,10 +411,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 718514abf9d8..104a4a3f3b48 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -35,16 +35,16 @@ class DDIMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -98,11 +98,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -211,7 +211,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: @@ -233,19 +233,19 @@ def __init__( self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -261,7 +261,7 @@ def _get_variance(self, timestep, prev_timestep): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -341,13 +341,13 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[DDIMSchedulerOutput, Tuple]: """ @@ -355,11 +355,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -370,7 +370,7 @@ def step( `use_clipped_model_output` has no effect. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -470,10 +470,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -495,9 +495,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 9ca6ed4aca63..6c2352f2c828 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -33,16 +33,16 @@ class DDIMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -97,11 +97,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -207,7 +207,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: @@ -231,19 +231,19 @@ def __init__( self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps).copy().astype(np.int64)) # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -288,9 +288,9 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[DDIMSchedulerOutput, Tuple]: """ @@ -298,11 +298,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -311,7 +311,7 @@ def step( because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would coincide with the one provided as input and `use_clipped_model_output` has no effect. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 995478f273d1..0cf84b694db5 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -35,16 +35,16 @@ class DDIMParallelSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. 
- pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -99,11 +99,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -218,7 +218,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: @@ -241,19 +241,19 @@ def __init__( self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -283,7 +283,7 @@ def _batch_get_variance(self, t, prev_t): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -364,13 +364,13 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[DDIMParallelSchedulerOutput, Tuple]: """ @@ -378,9 +378,9 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. 
- sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped @@ -388,7 +388,7 @@ def step( `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would coincide with the one provided as input and `use_clipped_model_output` will have not effect. generator: random number generator. - variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we + variance_noise (`torch.Tensor`): instead of generating noise for the variance using `generator`, we can directly provide the noise for the variance itself. This is useful for methods such as CycleDiffusion. (https://arxiv.org/abs/2210.05559) return_dict (`bool`): option for returning tuple rather than DDIMParallelSchedulerOutput class @@ -486,12 +486,12 @@ def step( def batch_step_no_noise( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timesteps: List[int], - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once. Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise @@ -501,10 +501,10 @@ def batch_step_no_noise( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timesteps (`List[int]`): current discrete timesteps in the diffusion chain. This is now a list of integers. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped @@ -513,7 +513,7 @@ def batch_step_no_noise( coincide with the one provided as input and `use_clipped_model_output` will have not effect. Returns: - `torch.FloatTensor`: sample tensor at previous timestep. + `torch.Tensor`: sample tensor at previous timestep. 
""" if self.num_inference_steps is None: @@ -595,10 +595,10 @@ def batch_step_no_noise( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -620,9 +620,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 978fb63737e1..9af53a61d0a8 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -33,16 +33,16 @@ class DDPMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None def betas_for_alpha_bar( @@ -96,11 +96,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -211,7 +211,7 @@ def __init__( betas = torch.linspace(-6, 6, num_train_timesteps) self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: @@ -231,19 +231,19 @@ def __init__( self.variance_type = variance_type - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -363,7 +363,7 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None): return variance - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -398,9 +398,9 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[DDPMSchedulerOutput, Tuple]: @@ -409,11 +409,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. 
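Editor's note (not part of the patch): the DDPM hunks above and below only relax the annotations on `DDPMScheduler.step` and `DDPMScheduler.add_noise` from `torch.FloatTensor` to `torch.Tensor`; the computation is untouched. As a minimal, self-contained sketch of the forward-noising relation that `add_noise` applies, `x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps`, where the helper name `add_noise_sketch` and the explicit `alphas_cumprod` argument are illustrative assumptions rather than library API:

```python
import torch

def add_noise_sketch(
    original_samples: torch.Tensor,  # x_0, e.g. shape (batch, channels, height, width)
    noise: torch.Tensor,             # eps ~ N(0, I), same shape as x_0
    timesteps: torch.IntTensor,      # one integer timestep index per batch element
    alphas_cumprod: torch.Tensor,    # cumulative product of alphas, shape (num_train_timesteps,)
) -> torch.Tensor:
    # Match device/dtype of the samples before indexing, as the scheduler does.
    alphas_cumprod = alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
    timesteps = timesteps.to(device=original_samples.device, dtype=torch.long)

    sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
    sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5

    # Broadcast the per-sample scalars over the remaining (channel/spatial) dimensions.
    while sqrt_alpha_prod.dim() < original_samples.dim():
        sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

    # q(x_t | x_0) = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps
    return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
```

Note that `timesteps` keeps its `torch.IntTensor` annotation in the patch, presumably because integer indices are still what is used to index `alphas_cumprod`.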
@@ -498,10 +498,10 @@ def step( def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -522,9 +522,7 @@ def add_noise( noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 30fbad29f1e6..94f64425816c 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -34,16 +34,16 @@ class DDPMParallelSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -98,11 +98,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -219,7 +219,7 @@ def __init__( betas = torch.linspace(-6, 6, num_train_timesteps) self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: @@ -240,19 +240,19 @@ def __init__( self.variance_type = variance_type # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -375,7 +375,7 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -410,9 +410,9 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[DDPMParallelSchedulerOutput, Tuple]: @@ -421,9 +421,9 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. generator: random number generator. return_dict (`bool`): option for returning tuple rather than DDPMParallelSchedulerOutput class @@ -506,10 +506,10 @@ def step( def batch_step_no_noise( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timesteps: List[int], - sample: torch.FloatTensor, - ) -> torch.FloatTensor: + sample: torch.Tensor, + ) -> torch.Tensor: """ Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once. Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise @@ -519,14 +519,14 @@ def batch_step_no_noise( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. 
+ model_output (`torch.Tensor`): direct output from learned diffusion model. timesteps (`List[int]`): current discrete timesteps in the diffusion chain. This is now a list of integers. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. Returns: - `torch.FloatTensor`: sample tensor at previous timestep. + `torch.Tensor`: sample tensor at previous timestep. """ t = timesteps num_inference_steps = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps @@ -587,10 +587,10 @@ def batch_step_no_noise( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -612,9 +612,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py index 6a0f4f5efe3c..71b5669b0528 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py +++ b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -33,12 +33,12 @@ class DDPMWuerstchenSchedulerOutput(BaseOutput): Output class for the scheduler's step function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. """ - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor def betas_for_alpha_bar( @@ -125,17 +125,17 @@ def _alpha_cumprod(self, t, device): ) ** 2 / self._init_alpha_cumprod.to(device) return alpha_cumprod.clamp(0.0001, 0.9999) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
Args: - sample (`torch.FloatTensor`): input sample + sample (`torch.Tensor`): input sample timestep (`int`, optional): current timestep Returns: - `torch.FloatTensor`: scaled input sample + `torch.Tensor`: scaled input sample """ return sample @@ -163,9 +163,9 @@ def set_timesteps( def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[DDPMWuerstchenSchedulerOutput, Tuple]: @@ -174,9 +174,9 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. generator: random number generator. return_dict (`bool`): option for returning tuple rather than DDPMWuerstchenSchedulerOutput class @@ -209,10 +209,10 @@ def step( def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: device = original_samples.device dtype = original_samples.dtype alpha_cumprod = self._alpha_cumprod(timesteps, device=device).view( diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index f25fbf029bd4..685765779e21 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -152,7 +152,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -170,13 +170,13 @@ def __init__( if algorithm_type in ["dpmsolver", "dpmsolver++"]: self.register_to_config(algorithm_type="deis") else: - raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}") if solver_type not in ["logrho"]: if solver_type in ["midpoint", "heun", "bh1", "bh2"]: self.register_to_config(solver_type="logrho") else: - raise NotImplementedError(f"solver type {solver_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"solver type {solver_type} is not implemented for {self.__class__}") # setable values self.num_inference_steps = None @@ -276,7 +276,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ 
-341,7 +341,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -368,24 +368,24 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DEIS algorithm needs. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -425,26 +425,26 @@ def convert_model_output( def deis_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DEIS (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -483,22 +483,22 @@ def deis_first_order_update( def multistep_deis_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order multistep DEIS. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -552,22 +552,22 @@ def ind_fn(t, b, c): def multistep_deis_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order multistep DEIS. 
Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ @@ -673,9 +673,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -683,11 +683,11 @@ def step( the multistep DEIS. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -736,17 +736,17 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -754,10 +754,10 @@ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 7e0939e0d927..0f0e5296054f 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -78,11 +78,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -229,7 +229,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") if rescale_betas_zero_snr: self.betas = rescale_zero_terminal_snr(self.betas) @@ -256,13 +256,13 @@ def __init__( if algorithm_type == "deis": self.register_to_config(algorithm_type="dpmsolver++") else: - raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}") if solver_type not in ["midpoint", "heun"]: if solver_type in ["logrho", "bh1", "bh2"]: self.register_to_config(solver_type="midpoint") else: - raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}") if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"] and final_sigmas_type == "zero": raise ValueError( @@ -303,7 +303,12 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None): + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + ): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). @@ -312,33 +317,54 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc The number of diffusion steps used when generating samples with a pre-trained model. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary timesteps schedule. If `None`, timesteps will be generated + based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas` + must be `None`, and `timestep_spacing` attribute will be ignored. """ - # Clipping the minimum of all lambda(t) for numerical stability. - # This is critical for cosine (squaredcos_cap_v2) noise schedule. - clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped) - last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item() - - # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 - if self.config.timestep_spacing == "linspace": - timesteps = ( - np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].copy().astype(np.int64) - ) - elif self.config.timestep_spacing == "leading": - step_ratio = last_timestep // (num_inference_steps + 1) - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64) - timesteps += self.config.steps_offset - elif self.config.timestep_spacing == "trailing": - step_ratio = self.config.num_train_timesteps / num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64) - timesteps -= 1 + if num_inference_steps is None and timesteps is None: + raise ValueError("Must pass exactly one of `num_inference_steps` or `timesteps`.") + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") + if timesteps is not None and self.config.use_karras_sigmas: + raise ValueError("Cannot use `timesteps` with `config.use_karras_sigmas = True`") + if timesteps is not None and self.config.use_lu_lambdas: + raise ValueError("Cannot use `timesteps` with `config.use_lu_lambdas = True`") + + if timesteps is not None: + timesteps = np.array(timesteps).astype(np.int64) else: - raise ValueError( - f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." - ) + # Clipping the minimum of all lambda(t) for numerical stability. + # This is critical for cosine (squaredcos_cap_v2) noise schedule. + clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped) + last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item() + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, last_timestep - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = last_timestep // (num_inference_steps + 1) + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = ( + (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64) + ) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) @@ -382,7 +408,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -446,7 +472,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -471,7 +497,7 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho return sigmas - def _convert_to_lu(self, in_lambdas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_lu(self, in_lambdas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Lu et al. (2022).""" lambda_min: float = in_lambdas[-1].item() @@ -486,11 +512,11 @@ def _convert_to_lu(self, in_lambdas: torch.FloatTensor, num_inference_steps) -> def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -504,13 +530,13 @@ def convert_model_output( Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -585,23 +611,23 @@ def convert_model_output( def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. 
""" timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -654,23 +680,23 @@ def dpm_solver_first_order_update( def multistep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -777,22 +803,22 @@ def multistep_dpm_solver_second_order_update( def multistep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ @@ -893,11 +919,11 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -905,15 +931,15 @@ def step( the multistep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`LEdits++`]. return_dict (`bool`): @@ -980,27 +1006,27 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py index 0b48b499d36e..3f48066455fb 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py @@ -182,9 +182,9 @@ def create_state(self, common: Optional[CommonSchedulerState] = None) -> DPMSolv # settings for DPM-Solver if self.config.algorithm_type not in ["dpmsolver", "dpmsolver++"]: - raise NotImplementedError(f"{self.config.algorithm_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{self.config.algorithm_type} is not implemented for {self.__class__}") if self.config.solver_type not in ["midpoint", "heun"]: - raise NotImplementedError(f"{self.config.solver_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{self.config.solver_type} is not implemented for {self.__class__}") # standard deviation of the initial noise distribution init_noise_sigma = jnp.array(1.0, dtype=self.dtype) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 428eaea6a6b8..4695941e725d 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -178,7 +178,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -196,13 +196,13 @@ def __init__( if algorithm_type == "deis": self.register_to_config(algorithm_type="dpmsolver++") else: - raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}") if solver_type not in ["midpoint", "heun"]: if solver_type in ["logrho", "bh1", "bh2"]: self.register_to_config(solver_type="midpoint") else: - raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}") # setable values self.num_inference_steps = None @@ -295,7 +295,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we 
threshold xt0 to the range [-s, s] and then divide by @@ -360,7 +360,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -388,11 +388,11 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.convert_model_output def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -406,13 +406,13 @@ def convert_model_output( Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -488,23 +488,23 @@ def convert_model_output( # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.dpm_solver_first_order_update def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -558,23 +558,23 @@ def dpm_solver_first_order_update( # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_second_order_update def multistep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. 
- sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -682,22 +682,22 @@ def multistep_dpm_solver_second_order_update( # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_third_order_update def multistep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ @@ -786,11 +786,11 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -798,15 +798,15 @@ def step( the multistep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. return_dict (`bool`): @@ -867,27 +867,27 @@ def step( return SchedulerOutput(prev_sample=prev_sample) # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 54455b0f2e5a..37c9ab624d05 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -184,7 +184,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -257,21 +257,21 @@ def set_begin_index(self, begin_index: int = 0): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -395,7 +395,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min: float = in_sigmas[-1].item() @@ -414,9 +414,9 @@ def state_in_first_order(self): def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], return_dict: bool = True, s_noise: float = 1.0, ) -> Union[SchedulerOutput, Tuple]: @@ -425,11 +425,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor` or `np.ndarray`): + model_output (`torch.Tensor` or `np.ndarray`): The direct output from learned diffusion model. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor` or `np.ndarray`): + sample (`torch.Tensor` or `np.ndarray`): A current instance of a sample created by the diffusion process. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. 
@@ -450,10 +450,10 @@ def step( self.noise_sampler = BrownianTreeNoiseSampler(sample, min_sigma, max_sigma, self.noise_sampler_seed) # Define functions to compute sigma and t from each other - def sigma_fn(_t: torch.FloatTensor) -> torch.FloatTensor: + def sigma_fn(_t: torch.Tensor) -> torch.Tensor: return _t.neg().exp() - def t_fn(_sigma: torch.FloatTensor) -> torch.FloatTensor: + def t_fn(_sigma: torch.Tensor) -> torch.Tensor: return _sigma.log().neg() if self.state_in_first_order: @@ -526,10 +526,10 @@ def t_fn(_sigma: torch.FloatTensor) -> torch.FloatTensor: # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index d7a073c2383e..15008eec0e04 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -172,7 +172,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -190,12 +190,12 @@ def __init__( if algorithm_type == "deis": self.register_to_config(algorithm_type="dpmsolver++") else: - raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}") if solver_type not in ["midpoint", "heun"]: if solver_type in ["logrho", "bh1", "bh2"]: self.register_to_config(solver_type="midpoint") else: - raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}") if algorithm_type != "dpmsolver++" and final_sigmas_type == "zero": raise ValueError( @@ -274,7 +274,12 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + ): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). @@ -283,17 +288,33 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic The number of diffusion steps used when generating samples with a pre-trained model. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. 
If `None`, then the default + timestep spacing strategy of equal spacing between timesteps schedule is used. If `timesteps` is + passed, `num_inference_steps` must be `None`. """ + if num_inference_steps is None and timesteps is None: + raise ValueError("Must pass exactly one of `num_inference_steps` or `timesteps`.") + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Must pass exactly one of `num_inference_steps` or `timesteps`.") + if timesteps is not None and self.config.use_karras_sigmas: + raise ValueError("Cannot use `timesteps` when `config.use_karras_sigmas=True`.") + + num_inference_steps = num_inference_steps or len(timesteps) self.num_inference_steps = num_inference_steps - # Clipping the minimum of all lambda(t) for numerical stability. - # This is critical for cosine (squaredcos_cap_v2) noise schedule. - clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped) - timesteps = ( - np.linspace(0, self.config.num_train_timesteps - 1 - clipped_idx, num_inference_steps + 1) - .round()[::-1][:-1] - .copy() - .astype(np.int64) - ) + + if timesteps is not None: + timesteps = np.array(timesteps).astype(np.int64) + else: + # Clipping the minimum of all lambda(t) for numerical stability. + # This is critical for cosine (squaredcos_cap_v2) noise schedule. + clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped) + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1 - clipped_idx, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) if self.config.use_karras_sigmas: @@ -340,7 +361,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -405,7 +426,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -432,11 +453,11 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. 
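With this change `DPMSolverSinglestepScheduler.set_timesteps` accepts either a step count or an explicit list of timesteps. A minimal usage sketch of both call styles, assuming a diffusers build that includes this patch; the checkpoint id and the custom schedule are only placeholders:

from diffusers import DPMSolverSinglestepScheduler

scheduler = DPMSolverSinglestepScheduler.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="scheduler"  # placeholder checkpoint
)

# Default path: the schedule is derived from the step count.
scheduler.set_timesteps(num_inference_steps=25)

# New path: an explicit, decreasing timestep schedule. num_inference_steps is
# then inferred as len(timesteps). Passing both (or neither) raises a
# ValueError, as does combining custom timesteps with use_karras_sigmas=True.
scheduler.set_timesteps(timesteps=[999, 749, 499, 249, 20])
print(scheduler.timesteps)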
DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -450,13 +471,13 @@ def convert_model_output( Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -521,26 +542,26 @@ def convert_model_output( def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -577,27 +598,27 @@ def dpm_solver_first_order_update( def singlestep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order singlestep DPMSolver that computes the solution at time `prev_timestep` from the time `timestep_list[-2]`. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. timestep (`int`): The current and latter discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -671,27 +692,27 @@ def singlestep_dpm_solver_second_order_update( def singlestep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order singlestep DPMSolver that computes the solution at time `prev_timestep` from the time `timestep_list[-3]`. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. timestep (`int`): The current and latter discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. 
Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ @@ -775,29 +796,29 @@ def singlestep_dpm_solver_third_order_update( def singlestep_dpm_solver_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, order: int = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the singlestep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. timestep (`int`): The current and latter discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. order (`int`): The solver order at this step. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -870,9 +891,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -880,11 +901,11 @@ def step( the singlestep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -929,17 +950,17 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample @@ -947,10 +968,10 @@ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index dfc7978a2ee2..e2b6a35247ee 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -119,7 +119,7 @@ def __init__( if solver_type in ["logrho", "bh1", "bh2"]: self.register_to_config(solver_type="midpoint") else: - raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}") if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"] and final_sigmas_type == "zero": raise ValueError( @@ -206,21 +206,19 @@ def precondition_outputs(self, sample, model_output, sigma): return denoised # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.scale_model_input - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -276,7 +274,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas - def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min = sigma_min or self.config.sigma_min sigma_max = sigma_max or self.config.sigma_max @@ -289,7 +287,7 @@ def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch. return sigmas # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas - def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Implementation closely follows k-diffusion. 
https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26 @@ -300,7 +298,7 @@ def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> t return sigmas # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -365,9 +363,9 @@ def _sigma_to_alpha_sigma_t(self, sigma): def convert_model_output( self, - model_output: torch.FloatTensor, - sample: torch.FloatTensor = None, - ) -> torch.FloatTensor: + model_output: torch.Tensor, + sample: torch.Tensor = None, + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -381,13 +379,13 @@ def convert_model_output(
Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ sigma = self.sigmas[self.step_index] @@ -400,21 +398,21 @@ def convert_model_output( def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + model_output: torch.Tensor, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] @@ -438,21 +436,21 @@ def dpm_solver_first_order_update( def multistep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + model_output_list: List[torch.Tensor], + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """ One step for the second-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ sigma_t, sigma_s0, sigma_s1 = ( @@ -509,20 +507,20 @@ def multistep_dpm_solver_second_order_update( def multistep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], - sample: torch.FloatTensor = None, - ) -> torch.FloatTensor: + model_output_list: List[torch.Tensor], + sample: torch.Tensor = None, + ) -> torch.Tensor: """ One step for the third-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ sigma_t, sigma_s0, sigma_s1, sigma_s2 = ( @@ -596,9 +594,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -607,11 +605,11 @@ def step( the multistep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. 
- sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -675,10 +673,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index 0ef9263c9e30..121ac0d174f6 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -35,16 +35,16 @@ class EDMEulerSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None class EDMEulerScheduler(SchedulerMixin, ConfigMixin): @@ -174,21 +174,19 @@ def precondition_outputs(self, sample, model_output, sigma): return denoised - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -227,7 +225,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 - def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. 
(2022).""" sigma_min = sigma_min or self.config.sigma_min sigma_max = sigma_max or self.config.sigma_max @@ -239,7 +237,7 @@ def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch. return sigmas - def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Implementation closely follows k-diffusion. https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26 @@ -275,9 +273,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, s_churn: float = 0.0, s_tmin: float = 0.0, s_tmax: float = float("inf"), @@ -290,11 +288,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. s_churn (`float`): s_tmin (`float`): @@ -312,11 +310,7 @@ def step( returned, otherwise a tuple is returned where the first element is the sample tensor. """ - if ( - isinstance(timestep, int) - or isinstance(timestep, torch.IntTensor) - or isinstance(timestep, torch.LongTensor) - ): + if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)): raise ValueError( ( "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" @@ -375,10 +369,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 6631ef63a8c1..485e919e9cc5 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -35,16 +35,16 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. 
`pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -99,11 +99,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -190,7 +190,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") if rescale_betas_zero_snr: self.betas = rescale_zero_terminal_snr(self.betas) @@ -250,21 +250,19 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ @@ -346,9 +344,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]: @@ -357,11 +355,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -377,11 +375,7 @@ def step( """ - if ( - isinstance(timestep, int) - or isinstance(timestep, torch.IntTensor) - or isinstance(timestep, torch.LongTensor) - ): + if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)): raise ValueError( ( "Passing integer indices (e.g. 
from `enumerate(timesteps)`) as timesteps to" @@ -450,10 +444,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index be5bbc235878..46e0e6baef81 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -35,16 +35,16 @@ class EulerDiscreteSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -99,11 +99,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -167,6 +167,9 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and dark samples instead of limiting it to samples with medium brightness. Loosely related to [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). + final_sigmas_type (`str`, defaults to `"zero"`): + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. 
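The new `final_sigmas_type` option controls the very last sigma of the inference schedule. A short sketch of overriding it when loading the scheduler, using the usual pattern of passing config overrides to `from_pretrained` (the model id is a placeholder):

from diffusers import EulerDiscreteScheduler

# "zero" (the default) ends the schedule at sigma = 0, matching the previously
# hard-coded behaviour; "sigma_min" keeps the smallest training sigma instead.
scheduler = EulerDiscreteScheduler.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # placeholder model id
    subfolder="scheduler",
    final_sigmas_type="sigma_min",
)
scheduler.set_timesteps(num_inference_steps=30)
print(scheduler.sigmas[-1])  # last sigma equals sigma_min rather than 0.0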
""" _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -189,6 +192,7 @@ def __init__( timestep_type: str = "discrete", # can be "discrete" or "continuous" steps_offset: int = 0, rescale_betas_zero_snr: bool = False, + final_sigmas_type: str = "zero", # can be "zero" or "sigma_min" ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -201,7 +205,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") if rescale_betas_zero_snr: self.betas = rescale_zero_terminal_snr(self.betas) @@ -270,21 +274,19 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -296,7 +298,13 @@ def scale_model_input( self.is_scale_input_called = True return sample - def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + ): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). @@ -305,60 +313,111 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic The number of diffusion steps used when generating samples with a pre-trained model. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary timesteps schedule. If `None`, timesteps will be generated + based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas` + must be `None`, and `timestep_spacing` attribute will be ignored. + sigmas (`List[float]`, *optional*): + Custom sigmas used to support arbitrary timesteps schedule schedule. If `None`, timesteps and sigmas + will be generated based on the relevant scheduler attributes. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`, and the timesteps will be generated based on the + custom sigmas schedule. """ - self.num_inference_steps = num_inference_steps - # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 - if self.config.timestep_spacing == "linspace": - timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[ - ::-1 - ].copy() - elif self.config.timestep_spacing == "leading": - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) - timesteps += self.config.steps_offset - elif self.config.timestep_spacing == "trailing": - step_ratio = self.config.num_train_timesteps / self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32) - timesteps -= 1 - else: + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` should be set.") + if num_inference_steps is None and timesteps is None and sigmas is None: + raise ValueError("Must pass exactly one of `num_inference_steps` or `timesteps` or `sigmas.") + if num_inference_steps is not None and (timesteps is not None or sigmas is not None): + raise ValueError("Can only pass one of `num_inference_steps` or `timesteps` or `sigmas`.") + if timesteps is not None and self.config.use_karras_sigmas: + raise ValueError("Cannot set `timesteps` with `config.use_karras_sigmas = True`.") + if ( + timesteps is not None + and self.config.timestep_type == "continuous" + and self.config.prediction_type == "v_prediction" + ): raise ValueError( - f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + "Cannot set `timesteps` with `config.timestep_type = 'continuous'` and `config.prediction_type = 'v_prediction'`." ) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - log_sigmas = np.log(sigmas) + if num_inference_steps is None: + num_inference_steps = len(timesteps) if timesteps is not None else len(sigmas) - 1 + self.num_inference_steps = num_inference_steps - if self.config.interpolation_type == "linear": - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - elif self.config.interpolation_type == "log_linear": - sigmas = torch.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp().numpy() - else: - raise ValueError( - f"{self.config.interpolation_type} is not implemented. Please specify interpolation_type to either" - " 'linear' or 'log_linear'" - ) + if sigmas is not None: + log_sigmas = np.log(np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)) + sigmas = np.array(sigmas).astype(np.float32) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas[:-1]]) - if self.config.use_karras_sigmas: - sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) - timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + else: + if timesteps is not None: + timesteps = np.array(timesteps).astype(np.float32) + else: + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace( + 0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=np.float32 + )[::-1].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = ( + (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) + ) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = ( + (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32) + ) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + if self.config.interpolation_type == "linear": + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + elif self.config.interpolation_type == "log_linear": + sigmas = torch.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp().numpy() + else: + raise ValueError( + f"{self.config.interpolation_type} is not implemented. Please specify interpolation_type to either" + " 'linear' or 'log_linear'" + ) + + if self.config.use_karras_sigmas: + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + + if self.config.final_sigmas_type == "sigma_min": + sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5 + elif self.config.final_sigmas_type == "zero": + sigma_last = 0 + else: + raise ValueError( + f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}" + ) + + sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32) sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) # TODO: Support the full EDM scalings for all prediction types and timestep types if self.config.timestep_type == "continuous" and self.config.prediction_type == "v_prediction": - self.timesteps = torch.Tensor([0.25 * sigma.log() for sigma in sigmas]).to(device=device) + self.timesteps = torch.Tensor([0.25 * sigma.log() for sigma in sigmas[:-1]]).to(device=device) else: self.timesteps = torch.from_numpy(timesteps.astype(np.float32)).to(device=device) - self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) self._step_index = None self._begin_index = None - self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication def _sigma_to_t(self, sigma, log_sigmas): # get log sigma @@ -384,7 +443,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, 
num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -433,9 +492,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, s_churn: float = 0.0, s_tmin: float = 0.0, s_tmax: float = float("inf"), @@ -448,11 +507,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. s_churn (`float`): s_tmin (`float`): @@ -471,11 +530,7 @@ def step( returned, otherwise a tuple is returned where the first element is the sample tensor. """ - if ( - isinstance(timestep, int) - or isinstance(timestep, torch.IntTensor) - or isinstance(timestep, torch.LongTensor) - ): + if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)): raise ValueError( ( "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" @@ -545,10 +600,10 @@ def step( def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): @@ -576,9 +631,7 @@ def add_noise( noisy_samples = original_samples + noise * sigma return noisy_samples - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.FloatTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor: if ( isinstance(timesteps, int) or isinstance(timesteps, torch.IntTensor) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index fb8f28f73d58..8d0a4a830f42 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -135,7 +135,7 @@ def __init__( elif beta_schedule == "exp": self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="exp") else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -198,21 +198,21 @@ def set_begin_index(self, begin_index: int = 0): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -224,9 +224,10 @@ def scale_model_input( def set_timesteps( self, - num_inference_steps: int, + num_inference_steps: Optional[int] = None, device: Union[str, torch.device] = None, num_train_timesteps: Optional[int] = None, + timesteps: Optional[List[int]] = None, ): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). @@ -236,30 +237,47 @@ def set_timesteps( The number of diffusion steps used when generating samples with a pre-trained model. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + num_train_timesteps (`int`, *optional*): + The number of diffusion steps used when training the model. If `None`, the default + `num_train_timesteps` attribute is used. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, timesteps will be + generated based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` + must be `None`, and `timestep_spacing` attribute will be ignored. """ + if num_inference_steps is None and timesteps is None: + raise ValueError("Must pass exactly one of `num_inference_steps` or `custom_timesteps`.") + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") + if timesteps is not None and self.config.use_karras_sigmas: + raise ValueError("Cannot use `timesteps` with `config.use_karras_sigmas = True`") + + num_inference_steps = num_inference_steps or len(timesteps) self.num_inference_steps = num_inference_steps - num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 - if self.config.timestep_spacing == "linspace": - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[::-1].copy() - elif self.config.timestep_spacing == "leading": - step_ratio = num_train_timesteps // self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) - timesteps += self.config.steps_offset - elif self.config.timestep_spacing == "trailing": - step_ratio = num_train_timesteps / self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32) - timesteps -= 1 + if timesteps is not None: + timesteps = np.array(timesteps, dtype=np.float32) else: - raise ValueError( - f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." - ) + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[::-1].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) @@ -311,7 +329,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -351,9 +369,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -361,11 +379,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. 
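As a minimal sketch of the new API surface introduced above for `HeunDiscreteScheduler.set_timesteps` (illustrative only, not part of this patch; the concrete step values and the `device` argument are assumptions, and the default config with `use_karras_sigmas=False` is assumed):

import torch
from diffusers import HeunDiscreteScheduler

scheduler = HeunDiscreteScheduler()

# The usual spacing-based call is unchanged:
scheduler.set_timesteps(num_inference_steps=10, device="cpu")

# After this change, explicit timesteps may be passed instead. `num_inference_steps`
# must then be omitted, and `config.use_karras_sigmas` must be False, as enforced by
# the new validation added above. The values below are arbitrary descending timesteps.
scheduler.set_timesteps(timesteps=[999, 749, 499, 249, 0], device="cpu")
print(scheduler.timesteps)
print(scheduler.sigmas)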
@@ -451,10 +469,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_ipndm.py b/src/diffusers/schedulers/scheduling_ipndm.py index afc8fd940eaf..9f15f1fe0aab 100644 --- a/src/diffusers/schedulers/scheduling_ipndm.py +++ b/src/diffusers/schedulers/scheduling_ipndm.py @@ -137,9 +137,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -147,11 +147,11 @@ def step( the linear multistep method. It performs one forward pass multiple times to approximate the solution. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -193,17 +193,17 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index fd2b94759f08..338412d96bd5 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -129,7 +129,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -175,21 +175,21 @@ def set_begin_index(self, begin_index: int = 0): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. 
timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -321,7 +321,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -376,9 +376,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -387,11 +387,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -477,10 +477,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 57a1af6b797a..de66a7b6eaa1 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -128,7 +128,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -175,21 +175,21 @@ def set_begin_index(self, begin_index: int = 0): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. 
Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -334,7 +334,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -361,9 +361,9 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -371,11 +371,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -452,10 +452,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_karras_ve_flax.py b/src/diffusers/schedulers/scheduling_karras_ve_flax.py index 4d099604a9e5..0d387b53ac3e 100644 --- a/src/diffusers/schedulers/scheduling_karras_ve_flax.py +++ b/src/diffusers/schedulers/scheduling_karras_ve_flax.py @@ -176,10 +176,10 @@ def step( Args: state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class. - model_output (`torch.FloatTensor` or `np.ndarray`): direct output from learned diffusion model. + model_output (`torch.Tensor` or `np.ndarray`): direct output from learned diffusion model. sigma_hat (`float`): TODO sigma_prev (`float`): TODO - sample_hat (`torch.FloatTensor` or `np.ndarray`): TODO + sample_hat (`torch.Tensor` or `np.ndarray`): TODO return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class Returns: @@ -213,12 +213,12 @@ def step_correct( Args: state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class. - model_output (`torch.FloatTensor` or `np.ndarray`): direct output from learned diffusion model. + model_output (`torch.Tensor` or `np.ndarray`): direct output from learned diffusion model. 
sigma_hat (`float`): TODO sigma_prev (`float`): TODO - sample_hat (`torch.FloatTensor` or `np.ndarray`): TODO - sample_prev (`torch.FloatTensor` or `np.ndarray`): TODO - derivative (`torch.FloatTensor` or `np.ndarray`): TODO + sample_hat (`torch.Tensor` or `np.ndarray`): TODO + sample_prev (`torch.Tensor` or `np.ndarray`): TODO + derivative (`torch.Tensor` or `np.ndarray`): TODO return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class Returns: diff --git a/src/diffusers/schedulers/scheduling_lcm.py b/src/diffusers/schedulers/scheduling_lcm.py index f15fe0adf261..f1aa09ab1723 100644 --- a/src/diffusers/schedulers/scheduling_lcm.py +++ b/src/diffusers/schedulers/scheduling_lcm.py @@ -37,16 +37,16 @@ class LCMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - denoised: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + denoised: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -95,17 +95,17 @@ def alpha_bar_fn(t): # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr -def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor: +def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor: """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -224,7 +224,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: @@ -296,24 +296,24 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -497,9 +497,9 @@ def get_scalings_for_boundary_condition_discrete(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[LCMSchedulerOutput, Tuple]: @@ -508,11 +508,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -594,10 +594,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -619,9 +619,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py index 61a91783c25b..f96a819db1c6 100644 --- a/src/diffusers/schedulers/scheduling_lms_discrete.py +++ b/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -32,16 +32,16 @@ class LMSDiscreteSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. 
""" - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -149,7 +149,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -202,21 +202,19 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ @@ -351,7 +349,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min: float = in_sigmas[-1].item() @@ -366,9 +364,9 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor: def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, order: int = 4, return_dict: bool = True, ) -> Union[LMSDiscreteSchedulerOutput, Tuple]: @@ -377,11 +375,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. order (`int`, defaults to 4): The order of the linear multistep method. 
@@ -444,10 +442,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py index e7c861ea3831..a05e71c3c225 100644 --- a/src/diffusers/schedulers/scheduling_pndm.py +++ b/src/diffusers/schedulers/scheduling_pndm.py @@ -135,7 +135,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -225,9 +225,9 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -236,11 +236,11 @@ def step( or [`~PNDMScheduler.step_plms`] depending on the internal variable `counter`. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -258,9 +258,9 @@ def step( def step_prk( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -269,11 +269,11 @@ def step_prk( equation. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -318,9 +318,9 @@ def step_prk( def step_plms( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -328,11 +328,11 @@ def step_plms( the linear multistep method. It performs one forward pass multiple times to approximate the solution. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. 
return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -387,17 +387,17 @@ def step_plms( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -448,10 +448,10 @@ def _get_prev_sample(self, sample, timestep, prev_timestep, model_output): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py index ccd3d2743192..97665bb5277b 100644 --- a/src/diffusers/schedulers/scheduling_repaint.py +++ b/src/diffusers/schedulers/scheduling_repaint.py @@ -31,16 +31,16 @@ class RePaintSchedulerOutput(BaseOutput): Output class for the scheduler's step function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample (x_{0}) based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: torch.FloatTensor + prev_sample: torch.Tensor + pred_original_sample: torch.Tensor # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -143,7 +143,7 @@ def __init__( betas = torch.linspace(-6, 6, num_train_timesteps) self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -160,19 +160,19 @@ def __init__( self.eta = eta - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. 
Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -245,11 +245,11 @@ def _get_variance(self, t): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, - original_image: torch.FloatTensor, - mask: torch.FloatTensor, + sample: torch.Tensor, + original_image: torch.Tensor, + mask: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[RePaintSchedulerOutput, Tuple]: @@ -258,15 +258,15 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. - original_image (`torch.FloatTensor`): + original_image (`torch.Tensor`): The original image to inpaint on. - mask (`torch.FloatTensor`): + mask (`torch.Tensor`): The mask where a value of 0.0 indicates which part of the original image to inpaint. generator (`torch.Generator`, *optional*): A random number generator. @@ -351,10 +351,10 @@ def undo_step(self, sample, timestep, generator=None): def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.") def __len__(self): diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py index b8d95c609bf1..50049a530800 100644 --- a/src/diffusers/schedulers/scheduling_sasolver.py +++ b/src/diffusers/schedulers/scheduling_sasolver.py @@ -180,7 +180,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -194,7 +194,7 @@ def __init__( self.init_noise_sigma = 1.0 if algorithm_type not in ["data_prediction", "noise_prediction"]: - raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}") # setable values self.num_inference_steps = None @@ -305,7 +305,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -370,7 +370,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: 
torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -397,11 +397,11 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. Noise_prediction is designed to discretize an integral of the noise prediction model, and data_prediction is @@ -415,13 +415,13 @@ def convert_model_output( Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -686,29 +686,29 @@ def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, def stochastic_adams_bashforth_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor, - noise: torch.FloatTensor, + sample: torch.Tensor, + noise: torch.Tensor, order: int, - tau: torch.FloatTensor, + tau: torch.Tensor, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the SA-Predictor. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model at the current timestep. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. order (`int`): The order of SA-Predictor at this timestep. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) @@ -813,32 +813,32 @@ def stochastic_adams_bashforth_update( def stochastic_adams_moulton_update( self, - this_model_output: torch.FloatTensor, + this_model_output: torch.Tensor, *args, - last_sample: torch.FloatTensor, - last_noise: torch.FloatTensor, - this_sample: torch.FloatTensor, + last_sample: torch.Tensor, + last_noise: torch.Tensor, + this_sample: torch.Tensor, order: int, - tau: torch.FloatTensor, + tau: torch.Tensor, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the SA-Corrector. Args: - this_model_output (`torch.FloatTensor`): + this_model_output (`torch.Tensor`): The model outputs at `x_t`. this_timestep (`int`): The current timestep `t`. - last_sample (`torch.FloatTensor`): + last_sample (`torch.Tensor`): The generated sample before the last predictor `x_{t-1}`. - this_sample (`torch.FloatTensor`): + this_sample (`torch.Tensor`): The generated sample after the last predictor `x_{t}`. order (`int`): The order of SA-Corrector at this step. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The corrected sample tensor at the current timestep. 
""" @@ -979,9 +979,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -990,11 +990,11 @@ def step( the SA-Solver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -1079,17 +1079,17 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -1097,10 +1097,10 @@ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index 8f8dd1877331..cedfbf7d6ad5 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -32,15 +32,15 @@ class SdeVeOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - prev_sample_mean (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample_mean (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Mean averaged `prev_sample` over previous timesteps. """ - prev_sample: torch.FloatTensor - prev_sample_mean: torch.FloatTensor + prev_sample: torch.Tensor + prev_sample_mean: torch.Tensor class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): @@ -86,19 +86,19 @@ def __init__( self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -159,9 +159,9 @@ def get_adjacent_sigma(self, timesteps, t): def step_pred( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[SdeVeOutput, Tuple]: @@ -170,11 +170,11 @@ def step_pred( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -227,8 +227,8 @@ def step_pred( def step_correct( self, - model_output: torch.FloatTensor, - sample: torch.FloatTensor, + model_output: torch.Tensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -237,9 +237,9 @@ def step_correct( making the prediction for the previous timestep. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -282,10 +282,10 @@ def step_correct( def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples timesteps = timesteps.to(original_samples.device) sigmas = self.discrete_sigmas.to(original_samples.device)[timesteps] diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 0216b7afc80a..580224404c54 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -37,15 +37,15 @@ class TCDSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_noised_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_noised_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted noised sample `(x_{s})` based on the model output from the current timestep. 
""" - prev_sample: torch.FloatTensor - pred_noised_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_noised_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -94,17 +94,17 @@ def alpha_bar_fn(t): # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr -def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor: +def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor: """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -225,7 +225,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") # Rescale for zero SNR if rescale_betas_zero_snr: @@ -297,19 +297,19 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -326,7 +326,7 @@ def _get_variance(self, timestep, prev_timestep): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -524,9 +524,9 @@ def set_timesteps( def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.3, generator: Optional[torch.Generator] = None, return_dict: bool = True, @@ -536,11 +536,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. 
eta (`float`): A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every @@ -631,10 +631,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -656,9 +656,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index c99e97cd85dd..6e1580290f22 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -32,16 +32,16 @@ class UnCLIPSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -146,17 +146,17 @@ def __init__( self.variance_type = variance_type - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
Args: - sample (`torch.FloatTensor`): input sample + sample (`torch.Tensor`): input sample timestep (`int`, optional): current timestep Returns: - `torch.FloatTensor`: scaled input sample + `torch.Tensor`: scaled input sample """ return sample @@ -215,9 +215,9 @@ def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, prev_timestep: Optional[int] = None, generator=None, return_dict: bool = True, @@ -227,9 +227,9 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. prev_timestep (`int`, *optional*): The previous timestep to predict the previous sample at. Used to dynamically compute beta. If not given, `t-1` is used and the pre-computed beta is used. @@ -327,10 +327,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 74e97a33f1e2..367761ce3fef 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -78,11 +78,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. 
Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -211,7 +211,7 @@ def __init__( # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") if rescale_betas_zero_snr: self.betas = rescale_zero_terminal_snr(self.betas) @@ -237,7 +237,7 @@ def __init__( if solver_type in ["midpoint", "heun", "logrho"]: self.register_to_config(solver_type="bh2") else: - raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}") self.predict_x0 = predict_x0 # setable values @@ -360,7 +360,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -425,7 +425,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -452,24 +452,24 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Convert the model output to the corresponding type the UniPC algorithm needs. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -522,27 +522,27 @@ def convert_model_output( def multistep_uni_p_bh_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, order: int = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model at the current timestep. 
prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. order (`int`): The order of UniP at this timestep (corresponds to the *p* in UniPC-p). Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) @@ -651,30 +651,30 @@ def multistep_uni_p_bh_update( def multistep_uni_c_bh_update( self, - this_model_output: torch.FloatTensor, + this_model_output: torch.Tensor, *args, - last_sample: torch.FloatTensor = None, - this_sample: torch.FloatTensor = None, + last_sample: torch.Tensor = None, + this_sample: torch.Tensor = None, order: int = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the UniC (B(h) version). Args: - this_model_output (`torch.FloatTensor`): + this_model_output (`torch.Tensor`): The model outputs at `x_t`. this_timestep (`int`): The current timestep `t`. - last_sample (`torch.FloatTensor`): + last_sample (`torch.Tensor`): The generated sample before the last predictor `x_{t-1}`. - this_sample (`torch.FloatTensor`): + this_sample (`torch.Tensor`): The generated sample after the last predictor `x_{t}`. order (`int`): The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The corrected sample tensor at the current timestep. """ this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None) @@ -821,9 +821,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -831,11 +831,11 @@ def step( the multistep UniPC. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -900,17 +900,17 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. 
""" return sample @@ -918,10 +918,10 @@ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_utils.py b/src/diffusers/schedulers/scheduling_utils.py index dcdce6c51f05..33d34e26d89d 100644 --- a/src/diffusers/schedulers/scheduling_utils.py +++ b/src/diffusers/schedulers/scheduling_utils.py @@ -48,18 +48,27 @@ class KarrasDiffusionSchedulers(Enum): EDMEulerScheduler = 15 +AysSchedules = { + "StableDiffusionTimesteps": [999, 850, 736, 645, 545, 455, 343, 233, 124, 24], + "StableDiffusionSigmas": [14.615, 6.475, 3.861, 2.697, 1.886, 1.396, 0.963, 0.652, 0.399, 0.152, 0.0], + "StableDiffusionXLTimesteps": [999, 845, 730, 587, 443, 310, 193, 116, 53, 13], + "StableDiffusionXLSigmas": [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0], + "StableDiffusionVideoSigmas": [700.00, 54.5, 15.886, 7.977, 4.248, 1.789, 0.981, 0.403, 0.173, 0.034, 0.0], +} + + @dataclass class SchedulerOutput(BaseOutput): """ Base class for the output of a scheduler's `step` function. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. 
""" - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor class SchedulerMixin(PushToHubMixin): diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py index 03ba95cad68b..bd8d255fa901 100644 --- a/src/diffusers/schedulers/scheduling_vq_diffusion.py +++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py @@ -38,7 +38,7 @@ class VQDiffusionSchedulerOutput(BaseOutput): prev_sample: torch.LongTensor -def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTensor: +def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.Tensor: """ Convert batch of vector of class indices into batch of log onehot vectors @@ -50,7 +50,7 @@ def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTen number of classes to be used for the onehot vectors Returns: - `torch.FloatTensor` of shape `(batch size, num classes, vector length)`: + `torch.Tensor` of shape `(batch size, num classes, vector length)`: Log onehot vectors """ x_onehot = F.one_hot(x, num_classes) @@ -59,7 +59,7 @@ def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTen return log_x -def gumbel_noised(logits: torch.FloatTensor, generator: Optional[torch.Generator]) -> torch.FloatTensor: +def gumbel_noised(logits: torch.Tensor, generator: Optional[torch.Generator]) -> torch.Tensor: """ Apply gumbel noise to `logits` """ @@ -199,7 +199,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: torch.long, sample: torch.LongTensor, generator: Optional[torch.Generator] = None, @@ -210,7 +210,7 @@ def step( [`~VQDiffusionScheduler.q_posterior`] for more details about how the distribution is computer. Args: - log_p_x_0: (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`): + log_p_x_0: (`torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`): The log probabilities for the predicted classes of the initial latent pixels. Does not include a prediction for the masked class as the initial unnoised image cannot be masked. t (`torch.long`): @@ -251,7 +251,7 @@ def q_posterior(self, log_p_x_0, x_t, t): ``` Args: - log_p_x_0 (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`): + log_p_x_0 (`torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`): The log probabilities for the predicted classes of the initial latent pixels. Does not include a prediction for the masked class as the initial unnoised image cannot be masked. x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`): @@ -260,7 +260,7 @@ def q_posterior(self, log_p_x_0, x_t, t): The timestep that determines which transition matrix is used. Returns: - `torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`: + `torch.Tensor` of shape `(batch size, num classes, num latent pixels)`: The log probabilities for the predicted classes of the image at timestep `t-1`. 
""" log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed) @@ -354,7 +354,7 @@ def q_posterior(self, log_p_x_0, x_t, t): return log_p_x_t_min_1 def log_Q_t_transitioning_to_known_class( - self, *, t: torch.int, x_t: torch.LongTensor, log_onehot_x_t: torch.FloatTensor, cumulative: bool + self, *, t: torch.int, x_t: torch.LongTensor, log_onehot_x_t: torch.Tensor, cumulative: bool ): """ Calculates the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each @@ -365,14 +365,14 @@ def log_Q_t_transitioning_to_known_class( The timestep that determines which transition matrix is used. x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`): The classes of each latent pixel at time `t`. - log_onehot_x_t (`torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`): + log_onehot_x_t (`torch.Tensor` of shape `(batch size, num classes, num latent pixels)`): The log one-hot vectors of `x_t`. cumulative (`bool`): If cumulative is `False`, the single step transition matrix `t-1`->`t` is used. If cumulative is `True`, the cumulative transition matrix `0`->`t` is used. Returns: - `torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`: + `torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`: Each _column_ of the returned matrix is a _row_ of log probabilities of the complete probability transition matrix. diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index cdc92036613d..04f91d758b94 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -58,20 +58,25 @@ get_objects_from_module, is_accelerate_available, is_accelerate_version, + is_bitsandbytes_available, is_bs4_available, is_flax_available, is_ftfy_available, + is_google_colab, is_inflect_available, is_invisible_watermark_available, is_k_diffusion_available, is_k_diffusion_version, is_librosa_available, is_note_seq_available, + is_notebook, is_onnx_available, is_peft_available, is_peft_version, + is_safetensors_available, is_scipy_available, is_tensorboard_available, + is_timm_available, is_torch_available, is_torch_npu_available, is_torch_version, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index f5f57d8a5c5f..b8ce2d7c0466 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -295,6 +295,39 @@ except importlib_metadata.PackageNotFoundError: _torchvision_available = False +_timm_available = importlib.util.find_spec("timm") is not None +if _timm_available: + try: + _timm_version = importlib_metadata.version("timm") + logger.info(f"Timm version {_timm_version} available.") + except importlib_metadata.PackageNotFoundError: + _timm_available = False + + +def is_timm_available(): + return _timm_available + + +_bitsandbytes_available = importlib.util.find_spec("bitsandbytes") is not None +try: + _bitsandbytes_version = importlib_metadata.version("bitsandbytes") + logger.debug(f"Successfully imported bitsandbytes version {_bitsandbytes_version}") +except importlib_metadata.PackageNotFoundError: + _bitsandbytes_available = False + +# Taken from `huggingface_hub`. +_is_notebook = False +try: + shell_class = get_ipython().__class__ # type: ignore # noqa: F821 + for parent_class in shell_class.__mro__: # e.g. 
"is subclass of" + if parent_class.__name__ == "ZMQInteractiveShell": + _is_notebook = True # Jupyter notebook, Google colab or qtconsole + break +except NameError: + pass # Probably standard Python interpreter + +_is_google_colab = "google.colab" in sys.modules + def is_torch_available(): return _torch_available @@ -392,6 +425,22 @@ def is_torchvision_available(): return _torchvision_available +def is_safetensors_available(): + return _safetensors_available + + +def is_bitsandbytes_available(): + return _bitsandbytes_available + + +def is_notebook(): + return _is_notebook + + +def is_google_colab(): + return _is_google_colab + + # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -499,6 +548,20 @@ def is_torchvision_available(): {0} requires the invisible-watermark library but it was not found in your environment. You can install it with pip: `pip install invisible-watermark>=0.2.0` """ +# docstyle-ignore +PEFT_IMPORT_ERROR = """ +{0} requires the peft library but it was not found in your environment. You can install it with pip: `pip install peft` +""" + +# docstyle-ignore +SAFETENSORS_IMPORT_ERROR = """ +{0} requires the safetensors library but it was not found in your environment. You can install it with pip: `pip install safetensors` +""" + +# docstyle-ignore +BITSANDBYTES_IMPORT_ERROR = """ +{0} requires the bitsandbytes library but it was not found in your environment. You can install it with pip: `pip install bitsandbytes` +""" BACKENDS_MAPPING = OrderedDict( [ @@ -520,6 +583,9 @@ def is_torchvision_available(): ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), ("torchsde", (is_torchsde_available, TORCHSDE_IMPORT_ERROR)), ("invisible_watermark", (is_invisible_watermark_available, INVISIBLE_WATERMARK_IMPORT_ERROR)), + ("peft", (is_peft_available, PEFT_IMPORT_ERROR)), + ("safetensors", (is_safetensors_available, SAFETENSORS_IMPORT_ERROR)), + ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), ] ) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index c1756d6590d1..8a6afd768428 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -33,6 +33,7 @@ is_onnx_available, is_opencv_available, is_peft_available, + is_timm_available, is_torch_available, is_torch_version, is_torchsde_available, @@ -340,6 +341,13 @@ def require_peft_backend(test_case): return unittest.skipUnless(USE_PEFT_BACKEND, "test requires PEFT backend")(test_case) +def require_timm(test_case): + """ + Decorator marking a test that requires timm. These tests are skipped when timm isn't installed. + """ + return unittest.skipUnless(is_timm_available(), "test requires timm")(test_case) + + def require_peft_version_greater(peft_version): """ Decorator marking a test that requires PEFT backend with a specific version, this would require some specific diff --git a/src/diffusers/video_processor.py b/src/diffusers/video_processor.py new file mode 100644 index 000000000000..9e2727b85377 --- /dev/null +++ b/src/diffusers/video_processor.py @@ -0,0 +1,113 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import List, Optional, Union + +import numpy as np +import PIL +import torch + +from .image_processor import VaeImageProcessor, is_valid_image, is_valid_image_imagelist + + +class VideoProcessor(VaeImageProcessor): + r"""Simple video processor.""" + + def preprocess_video(self, video, height: Optional[int] = None, width: Optional[int] = None) -> torch.Tensor: + r""" + Preprocesses input video(s). + + Args: + video (`List[PIL.Image]`, `List[List[PIL.Image]]`, `torch.Tensor`, `np.array`, `List[torch.Tensor]`, `List[np.array]`): + The input video. It can be one of the following: + * List of PIL images. + * List of lists of PIL images. + * 4D Torch tensors (expected shape for each tensor `(num_frames, num_channels, height, width)`). + * 4D NumPy arrays (expected shape for each array `(num_frames, height, width, num_channels)`). + * List of 4D Torch tensors (expected shape for each tensor `(num_frames, num_channels, height, + width)`). + * List of 4D NumPy arrays (expected shape for each array `(num_frames, height, width, num_channels)`). + * 5D NumPy arrays: expected shape for each array `(batch_size, num_frames, height, width, + num_channels)`. + * 5D Torch tensors: expected shape for each array `(batch_size, num_frames, num_channels, height, + width)`. + height (`int`, *optional*, defaults to `None`): + The height of the preprocessed frames of the video. If `None`, will use `get_default_height_width()` to + get the default height. + width (`int`, *optional*, defaults to `None`): + The width of the preprocessed frames of the video. If `None`, will use `get_default_height_width()` to get + the default width. + """ + if isinstance(video, list) and isinstance(video[0], np.ndarray) and video[0].ndim == 5: + warnings.warn( + "Passing `video` as a list of 5d np.ndarray is deprecated. " + "Please concatenate the list along the batch dimension and pass it as a single 5d np.ndarray", + FutureWarning, + ) + video = np.concatenate(video, axis=0) + if isinstance(video, list) and isinstance(video[0], torch.Tensor) and video[0].ndim == 5: + warnings.warn( + "Passing `video` as a list of 5d torch.Tensor is deprecated. " + "Please concatenate the list along the batch dimension and pass it as a single 5d torch.Tensor", + FutureWarning, + ) + video = torch.cat(video, axis=0) + + # ensure the input is a list of videos: + # - if it is a batch of videos (5d torch.Tensor or np.ndarray), it is converted to a list of videos (a list of 4d torch.Tensor or np.ndarray) + # - if it is a single video, it is converted to a list of one video. + if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5: + video = list(video) + elif (isinstance(video, list) and is_valid_image(video[0])) or is_valid_image_imagelist(video): + video = [video] + elif isinstance(video, list) and is_valid_image_imagelist(video[0]): + video = video + else: + raise ValueError( + "Input is in incorrect format. 
Currently, we only support numpy.ndarray, torch.Tensor, PIL.Image.Image" + ) + + video = torch.stack([self.preprocess(img, height=height, width=width) for img in video], dim=0) + + # Move the number of channels before the number of frames. + video = video.permute(0, 2, 1, 3, 4) + + return video + + def postprocess_video( + self, video: torch.Tensor, output_type: str = "np" + ) -> Union[np.ndarray, torch.Tensor, List[PIL.Image.Image]]: + r""" + Converts a video tensor to a list of frames for export. + + Args: + video (`torch.Tensor`): The video as a tensor. + output_type (`str`, defaults to `"np"`): Output type of the postprocessed `video` tensor. + """ + batch_size = video.shape[0] + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = self.postprocess(batch_vid, output_type) + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + elif output_type == "pt": + outputs = torch.stack(outputs) + elif output_type != "pil": + raise ValueError(f"{output_type} is not a supported output type. Please choose one of ['np', 'pt', 'pil']") + + return outputs diff --git a/tests/lora/test_lora_layers_sd.py b/tests/lora/test_lora_layers_sd.py index fc28d94c240b..46b965ec33d9 100644 --- a/tests/lora/test_lora_layers_sd.py +++ b/tests/lora/test_lora_layers_sd.py @@ -642,7 +642,7 @@ def test_sd_load_civitai_empty_network_alpha(self): This test simply checks that loading a LoRA with an empty network alpha works fine See: https://github.com/huggingface/diffusers/issues/5606 """ - pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device) + pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipeline.enable_sequential_cpu_offload() civitai_path = hf_hub_download("ybelkada/test-ahi-civitai", "ahi_lora_weights.safetensors") pipeline.load_lora_weights(civitai_path, adapter_name="ahri") diff --git a/tests/lora/test_lora_layers_sdxl.py b/tests/lora/test_lora_layers_sdxl.py index a8b2d2759f41..f6ca4f304eb9 100644 --- a/tests/lora/test_lora_layers_sdxl.py +++ b/tests/lora/test_lora_layers_sdxl.py @@ -194,7 +194,7 @@ def test_sdxl_1_0_lora(self): ).images images = images[0, -3:, -3:, -1].flatten() - expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535]) + expected = np.array([0.4468, 0.4061, 0.4134, 0.3637, 0.3202, 0.365, 0.3786, 0.3725, 0.3535]) max_diff = numpy_cosine_similarity_distance(expected, images) assert max_diff < 1e-4 @@ -224,7 +224,7 @@ def test_sdxl_1_0_blockwise_lora(self): ).images images = images[0, -3:, -3:, -1].flatten() - expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535]) + expected = np.array([0.4468, 0.4061, 0.4134, 0.3637, 0.3202, 0.365, 0.3786, 0.3725, 0.3535]) max_diff = numpy_cosine_similarity_distance(expected, images) assert max_diff < 1e-4 @@ -283,7 +283,7 @@ def test_sdxl_1_0_lora_fusion(self): images = images[0, -3:, -3:, -1].flatten() # This way we also test equivalence between LoRA fusion and the non-fusion behaviour. 
- expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535]) + expected = np.array([0.4468, 0.4061, 0.4134, 0.3637, 0.3202, 0.365, 0.3786, 0.3725, 0.3535]) max_diff = numpy_cosine_similarity_distance(expected, images) assert max_diff < 1e-4 @@ -507,13 +507,12 @@ def test_controlnet_canny_lora(self): image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" ) - images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images assert images[0].shape == (768, 512, 3) original_image = images[0, -3:, -3:, -1].flatten() - expected_image = np.array([0.4574, 0.4461, 0.4435, 0.4462, 0.4396, 0.439, 0.4474, 0.4486, 0.4333]) + expected_image = np.array([0.4574, 0.4487, 0.4435, 0.5163, 0.4396, 0.4411, 0.518, 0.4465, 0.4333]) max_diff = numpy_cosine_similarity_distance(expected_image, original_image) assert max_diff < 1e-4 diff --git a/tests/models/autoencoders/test_models_vae.py b/tests/models/autoencoders/test_models_vae.py index 06571126388d..d78479247347 100644 --- a/tests/models/autoencoders/test_models_vae.py +++ b/tests/models/autoencoders/test_models_vae.py @@ -791,62 +791,6 @@ def test_stable_diffusion_encode_sample(self, seed, expected_slice): tolerance = 3e-3 if torch_device != "mps" else 1e-2 assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) - def test_stable_diffusion_model_local(self): - model_id = "stabilityai/sd-vae-ft-mse" - model_1 = AutoencoderKL.from_pretrained(model_id).to(torch_device) - - url = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors" - model_2 = AutoencoderKL.from_single_file(url).to(torch_device) - image = self.get_sd_image(33) - - with torch.no_grad(): - sample_1 = model_1(image).sample - sample_2 = model_2(image).sample - - assert sample_1.shape == sample_2.shape - - output_slice_1 = sample_1[-1, -2:, -2:, :2].flatten().float().cpu() - output_slice_2 = sample_2[-1, -2:, -2:, :2].flatten().float().cpu() - - assert torch_all_close(output_slice_1, output_slice_2, atol=3e-3) - - def test_single_file_component_configs(self): - vae_single_file = AutoencoderKL.from_single_file( - "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors" - ) - vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae") - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values"] - for param_name, param_value in vae_single_file.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - vae.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - - def test_single_file_arguments(self): - vae_default = AutoencoderKL.from_single_file( - "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors", - ) - - assert vae_default.config.scaling_factor == 0.18215 - assert vae_default.config.sample_size == 512 - assert vae_default.dtype == torch.float32 - - scaling_factor = 2.0 - image_size = 256 - torch_dtype = torch.float16 - - vae = AutoencoderKL.from_single_file( - "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors", - image_size=image_size, - scaling_factor=scaling_factor, - torch_dtype=torch_dtype, - ) - assert vae.config.scaling_factor == scaling_factor - assert vae.config.sample_size == 
image_size - assert vae.dtype == torch_dtype - @slow class AsymmetricAutoencoderKLIntegrationTests(unittest.TestCase): diff --git a/tests/models/autoencoders/test_models_vq.py b/tests/models/autoencoders/test_models_vq.py index d4262e2709dc..c61ae1bdf0ff 100644 --- a/tests/models/autoencoders/test_models_vq.py +++ b/tests/models/autoencoders/test_models_vq.py @@ -98,3 +98,19 @@ def test_output_pretrained(self): expected_output_slice = torch.tensor([-0.0153, -0.4044, -0.1880, -0.5161, -0.2418, -0.4072, -0.1612, -0.0633, -0.0143]) # fmt: on self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) + + def test_loss_pretrained(self): + model = VQModel.from_pretrained("fusing/vqgan-dummy") + model.to(torch_device).eval() + + torch.manual_seed(0) + backend_manual_seed(torch_device, 0) + + image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) + image = image.to(torch_device) + with torch.no_grad(): + output = model(image).commit_loss.cpu() + # fmt: off + expected_output = torch.tensor([0.1936]) + # fmt: on + self.assertTrue(torch.allclose(output, expected_output, atol=1e-3)) diff --git a/tests/models/unets/test_models_unet_stable_cascade.py b/tests/models/unets/test_models_unet_stable_cascade.py deleted file mode 100644 index 6b1f2fdd2485..000000000000 --- a/tests/models/unets/test_models_unet_stable_cascade.py +++ /dev/null @@ -1,191 +0,0 @@ -# coding=utf-8 -# Copyright 2024 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import torch - -from diffusers import StableCascadeUNet -from diffusers.utils import logging -from diffusers.utils.testing_utils import ( - enable_full_determinism, - numpy_cosine_similarity_distance, - require_torch_gpu, - slow, -) -from diffusers.utils.torch_utils import randn_tensor - - -logger = logging.get_logger(__name__) - -enable_full_determinism() - - -@slow -class StableCascadeUNetModelSlowTests(unittest.TestCase): - def tearDown(self) -> None: - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_cascade_unet_prior_single_file_components(self): - single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors" - single_file_unet = StableCascadeUNet.from_single_file(single_file_url) - - single_file_unet_config = single_file_unet.config - del single_file_unet - gc.collect() - torch.cuda.empty_cache() - - unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade-prior", subfolder="prior", variant="bf16") - unet_config = unet.config - del unet - gc.collect() - torch.cuda.empty_cache() - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values"] - for param_name, param_value in single_file_unet_config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - - assert unet_config[param_name] == param_value - - def test_stable_cascade_unet_decoder_single_file_components(self): - single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_bf16.safetensors" - single_file_unet = StableCascadeUNet.from_single_file(single_file_url) - - single_file_unet_config = single_file_unet.config - del single_file_unet - gc.collect() - torch.cuda.empty_cache() - - unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade", subfolder="decoder", variant="bf16") - unet_config = unet.config - del unet - gc.collect() - torch.cuda.empty_cache() - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values"] - for param_name, param_value in single_file_unet_config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - - assert unet_config[param_name] == param_value - - def test_stable_cascade_unet_config_loading(self): - config = StableCascadeUNet.load_config( - pretrained_model_name_or_path="diffusers/stable-cascade-configs", subfolder="prior" - ) - single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors" - - single_file_unet = StableCascadeUNet.from_single_file(single_file_url, config=config) - single_file_unet_config = single_file_unet.config - del single_file_unet - gc.collect() - torch.cuda.empty_cache() - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values"] - for param_name, param_value in config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - - assert single_file_unet_config[param_name] == param_value - - @require_torch_gpu - def test_stable_cascade_unet_single_file_prior_forward_pass(self): - dtype = torch.bfloat16 - generator = torch.Generator("cpu") - - model_inputs = { - "sample": randn_tensor((1, 16, 24, 24), generator=generator.manual_seed(0)).to("cuda", dtype), - "timestep_ratio": torch.tensor([1]).to("cuda", dtype), - "clip_text_pooled": randn_tensor((1, 1, 1280), generator=generator.manual_seed(0)).to("cuda", dtype), - "clip_text": randn_tensor((1, 77, 1280), generator=generator.manual_seed(0)).to("cuda", dtype), - "clip_img": randn_tensor((1, 1, 768), generator=generator.manual_seed(0)).to("cuda", dtype), - "pixels": 
randn_tensor((1, 3, 8, 8), generator=generator.manual_seed(0)).to("cuda", dtype), - } - - unet = StableCascadeUNet.from_pretrained( - "stabilityai/stable-cascade-prior", - subfolder="prior", - revision="refs/pr/2", - variant="bf16", - torch_dtype=dtype, - ) - unet.to("cuda") - with torch.no_grad(): - prior_output = unet(**model_inputs).sample.float().cpu().numpy() - - # Remove UNet from GPU memory before loading the single file UNet model - del unet - gc.collect() - torch.cuda.empty_cache() - - single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors" - single_file_unet = StableCascadeUNet.from_single_file(single_file_url, torch_dtype=dtype) - single_file_unet.to("cuda") - with torch.no_grad(): - prior_single_file_output = single_file_unet(**model_inputs).sample.float().cpu().numpy() - - # Remove UNet from GPU memory before loading the single file UNet model - del single_file_unet - gc.collect() - torch.cuda.empty_cache() - - max_diff = numpy_cosine_similarity_distance(prior_output.flatten(), prior_single_file_output.flatten()) - assert max_diff < 8e-3 - - @require_torch_gpu - def test_stable_cascade_unet_single_file_decoder_forward_pass(self): - dtype = torch.float32 - generator = torch.Generator("cpu") - - model_inputs = { - "sample": randn_tensor((1, 4, 256, 256), generator=generator.manual_seed(0)).to("cuda", dtype), - "timestep_ratio": torch.tensor([1]).to("cuda", dtype), - "clip_text": randn_tensor((1, 77, 1280), generator=generator.manual_seed(0)).to("cuda", dtype), - "clip_text_pooled": randn_tensor((1, 1, 1280), generator=generator.manual_seed(0)).to("cuda", dtype), - "pixels": randn_tensor((1, 3, 8, 8), generator=generator.manual_seed(0)).to("cuda", dtype), - } - - unet = StableCascadeUNet.from_pretrained( - "stabilityai/stable-cascade", - subfolder="decoder", - revision="refs/pr/44", - torch_dtype=dtype, - ) - unet.to("cuda") - with torch.no_grad(): - prior_output = unet(**model_inputs).sample.float().cpu().numpy() - - # Remove UNet from GPU memory before loading the single file UNet model - del unet - gc.collect() - torch.cuda.empty_cache() - - single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b.safetensors" - single_file_unet = StableCascadeUNet.from_single_file(single_file_url, torch_dtype=dtype) - single_file_unet.to("cuda") - with torch.no_grad(): - prior_single_file_output = single_file_unet(**model_inputs).sample.float().cpu().numpy() - - # Remove UNet from GPU memory before loading the single file UNet model - del single_file_unet - gc.collect() - torch.cuda.empty_cache() - - max_diff = numpy_cosine_similarity_distance(prior_output.flatten(), prior_single_file_output.flatten()) - assert max_diff < 1e-4 diff --git a/tests/others/test_check_copies.py b/tests/others/test_check_copies.py index 6e1c8fcfa54b..5835712343c7 100644 --- a/tests/others/test_check_copies.py +++ b/tests/others/test_check_copies.py @@ -32,16 +32,16 @@ Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. 
- pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. \""" - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None """ diff --git a/tests/others/test_video_processor.py b/tests/others/test_video_processor.py new file mode 100644 index 000000000000..a2fc87717f9d --- /dev/null +++ b/tests/others/test_video_processor.py @@ -0,0 +1,169 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import PIL.Image +import torch +from parameterized import parameterized + +from diffusers.video_processor import VideoProcessor + + +np.random.seed(0) +torch.manual_seed(0) + + +class VideoProcessorTest(unittest.TestCase): + def get_dummy_sample(self, input_type): + batch_size = 1 + num_frames = 5 + num_channels = 3 + height = 8 + width = 8 + + def generate_image(): + return PIL.Image.fromarray(np.random.randint(0, 256, size=(height, width, num_channels)).astype("uint8")) + + def generate_4d_array(): + return np.random.rand(num_frames, height, width, num_channels) + + def generate_5d_array(): + return np.random.rand(batch_size, num_frames, height, width, num_channels) + + def generate_4d_tensor(): + return torch.rand(num_frames, num_channels, height, width) + + def generate_5d_tensor(): + return torch.rand(batch_size, num_frames, num_channels, height, width) + + if input_type == "list_images": + sample = [generate_image() for _ in range(num_frames)] + elif input_type == "list_list_images": + sample = [[generate_image() for _ in range(num_frames)] for _ in range(num_frames)] + elif input_type == "list_4d_np": + sample = [generate_4d_array() for _ in range(num_frames)] + elif input_type == "list_list_4d_np": + sample = [[generate_4d_array() for _ in range(num_frames)] for _ in range(num_frames)] + elif input_type == "list_5d_np": + sample = [generate_5d_array() for _ in range(num_frames)] + elif input_type == "5d_np": + sample = generate_5d_array() + elif input_type == "list_4d_pt": + sample = [generate_4d_tensor() for _ in range(num_frames)] + elif input_type == "list_list_4d_pt": + sample = [[generate_4d_tensor() for _ in range(num_frames)] for _ in range(num_frames)] + elif input_type == "list_5d_pt": + sample = [generate_5d_tensor() for _ in range(num_frames)] + elif input_type == "5d_pt": + sample = generate_5d_tensor() + + return sample + + def to_np(self, video): + # List of images. + if isinstance(video[0], PIL.Image.Image): + video = np.stack([np.array(i) for i in video], axis=0) + + # List of list of images. 
+ elif isinstance(video, list) and isinstance(video[0][0], PIL.Image.Image): + frames = [] + for vid in video: + all_current_frames = np.stack([np.array(i) for i in vid], axis=0) + frames.append(all_current_frames) + video = np.stack([np.array(frame) for frame in frames], axis=0) + + # List of 4d/5d {ndarrays, torch tensors}. + elif isinstance(video, list) and isinstance(video[0], (torch.Tensor, np.ndarray)): + if isinstance(video[0], np.ndarray): + video = np.stack(video, axis=0) if video[0].ndim == 4 else np.concatenate(video, axis=0) + else: + if video[0].ndim == 4: + video = np.stack([i.cpu().numpy().transpose(0, 2, 3, 1) for i in video], axis=0) + elif video[0].ndim == 5: + video = np.concatenate([i.cpu().numpy().transpose(0, 1, 3, 4, 2) for i in video], axis=0) + + # List of list of 4d/5d {ndarrays, torch tensors}. + elif ( + isinstance(video, list) + and isinstance(video[0], list) + and isinstance(video[0][0], (torch.Tensor, np.ndarray)) + ): + all_frames = [] + for list_of_videos in video: + temp_frames = [] + for vid in list_of_videos: + if vid.ndim == 4: + current_vid_frames = np.stack( + [i if isinstance(i, np.ndarray) else i.cpu().numpy().transpose(1, 2, 0) for i in vid], + axis=0, + ) + elif vid.ndim == 5: + current_vid_frames = np.concatenate( + [i if isinstance(i, np.ndarray) else i.cpu().numpy().transpose(0, 2, 3, 1) for i in vid], + axis=0, + ) + temp_frames.append(current_vid_frames) + temp_frames = np.stack(temp_frames, axis=0) + all_frames.append(temp_frames) + + video = np.concatenate(all_frames, axis=0) + + # Just 5d {ndarrays, torch tensors}. + elif isinstance(video, (torch.Tensor, np.ndarray)) and video.ndim == 5: + video = video if isinstance(video, np.ndarray) else video.cpu().numpy().transpose(0, 1, 3, 4, 2) + + return video + + @parameterized.expand(["list_images", "list_list_images"]) + def test_video_processor_pil(self, input_type): + video_processor = VideoProcessor(do_resize=False, do_normalize=True) + + input = self.get_dummy_sample(input_type=input_type) + + for output_type in ["pt", "np", "pil"]: + out = video_processor.postprocess_video(video_processor.preprocess_video(input), output_type=output_type) + out_np = self.to_np(out) + input_np = self.to_np(input).astype("float32") / 255.0 if output_type != "pil" else self.to_np(input) + assert np.abs(input_np - out_np).max() < 1e-6, f"Decoded output does not match input for {output_type=}" + + @parameterized.expand(["list_4d_np", "list_5d_np", "5d_np"]) + def test_video_processor_np(self, input_type): + video_processor = VideoProcessor(do_resize=False, do_normalize=True) + + input = self.get_dummy_sample(input_type=input_type) + + for output_type in ["pt", "np", "pil"]: + out = video_processor.postprocess_video(video_processor.preprocess_video(input), output_type=output_type) + out_np = self.to_np(out) + input_np = ( + (self.to_np(input) * 255.0).round().astype("uint8") if output_type == "pil" else self.to_np(input) + ) + assert np.abs(input_np - out_np).max() < 1e-6, f"Decoded output does not match input for {output_type=}" + + @parameterized.expand(["list_4d_pt", "list_5d_pt", "5d_pt"]) + def test_video_processor_pt(self, input_type): + video_processor = VideoProcessor(do_resize=False, do_normalize=True) + + input = self.get_dummy_sample(input_type=input_type) + + for output_type in ["pt", "np", "pil"]: + out = video_processor.postprocess_video(video_processor.preprocess_video(input), output_type=output_type) + out_np = self.to_np(out) + input_np = ( + (self.to_np(input) * 255.0).round().astype("uint8") if 
output_type == "pil" else self.to_np(input) + ) + assert np.abs(input_np - out_np).max() < 1e-6, f"Decoded output does not match input for {output_type=}" diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 8429d7b2d5cf..7cbbe2a9ec90 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -38,7 +38,6 @@ get_python_version, load_image, load_numpy, - numpy_cosine_similarity_distance, require_python39_or_higher, require_torch_2, require_torch_gpu, @@ -1063,97 +1062,6 @@ def test_v11_shuffle_global_pool_conditions(self): expected_slice = np.array([0.1338, 0.1597, 0.1202, 0.1687, 0.1377, 0.1017, 0.2070, 0.1574, 0.1348]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - def test_load_local(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.unet.set_default_attn_processor() - pipe.enable_model_cpu_offload() - - controlnet = ControlNetModel.from_single_file( - "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" - ) - pipe_sf = StableDiffusionControlNetPipeline.from_single_file( - "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors", - safety_checker=None, - controlnet=controlnet, - scheduler_type="pndm", - ) - pipe_sf.unet.set_default_attn_processor() - pipe_sf.enable_model_cpu_offload() - - control_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ).resize((512, 512)) - prompt = "bird" - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe( - prompt, - image=control_image, - generator=generator, - output_type="np", - num_inference_steps=3, - ).images[0] - - generator = torch.Generator(device="cpu").manual_seed(0) - output_sf = pipe_sf( - prompt, - image=control_image, - generator=generator, - output_type="np", - num_inference_steps=3, - ).images[0] - - max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten()) - assert max_diff < 1e-3 - - def test_single_file_component_configs(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16") - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", variant="fp16", safety_checker=None, controlnet=controlnet - ) - - controlnet_single_file = ControlNetModel.from_single_file( - "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" - ) - single_file_pipe = StableDiffusionControlNetPipeline.from_single_file( - "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors", - safety_checker=None, - controlnet=controlnet_single_file, - scheduler_type="pndm", - ) - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "architectures", "_use_default_values"] - for param_name, param_value in single_file_pipe.controlnet.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - - # This parameter doesn't appear to be loaded from the config. 
- # So when it is registered to config, it remains a tuple as this is the default in the class definition - # from_pretrained, does load from config and converts to a list when registering to config - if param_name == "conditioning_embedding_out_channels" and isinstance(param_value, tuple): - param_value = list(param_value) - - assert ( - pipe.controlnet.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.unet.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.unet.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.vae.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.vae.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - @slow @require_torch_gpu diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index fe6a0126118d..0b7ae50a21d1 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -39,7 +39,6 @@ enable_full_determinism, floats_tensor, load_numpy, - numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device, @@ -441,56 +440,3 @@ def test_canny(self): ) assert np.abs(expected_image - image).max() < 9e-2 - - def test_load_local(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") - pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.unet.set_default_attn_processor() - pipe.enable_model_cpu_offload() - - controlnet = ControlNetModel.from_single_file( - "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" - ) - pipe_sf = StableDiffusionControlNetImg2ImgPipeline.from_single_file( - "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors", - safety_checker=None, - controlnet=controlnet, - scheduler_type="pndm", - ) - pipe_sf.unet.set_default_attn_processor() - pipe_sf.enable_model_cpu_offload() - - control_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ).resize((512, 512)) - image = load_image( - "https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png" - ).resize((512, 512)) - prompt = "bird" - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe( - prompt, - image=image, - control_image=control_image, - strength=0.9, - generator=generator, - output_type="np", - num_inference_steps=3, - ).images[0] - - generator = torch.Generator(device="cpu").manual_seed(0) - output_sf = pipe_sf( - prompt, - image=image, - control_image=control_image, - strength=0.9, - generator=generator, - output_type="np", - num_inference_steps=3, - ).images[0] - - max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten()) - assert max_diff < 1e-3 diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index 69379b0fa3c1..10a88bf2845d 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -556,55 
+556,3 @@ def make_inpaint_condition(image, image_mask): ) assert numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten()) < 1e-2 - - def test_load_local(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") - pipe_1 = StableDiffusionControlNetInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None, controlnet=controlnet - ) - - controlnet = ControlNetModel.from_single_file( - "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" - ) - pipe_2 = StableDiffusionControlNetInpaintPipeline.from_single_file( - "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt", - safety_checker=None, - controlnet=controlnet, - ) - control_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ).resize((512, 512)) - image = load_image( - "https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png" - ).resize((512, 512)) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ).resize((512, 512)) - - pipes = [pipe_1, pipe_2] - images = [] - for pipe in pipes: - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - output = pipe( - prompt, - image=image, - control_image=control_image, - mask_image=mask_image, - strength=0.9, - generator=generator, - output_type="np", - num_inference_steps=3, - ) - images.append(output.images[0]) - - del pipe - gc.collect() - torch.cuda.empty_cache() - - max_diff = numpy_cosine_similarity_distance(images[0].flatten(), images[1].flatten()) - assert max_diff < 1e-3 diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py index a7423bebd939..3341f6704e75 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py @@ -19,7 +19,15 @@ import numpy as np import torch from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) from diffusers import ( AutoencoderKL, @@ -34,6 +42,7 @@ from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_BATCH_PARAMS, + TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS, ) @@ -55,6 +64,14 @@ class ControlNetPipelineSDXLFastTests( batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = frozenset(IMAGE_TO_IMAGE_IMAGE_PARAMS.union({"mask_image", "control_image"})) image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union( + { + "add_text_embeds", + "add_time_ids", + "mask", + "masked_image_latents", + } + ) def get_dummy_components(self): torch.manual_seed(0) @@ -129,6 +146,30 @@ def get_dummy_components(self): text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + image_encoder_config = CLIPVisionConfig( + hidden_size=32, + image_size=224, + projection_dim=32, + intermediate_size=37, + 
num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=14, + ) + + image_encoder = CLIPVisionModelWithProjection(image_encoder_config) + + feature_extractor = CLIPImageProcessor( + crop_size=224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + resample=3, + size=224, + ) + components = { "unet": unet, "controlnet": controlnet, @@ -138,6 +179,8 @@ def get_dummy_components(self): "tokenizer": tokenizer, "text_encoder_2": text_encoder_2, "tokenizer_2": tokenizer_2, + "image_encoder": image_encoder, + "feature_extractor": feature_extractor, } return components diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index 41fb9a1b62fc..931e5cc025ca 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -37,7 +37,6 @@ from diffusers.utils.testing_utils import ( enable_full_determinism, load_image, - numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device, @@ -949,89 +948,6 @@ def test_depth(self): expected_image = np.array([0.4399, 0.5112, 0.5478, 0.4314, 0.472, 0.4823, 0.4647, 0.4957, 0.4853]) assert np.allclose(original_image, expected_image, atol=1e-04) - def test_download_ckpt_diff_format_is_same(self): - controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16) - single_file_url = ( - "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" - ) - pipe_single_file = StableDiffusionXLControlNetPipeline.from_single_file( - single_file_url, controlnet=controlnet, torch_dtype=torch.float16 - ) - pipe_single_file.unet.set_default_attn_processor() - pipe_single_file.enable_model_cpu_offload() - pipe_single_file.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "Stormtrooper's lecture" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" - ) - single_file_images = pipe_single_file( - prompt, image=image, generator=generator, output_type="np", num_inference_steps=2 - ).images - - generator = torch.Generator(device="cpu").manual_seed(0) - pipe = StableDiffusionXLControlNetPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16 - ) - pipe.unet.set_default_attn_processor() - pipe.enable_model_cpu_offload() - images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=2).images - - assert images[0].shape == (512, 512, 3) - assert single_file_images[0].shape == (512, 512, 3) - - max_diff = numpy_cosine_similarity_distance(images[0].flatten(), single_file_images[0].flatten()) - assert max_diff < 5e-2 - - def test_single_file_component_configs(self): - controlnet = ControlNetModel.from_pretrained( - "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16" - ) - pipe = StableDiffusionXLControlNetPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - variant="fp16", - controlnet=controlnet, - torch_dtype=torch.float16, - ) - - single_file_url = ( - "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" - ) - single_file_pipe = StableDiffusionXLControlNetPipeline.from_single_file( - 
single_file_url, controlnet=controlnet, torch_dtype=torch.float16 - ) - - for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items(): - if param_name in ["torch_dtype", "architectures", "_name_or_path"]: - continue - assert pipe.text_encoder.config.to_dict()[param_name] == param_value - - for param_name, param_value in single_file_pipe.text_encoder_2.config.to_dict().items(): - if param_name in ["torch_dtype", "architectures", "_name_or_path"]: - continue - assert pipe.text_encoder_2.config.to_dict()[param_name] == param_value - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "architectures", "_use_default_values"] - for param_name, param_value in single_file_pipe.unet.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - - # Upcast attention might be set to None in a config file, which is incorrect. It should default to False in the model - if param_name == "upcast_attention" and pipe.unet.config[param_name] is None: - pipe.unet.config[param_name] = False - - assert ( - pipe.unet.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.vae.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.vae.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - class StableDiffusionSSD1BControlNetPipelineFastTests(StableDiffusionXLControlNetPipelineFastTests): def test_controlnet_sdxl_guess(self): diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py b/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py index 61ff675856ae..766eea4c0c32 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py @@ -34,6 +34,7 @@ IMAGE_TO_IMAGE_IMAGE_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS, + TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS, ) from ..test_pipelines_common import ( IPAdapterTesterMixin, @@ -55,9 +56,13 @@ class ControlNetPipelineSDXLImg2ImgFastTests( ): pipeline_class = StableDiffusionXLControlNetImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union( + {"add_text_embeds", "add_time_ids", "add_neg_time_ids"} + ) def get_dummy_components(self, skip_first_text_encoder=False): torch.manual_seed(0) diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py index 0273e972a620..426e25812295 100644 --- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py +++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py @@ -243,7 +243,6 @@ def tearDown(self): def test_i2vgen_xl(self): pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") - pipe = pipe.to(torch_device) pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) image = load_image( diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index 8c95fbc7033d..bf74b2f0600c 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ 
b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -612,10 +612,10 @@ def test_ip_adapter_multiple_masks(self): def test_instant_style_multiple_masks(self): image_encoder = CLIPVisionModelWithProjection.from_pretrained( "h94/IP-Adapter", subfolder="models/image_encoder", torch_dtype=torch.float16 - ).to("cuda") + ) pipeline = StableDiffusionXLPipeline.from_pretrained( "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, image_encoder=image_encoder, variant="fp16" - ).to("cuda") + ) pipeline.enable_model_cpu_offload() pipeline.load_ip_adapter( diff --git a/tests/pipelines/pixart_sigma/test_pixart.py b/tests/pipelines/pixart_sigma/test_pixart.py index c0df15ae661d..58833d15fe9a 100644 --- a/tests/pipelines/pixart_sigma/test_pixart.py +++ b/tests/pipelines/pixart_sigma/test_pixart.py @@ -336,7 +336,7 @@ def test_pixart_1024(self): image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.0742, 0.0835, 0.2114, 0.0295, 0.0784, 0.2361, 0.1738, 0.2251, 0.3589]) + expected_slice = np.array([0.4517, 0.4446, 0.4375, 0.449, 0.4399, 0.4365, 0.4583, 0.4629, 0.4473]) max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice) self.assertLessEqual(max_diff, 1e-4) @@ -344,7 +344,12 @@ def test_pixart_1024(self): def test_pixart_512(self): generator = torch.Generator("cpu").manual_seed(0) - pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16) + transformer = Transformer2DModel.from_pretrained( + self.ckpt_id_512, subfolder="transformer", torch_dtype=torch.float16 + ) + pipe = PixArtSigmaPipeline.from_pretrained( + self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16 + ) pipe.enable_model_cpu_offload() prompt = self.prompt @@ -352,7 +357,7 @@ def test_pixart_512(self): image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.3477, 0.3882, 0.4541, 0.3413, 0.3821, 0.4463, 0.4001, 0.4409, 0.4958]) + expected_slice = np.array([0.0479, 0.0378, 0.0217, 0.0942, 0.064, 0.0791, 0.2073, 0.1975, 0.2017]) max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice) self.assertLessEqual(max_diff, 1e-4) @@ -394,7 +399,12 @@ def test_pixart_1024_without_resolution_binning(self): def test_pixart_512_without_resolution_binning(self): generator = torch.manual_seed(0) - pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16) + transformer = Transformer2DModel.from_pretrained( + self.ckpt_id_512, subfolder="transformer", torch_dtype=torch.float16 + ) + pipe = PixArtSigmaPipeline.from_pretrained( + self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16 + ) pipe.enable_model_cpu_offload() prompt = self.prompt diff --git a/tests/pipelines/stable_cascade/test_stable_cascade_combined.py b/tests/pipelines/stable_cascade/test_stable_cascade_combined.py index 4a9a123fcda1..d256deed376c 100644 --- a/tests/pipelines/stable_cascade/test_stable_cascade_combined.py +++ b/tests/pipelines/stable_cascade/test_stable_cascade_combined.py @@ -153,6 +153,8 @@ def get_dummy_components(self): "prior_tokenizer": prior_tokenizer, "prior_prior": prior, "prior_scheduler": scheduler, + "prior_feature_extractor": None, + "prior_image_encoder": None, } return components diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py 
b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 137f1e696d93..5cb22bd3c813 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -42,7 +42,6 @@ UNet2DConditionModel, logging, ) -from diffusers.models.attention_processor import AttnProcessor from diffusers.utils.testing_utils import ( CaptureLogger, enable_full_determinism, @@ -260,6 +259,44 @@ def test_stable_diffusion_lcm_custom_timesteps(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_ays(self): + from diffusers.schedulers import AysSchedules + + timestep_schedule = AysSchedules["StableDiffusionTimesteps"] + sigma_schedule = AysSchedules["StableDiffusionSigmas"] + + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components(time_cond_proj_dim=256) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["num_inference_steps"] = 10 + output = sd_pipe(**inputs).images + + inputs = self.get_dummy_inputs(device) + inputs["num_inference_steps"] = None + inputs["timesteps"] = timestep_schedule + output_ts = sd_pipe(**inputs).images + + inputs = self.get_dummy_inputs(device) + inputs["num_inference_steps"] = None + inputs["sigmas"] = sigma_schedule + output_sigmas = sd_pipe(**inputs).images + + assert ( + np.abs(output_sigmas.flatten() - output_ts.flatten()).max() < 1e-3 + ), "ays timesteps and ays sigmas should have the same outputs" + assert ( + np.abs(output.flatten() - output_ts.flatten()).max() > 1e-3 + ), "use ays timesteps should have different outputs" + assert ( + np.abs(output.flatten() - output_sigmas.flatten()).max() > 1e-3 + ), "use ays sigmas should have different outputs" + def test_stable_diffusion_prompt_embeds(self): components = self.get_dummy_components() sd_pipe = StableDiffusionPipeline(**components) @@ -1004,7 +1041,7 @@ def test_stable_diffusion_fp16_vs_autocast(self): def test_stable_diffusion_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -1284,62 +1321,6 @@ def test_download_local(self): assert image_out.shape == (512, 512, 3) - def test_download_ckpt_diff_format_is_same(self): - ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" - - sf_pipe = StableDiffusionPipeline.from_single_file(ckpt_path) - sf_pipe.scheduler = DDIMScheduler.from_config(sf_pipe.scheduler.config) - sf_pipe.unet.set_attn_processor(AttnProcessor()) - sf_pipe.to("cuda") - - generator = torch.Generator(device="cpu").manual_seed(0) - image_single_file = sf_pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0] - - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.unet.set_attn_processor(AttnProcessor()) - pipe.to("cuda") - - generator = torch.Generator(device="cpu").manual_seed(0) - image = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0] - - max_diff = 
numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten()) - - assert max_diff < 1e-3 - - def test_single_file_component_configs(self): - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - - ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" - single_file_pipe = StableDiffusionPipeline.from_single_file(ckpt_path, load_safety_checker=True) - - for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items(): - if param_name in ["torch_dtype", "architectures", "_name_or_path"]: - continue - assert pipe.text_encoder.config.to_dict()[param_name] == param_value - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "architectures", "_use_default_values"] - for param_name, param_value in single_file_pipe.unet.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.unet.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.vae.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.vae.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.safety_checker.config.to_dict().items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.safety_checker.config.to_dict()[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - @nightly @require_torch_gpu diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 63534118343a..d691565248bc 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -472,7 +472,7 @@ def test_stable_diffusion_img2img_ddim(self): def test_stable_diffusion_img2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index cc2936f28b96..5e22a871bf79 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -36,7 +36,6 @@ StableDiffusionInpaintPipeline, UNet2DConditionModel, ) -from diffusers.models.attention_processor import AttnProcessor from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image from diffusers.utils.testing_utils import ( enable_full_determinism, @@ -44,7 +43,6 @@ load_image, load_numpy, nightly, - numpy_cosine_similarity_distance, require_python39_or_higher, require_torch_2, require_torch_gpu, @@ -832,77 +830,6 @@ def test_stable_diffusion_simple_inpaint_ddim(self): expected_slice = np.array([0.3757, 0.3875, 0.4445, 0.4353, 0.3780, 0.4513, 0.3965, 0.3984, 0.4362]) assert np.abs(expected_slice - image_slice).max() < 1e-3 - def test_download_local(self): - filename = hf_hub_download("runwayml/stable-diffusion-inpainting", filename="sd-v1-5-inpainting.ckpt") - - pipe = 
StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to("cuda") - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 1 - image_out = pipe(**inputs).images[0] - - assert image_out.shape == (512, 512, 3) - - def test_download_ckpt_diff_format_is_same(self): - ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt" - - pipe = StableDiffusionInpaintPipeline.from_single_file(ckpt_path) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.unet.set_attn_processor(AttnProcessor()) - pipe.to("cuda") - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 5 - image_ckpt = pipe(**inputs).images[0] - - pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.unet.set_attn_processor(AttnProcessor()) - pipe.to("cuda") - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 5 - image = pipe(**inputs).images[0] - - max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten()) - - assert max_diff < 1e-4 - - def test_single_file_component_configs(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting", variant="fp16") - - ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt" - single_file_pipe = StableDiffusionInpaintPipeline.from_single_file(ckpt_path, load_safety_checker=True) - - for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items(): - if param_name in ["torch_dtype", "architectures", "_name_or_path"]: - continue - assert pipe.text_encoder.config.to_dict()[param_name] == param_value - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "architectures", "_use_default_values"] - for param_name, param_value in single_file_pipe.unet.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.unet.config[param_name] == param_value - ), f"{param_name} is differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.vae.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.vae.config[param_name] == param_value - ), f"{param_name} is differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.safety_checker.config.to_dict().items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.safety_checker.config.to_dict()[param_name] == param_value - ), f"{param_name} is differs between single file loading and pretrained loading" - @slow @require_torch_gpu @@ -1128,9 +1055,6 @@ def test_download_local(self): assert image_out.shape == (512, 512, 3) - def test_download_ckpt_diff_format_is_same(self): - pass - @nightly @require_torch_gpu diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index a85ea9c2605d..5da8669215b2 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -353,7 +353,7 @@ def test_stable_diffusion_pix2pix_ddim(self): def test_stable_diffusion_pix2pix_intermediate_state(self): number_of_steps = 0 - 
def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 63e1cb30e203..494036482a77 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -416,7 +416,7 @@ def test_stable_diffusion_attention_slicing(self): def test_stable_diffusion_text2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 97dc88cd3d26..c260d565a829 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -461,7 +461,7 @@ def test_stable_diffusion_depth2img_pipeline_ddim(self): def test_stable_diffusion_depth2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py index 9118c60181fd..f63aaeefd0b9 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py @@ -29,7 +29,6 @@ floats_tensor, load_image, load_numpy, - numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device, @@ -492,73 +491,3 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): mem_bytes = torch.cuda.max_memory_allocated() # make sure that less than 2.9 GB is allocated assert mem_bytes < 2.9 * 10**9 - - def test_download_ckpt_diff_format_is_same(self): - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-upscale/low_res_cat.png" - ) - - prompt = "a cat sitting on a park bench" - model_id = "stabilityai/stable-diffusion-x4-upscaler" - pipe = StableDiffusionUpscalePipeline.from_pretrained(model_id) - pipe.enable_model_cpu_offload() - - generator = torch.Generator("cpu").manual_seed(0) - output = pipe(prompt=prompt, image=image, generator=generator, output_type="np", num_inference_steps=3) - image_from_pretrained = output.images[0] - - single_file_path = ( - "https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/blob/main/x4-upscaler-ema.safetensors" - ) - pipe_from_single_file = StableDiffusionUpscalePipeline.from_single_file(single_file_path) - pipe_from_single_file.enable_model_cpu_offload() - - generator = torch.Generator("cpu").manual_seed(0) - output_from_single_file = pipe_from_single_file( - prompt=prompt, image=image, generator=generator, output_type="np", num_inference_steps=3 - ) - image_from_single_file = output_from_single_file.images[0] - - assert image_from_pretrained.shape == (512, 512, 3) - 
assert image_from_single_file.shape == (512, 512, 3) - assert ( - numpy_cosine_similarity_distance(image_from_pretrained.flatten(), image_from_single_file.flatten()) < 1e-3 - ) - - def test_single_file_component_configs(self): - pipe = StableDiffusionUpscalePipeline.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler", variant="fp16" - ) - - ckpt_path = ( - "https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/blob/main/x4-upscaler-ema.safetensors" - ) - single_file_pipe = StableDiffusionUpscalePipeline.from_single_file(ckpt_path, load_safety_checker=True) - - for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items(): - if param_name in ["torch_dtype", "architectures", "_name_or_path"]: - continue - assert pipe.text_encoder.config.to_dict()[param_name] == param_value - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "architectures", "_use_default_values"] - for param_name, param_value in single_file_pipe.unet.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.unet.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.vae.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.vae.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.safety_checker.config.to_dict().items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.safety_checker.config.to_dict()[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index 97b69043e11e..923fba1272ce 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -30,7 +30,6 @@ StableDiffusionPipeline, UNet2DConditionModel, ) -from diffusers.models.attention_processor import AttnProcessor from diffusers.utils.testing_utils import ( enable_full_determinism, load_numpy, @@ -421,7 +420,6 @@ def test_stable_diffusion_text2img_pipeline_unflawed(self): pipe.scheduler = DDIMScheduler.from_config( pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True ) - pipe.to(torch_device) pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) @@ -473,34 +471,10 @@ def test_download_local(self): assert image_out.shape == (768, 768, 3) - def test_download_ckpt_diff_format_is_same(self): - single_file_path = ( - "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors" - ) - - pipe_single = StableDiffusionPipeline.from_single_file(single_file_path) - pipe_single.scheduler = DDIMScheduler.from_config(pipe_single.scheduler.config) - pipe_single.unet.set_attn_processor(AttnProcessor()) - pipe_single.enable_model_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - image_ckpt = pipe_single("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0] - - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1") - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.unet.set_attn_processor(AttnProcessor()) - pipe.enable_model_cpu_offload() - - generator = 
torch.Generator(device="cpu").manual_seed(0) - image = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0] - - max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten()) - assert max_diff < 1e-3 - def test_stable_diffusion_text2img_intermediate_state_v_pred(self): number_of_steps = 0 - def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def test_callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: test_callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py index 99cd8b2e7d1b..6ddb562aacf2 100644 --- a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py @@ -213,7 +213,7 @@ def test_stable_diffusion_img_variation_pipeline_default(self): def test_stable_diffusion_img_variation_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py index 35a654542023..4e36dab5ac6c 100644 --- a/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py @@ -349,7 +349,7 @@ def test_stable_diffusion_panorama_k_lms(self): def test_stable_diffusion_panorama_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py index 95d6dd69ca9e..50f3eb99f016 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py @@ -214,6 +214,44 @@ def test_stable_diffusion_xl_euler_lcm_custom_timesteps(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_ays(self): + from diffusers.schedulers import AysSchedules + + timestep_schedule = AysSchedules["StableDiffusionXLTimesteps"] + sigma_schedule = AysSchedules["StableDiffusionXLSigmas"] + + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components(time_cond_proj_dim=256) + sd_pipe = StableDiffusionXLPipeline(**components) + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["num_inference_steps"] = 10 + output = sd_pipe(**inputs).images + + inputs = self.get_dummy_inputs(device) + inputs["num_inference_steps"] = None + inputs["timesteps"] = timestep_schedule + output_ts = sd_pipe(**inputs).images + + inputs = 
self.get_dummy_inputs(device) + inputs["num_inference_steps"] = None + inputs["sigmas"] = sigma_schedule + output_sigmas = sd_pipe(**inputs).images + + assert ( + np.abs(output_sigmas.flatten() - output_ts.flatten()).max() < 1e-3 + ), "ays timesteps and ays sigmas should have the same outputs" + assert ( + np.abs(output.flatten() - output_ts.flatten()).max() > 1e-3 + ), "use ays timesteps should have different outputs" + assert ( + np.abs(output.flatten() - output_sigmas.flatten()).max() > 1e-3 + ), "use ays sigmas should have different outputs" + def test_stable_diffusion_xl_prompt_embeds(self): components = self.get_dummy_components() sd_pipe = StableDiffusionXLPipeline(**components) @@ -1046,68 +1084,3 @@ def test_stable_diffusion_lcm(self): max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten()) assert max_diff < 1e-2 - - def test_download_ckpt_diff_format_is_same(self): - ckpt_path = ( - "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" - ) - - pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.unet.set_default_attn_processor() - pipe.enable_model_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - image_ckpt = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0] - - pipe = StableDiffusionXLPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.unet.set_default_attn_processor() - pipe.enable_model_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - image = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0] - - max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten()) - - assert max_diff < 6e-3 - - def test_single_file_component_configs(self): - pipe = StableDiffusionXLPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ) - ckpt_path = ( - "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" - ) - single_file_pipe = StableDiffusionXLPipeline.from_single_file( - ckpt_path, variant="fp16", torch_dtype=torch.float16 - ) - - for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items(): - if param_name in ["torch_dtype", "architectures", "_name_or_path"]: - continue - assert pipe.text_encoder.config.to_dict()[param_name] == param_value - - for param_name, param_value in single_file_pipe.text_encoder_2.config.to_dict().items(): - if param_name in ["torch_dtype", "architectures", "_name_or_path"]: - continue - assert pipe.text_encoder_2.config.to_dict()[param_name] == param_value - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "architectures", "_use_default_values"] - for param_name, param_value in single_file_pipe.unet.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - if param_name == "upcast_attention" and pipe.unet.config[param_name] is None: - pipe.unet.config[param_name] = False - assert ( - pipe.unet.config[param_name] == param_value - ), f"{param_name} is differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.vae.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.vae.config[param_name] == 
param_value - ), f"{param_name} is differs between single file loading and pretrained loading" diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py index 08bd2a0444cd..416703d23180 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import gc import random import unittest @@ -32,13 +31,10 @@ T2IAdapter, UNet2DConditionModel, ) -from diffusers.utils import load_image, logging +from diffusers.utils import logging from diffusers.utils.testing_utils import ( enable_full_determinism, floats_tensor, - numpy_cosine_similarity_distance, - require_torch_gpu, - slow, torch_device, ) @@ -678,54 +674,3 @@ def test_adapter_sdxl_lcm_custom_timesteps(self): print(",".join(debug)) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch_gpu -class AdapterSDXLPipelineSlowTests(unittest.TestCase): - def setUp(self): - super().setUp() - gc.collect() - torch.cuda.empty_cache() - - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_download_ckpt_diff_format_is_same(self): - ckpt_path = ( - "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" - ) - adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16) - prompt = "toy" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/toy_canny.png" - ) - pipe_single_file = StableDiffusionXLAdapterPipeline.from_single_file( - ckpt_path, - adapter=adapter, - torch_dtype=torch.float16, - ) - pipe_single_file.enable_model_cpu_offload() - pipe_single_file.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - images_single_file = pipe_single_file( - prompt, image=image, generator=generator, output_type="np", num_inference_steps=3 - ).images - - generator = torch.Generator(device="cpu").manual_seed(0) - pipe = StableDiffusionXLAdapterPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - adapter=adapter, - torch_dtype=torch.float16, - ) - pipe.enable_model_cpu_offload() - images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images - - assert images_single_file[0].shape == (768, 512, 3) - assert images[0].shape == (768, 512, 3) - - max_diff = numpy_cosine_similarity_distance(images[0].flatten(), images_single_file[0].flatten()) - assert max_diff < 5e-3 diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py index 19cab2b32a35..cb338e5e56aa 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc import random import unittest @@ -32,19 +31,15 @@ from diffusers import ( AutoencoderKL, AutoencoderTiny, - DDIMScheduler, EulerDiscreteScheduler, LCMScheduler, StableDiffusionXLImg2ImgPipeline, UNet2DConditionModel, ) -from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, floats_tensor, - numpy_cosine_similarity_distance, require_torch_gpu, - slow, torch_device, ) @@ -781,85 +776,3 @@ def test_inference_batch_single_identical(self): def test_save_load_optional_components(self): self._test_save_load_optional_components() - - -@slow -class StableDiffusionXLImg2ImgIntegrationTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - torch.cuda.empty_cache() - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_download_ckpt_diff_format_is_same(self): - ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors" - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/sketch-mountains-input.png" - ) - - pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.unet.set_default_attn_processor() - pipe.enable_model_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - image = pipe( - prompt="mountains", image=init_image, num_inference_steps=5, generator=generator, output_type="np" - ).images[0] - - pipe_single_file = StableDiffusionXLImg2ImgPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16) - pipe_single_file.scheduler = DDIMScheduler.from_config(pipe_single_file.scheduler.config) - pipe_single_file.unet.set_default_attn_processor() - pipe_single_file.enable_model_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - image_single_file = pipe_single_file( - prompt="mountains", image=init_image, num_inference_steps=5, generator=generator, output_type="np" - ).images[0] - - max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten()) - - assert max_diff < 5e-2 - - def test_single_file_component_configs(self): - pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-refiner-1.0", - torch_dtype=torch.float16, - variant="fp16", - ) - ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors" - single_file_pipe = StableDiffusionXLImg2ImgPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16) - - assert pipe.text_encoder is None - assert single_file_pipe.text_encoder is None - - for param_name, param_value in single_file_pipe.text_encoder_2.config.to_dict().items(): - if param_name in ["torch_dtype", "architectures", "_name_or_path"]: - continue - assert pipe.text_encoder_2.config.to_dict()[param_name] == param_value - - PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "architectures", "_use_default_values"] - for param_name, param_value in single_file_pipe.unet.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - if param_name == "upcast_attention" and pipe.unet.config[param_name] is None: - pipe.unet.config[param_name] = False - assert ( - pipe.unet.config[param_name] == param_value - ), f"{param_name} is 
differs between single file loading and pretrained loading" - - for param_name, param_value in single_file_pipe.vae.config.items(): - if param_name in PARAMS_TO_IGNORE: - continue - assert ( - pipe.vae.config[param_name] == param_value - ), f"{param_name} differs between single file loading and pretrained loading" diff --git a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py index 199ed57bc27b..60fc21e2027b 100644 --- a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py +++ b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py @@ -534,7 +534,6 @@ def test_sd_video(self): variant="fp16", torch_dtype=torch.float16, ) - pipe = pipe.to(torch_device) pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) image = load_image( diff --git a/tests/schedulers/test_scheduler_dpm_multi.py b/tests/schedulers/test_scheduler_dpm_multi.py index fcf8881bc820..ef407eaa3dc9 100644 --- a/tests/schedulers/test_scheduler_dpm_multi.py +++ b/tests/schedulers/test_scheduler_dpm_multi.py @@ -111,9 +111,33 @@ def full_loop(self, scheduler=None, **config): sample = self.dummy_sample_deter scheduler.set_timesteps(num_inference_steps) + generator = torch.manual_seed(0) + for i, t in enumerate(scheduler.timesteps): residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample + sample = scheduler.step(residual, t, sample, generator=generator).prev_sample + + return sample + + def full_loop_custom_timesteps(self, **config): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config(**config) + scheduler = scheduler_class(**scheduler_config) + + num_inference_steps = 10 + scheduler.set_timesteps(num_inference_steps) + timesteps = scheduler.timesteps + # reset the timesteps using `timesteps` + scheduler = scheduler_class(**scheduler_config) + scheduler.set_timesteps(num_inference_steps=None, timesteps=timesteps) + + generator = torch.manual_seed(0) + model = self.dummy_model() + sample = self.dummy_sample_deter + + for i, t in enumerate(scheduler.timesteps): + residual = model(sample, t) + sample = scheduler.step(residual, t, sample, generator=generator).prev_sample return sample @@ -309,10 +333,28 @@ def test_fp16_support(self): assert sample.dtype == torch.float16 - def test_duplicated_timesteps(self, **config): + def test_duplicated_timesteps(self): for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) + scheduler_config = self.get_scheduler_config() scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(scheduler.config.num_train_timesteps) assert len(scheduler.timesteps) == scheduler.num_inference_steps + + def test_custom_timesteps(self): + for algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]: + for prediction_type in ["epsilon", "sample", "v_prediction"]: + for final_sigmas_type in ["sigma_min", "zero"]: + sample = self.full_loop( + algorithm_type=algorithm_type, + prediction_type=prediction_type, + final_sigmas_type=final_sigmas_type, + ) + sample_custom_timesteps = self.full_loop_custom_timesteps( + algorithm_type=algorithm_type, + prediction_type=prediction_type, + final_sigmas_type=final_sigmas_type, + ) + assert ( + torch.sum(torch.abs(sample - sample_custom_timesteps)) < 1e-5 + ), f"Scheduler outputs are not identical for algorithm_type: {algorithm_type}, prediction_type: {prediction_type} and final_sigmas_type: {final_sigmas_type}" 
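The test_custom_timesteps additions above exercise the new ability of DPMSolverMultistepScheduler.set_timesteps to take an explicit timestep schedule in place of num_inference_steps. A minimal sketch of that call pattern follows, assuming a default-configured scheduler; it mirrors what full_loop_custom_timesteps does in the test and is illustrative only, not part of this diff:

from diffusers import DPMSolverMultistepScheduler

# Derive a reference schedule the usual way first.
scheduler = DPMSolverMultistepScheduler()
scheduler.set_timesteps(num_inference_steps=10)
custom_timesteps = scheduler.timesteps

# Re-create the scheduler and hand the same schedule back explicitly,
# as the new test does with `num_inference_steps=None, timesteps=...`.
scheduler = DPMSolverMultistepScheduler()
scheduler.set_timesteps(num_inference_steps=None, timesteps=custom_timesteps)
# scheduler.timesteps now follows the supplied schedule and can drive a denoising loop.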
diff --git a/tests/schedulers/test_scheduler_dpm_single.py b/tests/schedulers/test_scheduler_dpm_single.py index 251a1506f39b..ea43c210d650 100644 --- a/tests/schedulers/test_scheduler_dpm_single.py +++ b/tests/schedulers/test_scheduler_dpm_single.py @@ -103,14 +103,31 @@ def full_loop(self, scheduler=None, **config): scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) + num_inference_steps = 10 + model = self.dummy_model() + sample = self.dummy_sample_deter + scheduler.set_timesteps(num_inference_steps) + + for i, t in enumerate(scheduler.timesteps): + residual = model(sample, t) + sample = scheduler.step(residual, t, sample).prev_sample + + return sample + + def full_loop_custom_timesteps(self, **config): scheduler_class = self.scheduler_classes[0] scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) num_inference_steps = 10 + scheduler.set_timesteps(num_inference_steps) + timesteps = scheduler.timesteps + # reset the timesteps using`timesteps` + scheduler = scheduler_class(**scheduler_config) + scheduler.set_timesteps(num_inference_steps=None, timesteps=timesteps) + model = self.dummy_model() sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) for i, t in enumerate(scheduler.timesteps): residual = model(sample, t) @@ -307,3 +324,21 @@ def test_full_loop_with_noise(self): assert abs(result_sum.item() - 269.2187) < 1e-2, f" expected result sum 269.2187, but get {result_sum}" assert abs(result_mean.item() - 0.3505) < 1e-3, f" expected result mean 0.3505, but get {result_mean}" + + def test_custom_timesteps(self): + for prediction_type in ["epsilon", "sample", "v_prediction"]: + for lower_order_final in [True, False]: + for final_sigmas_type in ["sigma_min", "zero"]: + sample = self.full_loop( + prediction_type=prediction_type, + lower_order_final=lower_order_final, + final_sigmas_type=final_sigmas_type, + ) + sample_custom_timesteps = self.full_loop_custom_timesteps( + prediction_type=prediction_type, + lower_order_final=lower_order_final, + final_sigmas_type=final_sigmas_type, + ) + assert ( + torch.sum(torch.abs(sample - sample_custom_timesteps)) < 1e-5 + ), f"Scheduler outputs are not identical for prediction_type: {prediction_type}, lower_order_final: {lower_order_final} and final_sigmas_type: {final_sigmas_type}" diff --git a/tests/schedulers/test_scheduler_euler.py b/tests/schedulers/test_scheduler_euler.py index 41c418c5064c..fbb49b164165 100644 --- a/tests/schedulers/test_scheduler_euler.py +++ b/tests/schedulers/test_scheduler_euler.py @@ -49,12 +49,13 @@ def test_rescale_betas_zero_snr(self): for rescale_betas_zero_snr in [True, False]: self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr) - def test_full_loop_no_noise(self): + def full_loop(self, **config): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() + scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(self.num_inference_steps) + num_inference_steps = self.num_inference_steps + scheduler.set_timesteps(num_inference_steps) generator = torch.manual_seed(0) @@ -69,19 +70,46 @@ def test_full_loop_no_noise(self): output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample + return sample - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) + def full_loop_custom_timesteps(self, **config): + 
scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config(**config) + scheduler = scheduler_class(**scheduler_config) - assert abs(result_sum.item() - 10.0807) < 1e-2 - assert abs(result_mean.item() - 0.0131) < 1e-3 + num_inference_steps = self.num_inference_steps + scheduler.set_timesteps(num_inference_steps) + timesteps = scheduler.timesteps + # reset the timesteps using `timesteps` + scheduler = scheduler_class(**scheduler_config) + scheduler.set_timesteps(num_inference_steps=None, timesteps=timesteps) - def test_full_loop_with_v_prediction(self): + generator = torch.manual_seed(0) + + model = self.dummy_model() + sample = self.dummy_sample_deter * scheduler.init_noise_sigma + sample = sample.to(torch_device) + + for i, t in enumerate(scheduler.timesteps): + sample = scheduler.scale_model_input(sample, t) + + model_output = model(sample, t) + + output = scheduler.step(model_output, t, sample, generator=generator) + sample = output.prev_sample + return sample + + def full_loop_custom_sigmas(self, **config): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(self.num_inference_steps) + num_inference_steps = self.num_inference_steps + scheduler.set_timesteps(num_inference_steps) + sigmas = scheduler.sigmas + # reset the timesteps using `sigmas` + scheduler = scheduler_class(**scheduler_config) + scheduler.set_timesteps(num_inference_steps=None, sigmas=sigmas) generator = torch.manual_seed(0) @@ -96,6 +124,19 @@ def test_full_loop_with_v_prediction(self): output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample + return sample + + def test_full_loop_no_noise(self): + sample = self.full_loop() + + result_sum = torch.sum(torch.abs(sample)) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_sum.item() - 10.0807) < 1e-2 + assert abs(result_mean.item() - 0.0131) < 1e-3 + + def test_full_loop_with_v_prediction(self): + sample = self.full_loop(prediction_type="v_prediction") result_sum = torch.sum(torch.abs(sample)) result_mean = torch.mean(torch.abs(sample)) @@ -189,3 +230,36 @@ def test_full_loop_with_noise(self): assert abs(result_sum.item() - 57062.9297) < 1e-2, f" expected result sum 57062.9297, but get {result_sum}" assert abs(result_mean.item() - 74.3007) < 1e-3, f" expected result mean 74.3007, but get {result_mean}" + + def test_custom_timesteps(self): + for prediction_type in ["epsilon", "sample", "v_prediction"]: + for interpolation_type in ["linear", "log_linear"]: + for final_sigmas_type in ["sigma_min", "zero"]: + sample = self.full_loop( + prediction_type=prediction_type, + interpolation_type=interpolation_type, + final_sigmas_type=final_sigmas_type, + ) + sample_custom_timesteps = self.full_loop_custom_timesteps( + prediction_type=prediction_type, + interpolation_type=interpolation_type, + final_sigmas_type=final_sigmas_type, + ) + assert ( + torch.sum(torch.abs(sample - sample_custom_timesteps)) < 1e-5 + ), f"Scheduler outputs are not identical for prediction_type: {prediction_type}, interpolation_type: {interpolation_type} and final_sigmas_type: {final_sigmas_type}" + + def test_custom_sigmas(self): + for prediction_type in ["epsilon", "sample", "v_prediction"]: + for final_sigmas_type in ["sigma_min", "zero"]: + sample = self.full_loop( + prediction_type=prediction_type, + 
final_sigmas_type=final_sigmas_type, + ) + sample_custom_timesteps = self.full_loop_custom_sigmas( + prediction_type=prediction_type, + final_sigmas_type=final_sigmas_type, + ) + assert ( + torch.sum(torch.abs(sample - sample_custom_timesteps)) < 1e-5 + ), f"Scheduler outputs are not identical for prediction_type: {prediction_type} and final_sigmas_type: {final_sigmas_type}" diff --git a/tests/schedulers/test_scheduler_heun.py b/tests/schedulers/test_scheduler_heun.py index df2b62d97191..a3689ef2ea63 100644 --- a/tests/schedulers/test_scheduler_heun.py +++ b/tests/schedulers/test_scheduler_heun.py @@ -41,12 +41,13 @@ def test_prediction_type(self): for prediction_type in ["epsilon", "v_prediction", "sample"]: self.check_over_configs(prediction_type=prediction_type) - def test_full_loop_no_noise(self): + def full_loop(self, **config): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() + scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(self.num_inference_steps) + num_inference_steps = self.num_inference_steps + scheduler.set_timesteps(num_inference_steps) model = self.dummy_model() sample = self.dummy_sample_deter * scheduler.init_noise_sigma @@ -60,23 +61,20 @@ def test_full_loop_no_noise(self): output = scheduler.step(model_output, t, sample) sample = output.prev_sample - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if torch_device in ["cpu", "mps"]: - assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - else: - # CUDA - assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 + return sample - def test_full_loop_with_v_prediction(self): + def full_loop_custom_timesteps(self, **config): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(self.num_inference_steps) + num_inference_steps = self.num_inference_steps + scheduler.set_timesteps(num_inference_steps) + timesteps = scheduler.timesteps + timesteps = torch.cat([timesteps[:1], timesteps[1::2]]) + # reset the timesteps using `timesteps` + scheduler = scheduler_class(**scheduler_config) + scheduler.set_timesteps(num_inference_steps=None, timesteps=timesteps) model = self.dummy_model() sample = self.dummy_sample_deter * scheduler.init_noise_sigma @@ -90,6 +88,23 @@ def test_full_loop_with_v_prediction(self): output = scheduler.step(model_output, t, sample) sample = output.prev_sample + return sample + + def test_full_loop_no_noise(self): + sample = self.full_loop() + result_sum = torch.sum(torch.abs(sample)) + result_mean = torch.mean(torch.abs(sample)) + + if torch_device in ["cpu", "mps"]: + assert abs(result_sum.item() - 0.1233) < 1e-2 + assert abs(result_mean.item() - 0.0002) < 1e-3 + else: + # CUDA + assert abs(result_sum.item() - 0.1233) < 1e-2 + assert abs(result_mean.item() - 0.0002) < 1e-3 + + def test_full_loop_with_v_prediction(self): + sample = self.full_loop(prediction_type="v_prediction") result_sum = torch.sum(torch.abs(sample)) result_mean = torch.mean(torch.abs(sample)) @@ -189,3 +204,18 @@ def test_full_loop_with_noise(self): assert abs(result_sum.item() - 75074.8906) < 1e-2, f" expected result sum 75074.8906, but get {result_sum}" assert abs(result_mean.item() - 97.7538) < 
1e-3, f" expected result mean 97.7538, but get {result_mean}" + + def test_custom_timesteps(self): + for prediction_type in ["epsilon", "sample", "v_prediction"]: + for timestep_spacing in ["linspace", "leading"]: + sample = self.full_loop( + prediction_type=prediction_type, + timestep_spacing=timestep_spacing, + ) + sample_custom_timesteps = self.full_loop_custom_timesteps( + prediction_type=prediction_type, + timestep_spacing=timestep_spacing, + ) + assert ( + torch.sum(torch.abs(sample - sample_custom_timesteps)) < 1e-5 + ), f"Scheduler outputs are not identical for prediction_type: {prediction_type}, timestep_spacing: {timestep_spacing}" diff --git a/tests/single_file/__init__.py b/tests/single_file/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/single_file/single_file_testing_utils.py b/tests/single_file/single_file_testing_utils.py new file mode 100644 index 000000000000..5157cd4ca63c --- /dev/null +++ b/tests/single_file/single_file_testing_utils.py @@ -0,0 +1,380 @@ +import tempfile +from io import BytesIO + +import requests +import torch +from huggingface_hub import hf_hub_download, snapshot_download + +from diffusers.models.attention_processor import AttnProcessor +from diffusers.utils.testing_utils import ( + numpy_cosine_similarity_distance, + torch_device, +) + + +def download_single_file_checkpoint(repo_id, filename, tmpdir): + path = hf_hub_download(repo_id, filename=filename, local_dir=tmpdir) + return path + + +def download_original_config(config_url, tmpdir): + original_config_file = BytesIO(requests.get(config_url).content) + path = f"{tmpdir}/config.yaml" + with open(path, "wb") as f: + f.write(original_config_file.read()) + + return path + + +def download_diffusers_config(repo_id, tmpdir): + path = snapshot_download( + repo_id, + ignore_patterns=[ + "**/*.ckpt", + "*.ckpt", + "**/*.bin", + "*.bin", + "**/*.pt", + "*.pt", + "**/*.safetensors", + "*.safetensors", + ], + allow_patterns=["**/*.json", "*.json", "*.txt", "**/*.txt"], + local_dir=tmpdir, + ) + return path + + +class SDSingleFileTesterMixin: + def _compare_component_configs(self, pipe, single_file_pipe): + for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items(): + if param_name in ["torch_dtype", "architectures", "_name_or_path"]: + continue + assert pipe.text_encoder.config.to_dict()[param_name] == param_value + + PARAMS_TO_IGNORE = [ + "torch_dtype", + "_name_or_path", + "architectures", + "_use_default_values", + "_diffusers_version", + ] + for component_name, component in single_file_pipe.components.items(): + if component_name in single_file_pipe._optional_components: + continue + + # skip testing transformer based components here + # skip text encoders / safety checkers since they have already been tested + if component_name in ["text_encoder", "tokenizer", "safety_checker", "feature_extractor"]: + continue + + assert component_name in pipe.components, f"single file {component_name} not found in pretrained pipeline" + assert isinstance( + component, pipe.components[component_name].__class__ + ), f"single file {component.__class__.__name__} and pretrained {pipe.components[component_name].__class__.__name__} are not the same" + + for param_name, param_value in component.config.items(): + if param_name in PARAMS_TO_IGNORE: + continue + + # Some pretrained configs will set upcast attention to None + # In single file loading it defaults to the value in the class __init__ which is False + if param_name == "upcast_attention" and 
pipe.components[component_name].config[param_name] is None: + pipe.components[component_name].config[param_name] = param_value + + assert ( + pipe.components[component_name].config[param_name] == param_value + ), f"single file {param_name}: {param_value} differs from pretrained {pipe.components[component_name].config[param_name]}" + + def test_single_file_components(self, pipe=None, single_file_pipe=None): + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + self.ckpt_path, safety_checker=None + ) + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + + self._compare_component_configs(pipe, single_file_pipe) + + def test_single_file_components_local_files_only(self, pipe=None, single_file_pipe=None): + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + local_ckpt_path, safety_checker=None, local_files_only=True + ) + + self._compare_component_configs(pipe, single_file_pipe) + + def test_single_file_components_with_original_config( + self, + pipe=None, + single_file_pipe=None, + ): + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + # Not possible to infer this value when original config is provided + # we just pass it in here otherwise this test will fail + upcast_attention = pipe.unet.config.upcast_attention + + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + self.ckpt_path, + original_config=self.original_config, + safety_checker=None, + upcast_attention=upcast_attention, + ) + + self._compare_component_configs(pipe, single_file_pipe) + + def test_single_file_components_with_original_config_local_files_only( + self, + pipe=None, + single_file_pipe=None, + ): + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + + # Not possible to infer this value when original config is provided + # we just pass it in here otherwise this test will fail + upcast_attention = pipe.unet.config.upcast_attention + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_original_config = download_original_config(self.original_config, tmpdir) + + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + local_ckpt_path, + original_config=local_original_config, + safety_checker=None, + upcast_attention=upcast_attention, + local_files_only=True, + ) + + self._compare_component_configs(pipe, single_file_pipe) + + def test_single_file_format_inference_is_same_as_pretrained(self, expected_max_diff=1e-4): + sf_pipe = self.pipeline_class.from_single_file(self.ckpt_path, safety_checker=None) + sf_pipe.unet.set_attn_processor(AttnProcessor()) + sf_pipe.enable_model_cpu_offload() + + inputs = self.get_inputs(torch_device) + image_single_file = sf_pipe(**inputs).images[0] + + pipe = self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + pipe.unet.set_attn_processor(AttnProcessor()) + pipe.enable_model_cpu_offload() + + inputs = self.get_inputs(torch_device) + image = pipe(**inputs).images[0] + + max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten()) + 
+ assert max_diff < expected_max_diff + + def test_single_file_components_with_diffusers_config( + self, + pipe=None, + single_file_pipe=None, + ): + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + self.ckpt_path, config=self.repo_id, safety_checker=None + ) + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + + self._compare_component_configs(pipe, single_file_pipe) + + def test_single_file_components_with_diffusers_config_local_files_only( + self, + pipe=None, + single_file_pipe=None, + ): + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir) + + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + local_ckpt_path, config=local_diffusers_config, safety_checker=None, local_files_only=True + ) + + self._compare_component_configs(pipe, single_file_pipe) + + +class SDXLSingleFileTesterMixin: + def _compare_component_configs(self, pipe, single_file_pipe): + # Skip testing the text_encoder for Refiner Pipelines + if pipe.text_encoder: + for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items(): + if param_name in ["torch_dtype", "architectures", "_name_or_path"]: + continue + assert pipe.text_encoder.config.to_dict()[param_name] == param_value + + for param_name, param_value in single_file_pipe.text_encoder_2.config.to_dict().items(): + if param_name in ["torch_dtype", "architectures", "_name_or_path"]: + continue + assert pipe.text_encoder_2.config.to_dict()[param_name] == param_value + + PARAMS_TO_IGNORE = [ + "torch_dtype", + "_name_or_path", + "architectures", + "_use_default_values", + "_diffusers_version", + ] + for component_name, component in single_file_pipe.components.items(): + if component_name in single_file_pipe._optional_components: + continue + + # skip text encoders since they have already been tested + if component_name in ["text_encoder", "text_encoder_2", "tokenizer", "tokenizer_2"]: + continue + + # skip safety checker if it is not present in the pipeline + if component_name in ["safety_checker", "feature_extractor"]: + continue + + assert component_name in pipe.components, f"single file {component_name} not found in pretrained pipeline" + assert isinstance( + component, pipe.components[component_name].__class__ + ), f"single file {component.__class__.__name__} and pretrained {pipe.components[component_name].__class__.__name__} are not the same" + + for param_name, param_value in component.config.items(): + if param_name in PARAMS_TO_IGNORE: + continue + + # Some pretrained configs will set upcast attention to None + # In single file loading it defaults to the value in the class __init__ which is False + if param_name == "upcast_attention" and pipe.components[component_name].config[param_name] is None: + pipe.components[component_name].config[param_name] = param_value + + assert ( + pipe.components[component_name].config[param_name] == param_value + ), f"single file {param_name}: {param_value} differs from pretrained {pipe.components[component_name].config[param_name]}" + + def test_single_file_components(self, pipe=None, single_file_pipe=None): + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + self.ckpt_path, safety_checker=None + 
) + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + + self._compare_component_configs( + pipe, + single_file_pipe, + ) + + def test_single_file_components_local_files_only( + self, + pipe=None, + single_file_pipe=None, + ): + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + local_ckpt_path, safety_checker=None, local_files_only=True + ) + + self._compare_component_configs(pipe, single_file_pipe) + + def test_single_file_components_with_original_config( + self, + pipe=None, + single_file_pipe=None, + ): + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + # Not possible to infer this value when original config is provided + # we just pass it in here otherwise this test will fail + upcast_attention = pipe.unet.config.upcast_attention + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + self.ckpt_path, + original_config=self.original_config, + safety_checker=None, + upcast_attention=upcast_attention, + ) + + self._compare_component_configs( + pipe, + single_file_pipe, + ) + + def test_single_file_components_with_original_config_local_files_only( + self, + pipe=None, + single_file_pipe=None, + ): + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + # Not possible to infer this value when original config is provided + # we just pass it in here otherwise this test will fail + upcast_attention = pipe.unet.config.upcast_attention + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_original_config = download_original_config(self.original_config, tmpdir) + + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + local_ckpt_path, + original_config=local_original_config, + upcast_attention=upcast_attention, + safety_checker=None, + local_files_only=True, + ) + + self._compare_component_configs( + pipe, + single_file_pipe, + ) + + def test_single_file_components_with_diffusers_config( + self, + pipe=None, + single_file_pipe=None, + ): + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + self.ckpt_path, config=self.repo_id, safety_checker=None + ) + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + + self._compare_component_configs(pipe, single_file_pipe) + + def test_single_file_components_with_diffusers_config_local_files_only( + self, + pipe=None, + single_file_pipe=None, + ): + pipe = pipe or self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir) + + single_file_pipe = single_file_pipe or self.pipeline_class.from_single_file( + local_ckpt_path, config=local_diffusers_config, safety_checker=None, local_files_only=True + ) + + self._compare_component_configs(pipe, single_file_pipe) + + def test_single_file_format_inference_is_same_as_pretrained(self, 
expected_max_diff=1e-4): + sf_pipe = self.pipeline_class.from_single_file(self.ckpt_path, torch_dtype=torch.float16, safety_checker=None) + sf_pipe.unet.set_default_attn_processor() + sf_pipe.enable_model_cpu_offload() + + inputs = self.get_inputs(torch_device) + image_single_file = sf_pipe(**inputs).images[0] + + pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16, safety_checker=None) + pipe.unet.set_default_attn_processor() + pipe.enable_model_cpu_offload() + + inputs = self.get_inputs(torch_device) + image = pipe(**inputs).images[0] + + max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten()) + + assert max_diff < expected_max_diff diff --git a/tests/single_file/test_model_controlnet_single_file.py b/tests/single_file/test_model_controlnet_single_file.py new file mode 100644 index 000000000000..1d5b790ebb4a --- /dev/null +++ b/tests/single_file/test_model_controlnet_single_file.py @@ -0,0 +1,78 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +import torch + +from diffusers import ( + ControlNetModel, +) +from diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_gpu, + slow, +) + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class ControlNetModelSingleFileTests(unittest.TestCase): + model_class = ControlNetModel + ckpt_path = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" + repo_id = "lllyasviel/control_v11p_sd15_canny" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_single_file_components(self): + model = self.model_class.from_pretrained(self.repo_id) + model_single_file = self.model_class.from_single_file(self.ckpt_path) + + PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] + for param_name, param_value in model_single_file.config.items(): + if param_name in PARAMS_TO_IGNORE: + continue + assert ( + model.config[param_name] == param_value + ), f"{param_name} differs between single file loading and pretrained loading" + + def test_single_file_arguments(self): + model_default = self.model_class.from_single_file(self.ckpt_path) + + assert model_default.config.upcast_attention is False + assert model_default.dtype == torch.float32 + + torch_dtype = torch.float16 + upcast_attention = True + + model = self.model_class.from_single_file( + self.ckpt_path, + upcast_attention=upcast_attention, + torch_dtype=torch_dtype, + ) + assert model.config.upcast_attention == upcast_attention + assert model.dtype == torch_dtype diff --git a/tests/single_file/test_model_sd_cascade_unet_single_file.py b/tests/single_file/test_model_sd_cascade_unet_single_file.py new file mode 100644 index 000000000000..43253eb6d59f --- /dev/null +++ b/tests/single_file/test_model_sd_cascade_unet_single_file.py @@ -0,0 
+1,114 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +import torch + +from diffusers import StableCascadeUNet +from diffusers.utils import logging +from diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_gpu, + slow, +) + + +logger = logging.get_logger(__name__) + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableCascadeUNetSingleFileTest(unittest.TestCase): + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_single_file_components_stage_b(self): + model_single_file = StableCascadeUNet.from_single_file( + "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_bf16.safetensors", + torch_dtype=torch.bfloat16, + ) + model = StableCascadeUNet.from_pretrained( + "stabilityai/stable-cascade", variant="bf16", subfolder="decoder", use_safetensors=True + ) + + PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] + for param_name, param_value in model_single_file.config.items(): + if param_name in PARAMS_TO_IGNORE: + continue + assert ( + model.config[param_name] == param_value + ), f"{param_name} differs between single file loading and pretrained loading" + + def test_single_file_components_stage_b_lite(self): + model_single_file = StableCascadeUNet.from_single_file( + "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_lite_bf16.safetensors", + torch_dtype=torch.bfloat16, + ) + model = StableCascadeUNet.from_pretrained( + "stabilityai/stable-cascade", variant="bf16", subfolder="decoder_lite" + ) + + PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] + for param_name, param_value in model_single_file.config.items(): + if param_name in PARAMS_TO_IGNORE: + continue + assert ( + model.config[param_name] == param_value + ), f"{param_name} differs between single file loading and pretrained loading" + + def test_single_file_components_stage_c(self): + model_single_file = StableCascadeUNet.from_single_file( + "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors", + torch_dtype=torch.bfloat16, + ) + model = StableCascadeUNet.from_pretrained( + "stabilityai/stable-cascade-prior", variant="bf16", subfolder="prior" + ) + + PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] + for param_name, param_value in model_single_file.config.items(): + if param_name in PARAMS_TO_IGNORE: + continue + assert ( + model.config[param_name] == param_value + ), f"{param_name} differs between single file loading and pretrained loading" + + def test_single_file_components_stage_c_lite(self): + model_single_file = StableCascadeUNet.from_single_file( + "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_lite_bf16.safetensors", + torch_dtype=torch.bfloat16, + ) 
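+        # Load the matching pretrained variant (the lite stage C prior subfolder) so its config can be
+        # compared field by field against the config inferred from the single file checkpoint above.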
+ model = StableCascadeUNet.from_pretrained( + "stabilityai/stable-cascade-prior", variant="bf16", subfolder="prior_lite" + ) + + PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] + for param_name, param_value in model_single_file.config.items(): + if param_name in PARAMS_TO_IGNORE: + continue + assert ( + model.config[param_name] == param_value + ), f"{param_name} differs between single file loading and pretrained loading" diff --git a/tests/single_file/test_model_vae_single_file.py b/tests/single_file/test_model_vae_single_file.py new file mode 100644 index 000000000000..63f2bb757472 --- /dev/null +++ b/tests/single_file/test_model_vae_single_file.py @@ -0,0 +1,117 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +import torch + +from diffusers import ( + AutoencoderKL, +) +from diffusers.utils.testing_utils import ( + enable_full_determinism, + load_hf_numpy, + numpy_cosine_similarity_distance, + require_torch_gpu, + slow, + torch_device, +) + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class AutoencoderKLSingleFileTests(unittest.TestCase): + model_class = AutoencoderKL + ckpt_path = ( + "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors" + ) + repo_id = "stabilityai/sd-vae-ft-mse" + main_input_name = "sample" + base_precision = 1e-2 + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_file_format(self, seed, shape): + return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" + + def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): + dtype = torch.float16 if fp16 else torch.float32 + image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) + return image + + def test_single_file_inference_same_as_pretrained(self): + model_1 = self.model_class.from_pretrained(self.repo_id).to(torch_device) + model_2 = self.model_class.from_single_file(self.ckpt_path, config=self.repo_id).to(torch_device) + + image = self.get_sd_image(33) + + generator = torch.Generator(torch_device) + + with torch.no_grad(): + sample_1 = model_1(image, generator=generator.manual_seed(0)).sample + sample_2 = model_2(image, generator=generator.manual_seed(0)).sample + + assert sample_1.shape == sample_2.shape + + output_slice_1 = sample_1.flatten().float().cpu() + output_slice_2 = sample_2.flatten().float().cpu() + + assert numpy_cosine_similarity_distance(output_slice_1, output_slice_2) < 1e-4 + + def test_single_file_components(self): + model = self.model_class.from_pretrained(self.repo_id) + model_single_file = self.model_class.from_single_file(self.ckpt_path, config=self.repo_id) + + PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] + for param_name, param_value in 
model_single_file.config.items(): + if param_name in PARAMS_TO_IGNORE: + continue + assert ( + model.config[param_name] == param_value + ), f"{param_name} differs between pretrained loading and single file loading" + + def test_single_file_arguments(self): + model_default = self.model_class.from_single_file(self.ckpt_path, config=self.repo_id) + + assert model_default.config.scaling_factor == 0.18215 + assert model_default.config.sample_size == 256 + assert model_default.dtype == torch.float32 + + scaling_factor = 2.0 + sample_size = 512 + torch_dtype = torch.float16 + + model = self.model_class.from_single_file( + self.ckpt_path, + config=self.repo_id, + sample_size=sample_size, + scaling_factor=scaling_factor, + torch_dtype=torch_dtype, + ) + assert model.config.scaling_factor == scaling_factor + assert model.config.sample_size == sample_size + assert model.dtype == torch_dtype diff --git a/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py b/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py new file mode 100644 index 000000000000..8e9ac7973609 --- /dev/null +++ b/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py @@ -0,0 +1,182 @@ +import gc +import tempfile +import unittest + +import torch + +from diffusers import ControlNetModel, StableDiffusionControlNetPipeline +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + numpy_cosine_similarity_distance, + require_torch_gpu, + slow, + torch_device, +) + +from .single_file_testing_utils import ( + SDSingleFileTesterMixin, + download_diffusers_config, + download_original_config, + download_single_file_checkpoint, +) + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): + pipeline_class = StableDiffusionControlNetPipeline + ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" + original_config = ( + "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + ) + repo_id = "runwayml/stable-diffusion-v1-5" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + init_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_img2img/sketch-mountains-input.png" + ) + control_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ).resize((512, 512)) + prompt = "bird" + + inputs = { + "prompt": prompt, + "image": init_image, + "control_image": control_image, + "generator": generator, + "num_inference_steps": 3, + "strength": 0.75, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + pipe.unet.set_default_attn_processor() + pipe.enable_model_cpu_offload() + + pipe_sf = self.pipeline_class.from_single_file( + self.ckpt_path, + controlnet=controlnet, + ) 
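+        # Use the default attention processor and CPU offload on both pipelines so the image comparison
+        # below is not skewed by different attention backends or device placement.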
+ pipe_sf.unet.set_default_attn_processor() + pipe_sf.enable_model_cpu_offload() + + inputs = self.get_inputs(torch_device) + output = pipe(**inputs).images[0] + + inputs = self.get_inputs(torch_device) + output_sf = pipe_sf(**inputs).images[0] + + max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten()) + assert max_diff < 1e-3 + + def test_single_file_components(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") + pipe = self.pipeline_class.from_pretrained( + self.repo_id, variant="fp16", safety_checker=None, controlnet=controlnet + ) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, + safety_checker=None, + controlnet=controlnet, + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_local_files_only(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, controlnet=controlnet, safety_checker=None, local_files_only=True + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_original_config(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, controlnet=controlnet, safety_checker=None, original_config=self.original_config + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_original_config_local_files_only(self): + controlnet = ControlNetModel.from_pretrained( + "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + controlnet=controlnet, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_original_config = download_original_config(self.original_config, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, + original_config=local_original_config, + controlnet=controlnet, + safety_checker=None, + local_files_only=True, + ) + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_diffusers_config(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, controlnet=controlnet, safety_checker=None, original_config=self.original_config + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_diffusers_config_local_files_only(self): + controlnet = ControlNetModel.from_pretrained( + "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + controlnet=controlnet, + ) + + with tempfile.TemporaryDirectory() 
as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, + config=local_diffusers_config, + safety_checker=None, + controlnet=controlnet, + local_files_only=True, + ) + super()._compare_component_configs(pipe, pipe_single_file) diff --git a/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py b/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py new file mode 100644 index 000000000000..8c750437f719 --- /dev/null +++ b/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py @@ -0,0 +1,183 @@ +import gc +import tempfile +import unittest + +import torch + +from diffusers import ControlNetModel, StableDiffusionControlNetInpaintPipeline +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + numpy_cosine_similarity_distance, + require_torch_gpu, + slow, +) + +from .single_file_testing_utils import ( + SDSingleFileTesterMixin, + download_diffusers_config, + download_original_config, + download_single_file_checkpoint, +) + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableDiffusionControlNetInpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): + pipeline_class = StableDiffusionControlNetInpaintPipeline + ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt" + original_config = "https://raw.githubusercontent.com/runwayml/stable-diffusion/main/configs/stable-diffusion/v1-inpainting-inference.yaml" + repo_id = "runwayml/stable-diffusion-inpainting" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self): + control_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ).resize((512, 512)) + image = load_image( + "https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png" + ).resize((512, 512)) + mask_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_inpaint/input_bench_mask.png" + ).resize((512, 512)) + + inputs = { + "prompt": "bird", + "image": image, + "control_image": control_image, + "mask_image": mask_image, + "generator": torch.Generator(device="cpu").manual_seed(0), + "num_inference_steps": 3, + "output_type": "np", + } + + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet, safety_checker=None) + pipe.unet.set_default_attn_processor() + pipe.enable_model_cpu_offload() + + pipe_sf = self.pipeline_class.from_single_file(self.ckpt_path, controlnet=controlnet, safety_checker=None) + pipe_sf.unet.set_default_attn_processor() + pipe_sf.enable_model_cpu_offload() + + inputs = self.get_inputs() + output = pipe(**inputs).images[0] + + inputs = self.get_inputs() + output_sf = pipe_sf(**inputs).images[0] + + max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten()) + assert max_diff < 1e-3 + + def 
test_single_file_components(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") + pipe = self.pipeline_class.from_pretrained( + self.repo_id, variant="fp16", safety_checker=None, controlnet=controlnet + ) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, + safety_checker=None, + controlnet=controlnet, + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_local_files_only(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") + pipe = self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None, controlnet=controlnet) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, controlnet=controlnet, safety_checker=None, local_files_only=True + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_original_config(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, controlnet=controlnet, original_config=self.original_config + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_original_config_local_files_only(self): + controlnet = ControlNetModel.from_pretrained( + "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + controlnet=controlnet, + safety_checker=None, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_original_config = download_original_config(self.original_config, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, + original_config=local_original_config, + controlnet=controlnet, + safety_checker=None, + local_files_only=True, + ) + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_diffusers_config(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, + controlnet=controlnet, + config=self.repo_id, + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_diffusers_config_local_files_only(self): + controlnet = ControlNetModel.from_pretrained( + "lllyasviel/control_v11p_sd15_canny", + torch_dtype=torch.float16, + variant="fp16", + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + controlnet=controlnet, + safety_checker=None, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, + config=local_diffusers_config, + 
controlnet=controlnet, + safety_checker=None, + local_files_only=True, + ) + super()._compare_component_configs(pipe, pipe_single_file) diff --git a/tests/single_file/test_stable_diffusion_controlnet_single_file.py b/tests/single_file/test_stable_diffusion_controlnet_single_file.py new file mode 100644 index 000000000000..abcf4c11d614 --- /dev/null +++ b/tests/single_file/test_stable_diffusion_controlnet_single_file.py @@ -0,0 +1,171 @@ +import gc +import tempfile +import unittest + +import torch + +from diffusers import ControlNetModel, StableDiffusionControlNetPipeline +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + numpy_cosine_similarity_distance, + require_torch_gpu, + slow, +) + +from .single_file_testing_utils import ( + SDSingleFileTesterMixin, + download_diffusers_config, + download_original_config, + download_single_file_checkpoint, +) + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): + pipeline_class = StableDiffusionControlNetPipeline + ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" + original_config = ( + "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + ) + repo_id = "runwayml/stable-diffusion-v1-5" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self): + control_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ).resize((512, 512)) + inputs = { + "prompt": "bird", + "image": control_image, + "generator": torch.Generator(device="cpu").manual_seed(0), + "num_inference_steps": 3, + "output_type": "np", + } + + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + pipe.unet.set_default_attn_processor() + pipe.enable_model_cpu_offload() + + pipe_sf = self.pipeline_class.from_single_file( + self.ckpt_path, + controlnet=controlnet, + ) + pipe_sf.unet.set_default_attn_processor() + pipe_sf.enable_model_cpu_offload() + + inputs = self.get_inputs() + output = pipe(**inputs).images[0] + + inputs = self.get_inputs() + output_sf = pipe_sf(**inputs).images[0] + + max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten()) + assert max_diff < 1e-3 + + def test_single_file_components(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") + pipe = self.pipeline_class.from_pretrained( + self.repo_id, variant="fp16", safety_checker=None, controlnet=controlnet + ) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, + safety_checker=None, + controlnet=controlnet, + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_local_files_only(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = 
download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, controlnet=controlnet, local_files_only=True + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_original_config(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, controlnet=controlnet, original_config=self.original_config + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_original_config_local_files_only(self): + controlnet = ControlNetModel.from_pretrained( + "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + controlnet=controlnet, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_original_config = download_original_config(self.original_config, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, original_config=local_original_config, controlnet=controlnet, local_files_only=True + ) + pipe_single_file.scheduler = pipe.scheduler + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_diffusers_config(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", variant="fp16") + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, controlnet=controlnet, safety_checker=None, config=self.repo_id + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_diffusers_config_local_files_only(self): + controlnet = ControlNetModel.from_pretrained( + "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + controlnet=controlnet, + safety_checker=None, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, + config=local_diffusers_config, + controlnet=controlnet, + safety_checker=None, + local_files_only=True, + ) + super()._compare_component_configs(pipe, pipe_single_file) diff --git a/tests/single_file/test_stable_diffusion_img2img_single_file.py b/tests/single_file/test_stable_diffusion_img2img_single_file.py new file mode 100644 index 000000000000..1359e66b2c90 --- /dev/null +++ b/tests/single_file/test_stable_diffusion_img2img_single_file.py @@ -0,0 +1,99 @@ +import gc +import unittest + +import torch + +from diffusers import ( + StableDiffusionImg2ImgPipeline, +) +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_gpu, + slow, +) + +from .single_file_testing_utils import SDSingleFileTesterMixin + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class 
StableDiffusionImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): + pipeline_class = StableDiffusionImg2ImgPipeline + ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" + original_config = ( + "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + ) + repo_id = "runwayml/stable-diffusion-v1-5" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + init_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_img2img/sketch-mountains-input.png" + ) + inputs = { + "prompt": "a fantasy landscape, concept art, high resolution", + "image": init_image, + "generator": generator, + "num_inference_steps": 3, + "strength": 0.75, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3) + + +@slow +@require_torch_gpu +class StableDiffusion21Img2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): + pipeline_class = StableDiffusionImg2ImgPipeline + ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors" + original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" + repo_id = "stabilityai/stable-diffusion-2-1" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + init_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_img2img/sketch-mountains-input.png" + ) + inputs = { + "prompt": "a fantasy landscape, concept art, high resolution", + "image": init_image, + "generator": generator, + "num_inference_steps": 3, + "strength": 0.75, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3) diff --git a/tests/single_file/test_stable_diffusion_inpaint_single_file.py b/tests/single_file/test_stable_diffusion_inpaint_single_file.py new file mode 100644 index 000000000000..3fc72844648b --- /dev/null +++ b/tests/single_file/test_stable_diffusion_inpaint_single_file.py @@ -0,0 +1,114 @@ +import gc +import unittest + +import torch + +from diffusers import ( + StableDiffusionInpaintPipeline, +) +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_gpu, + slow, +) + +from .single_file_testing_utils import SDSingleFileTesterMixin + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableDiffusionInpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): + pipeline_class = StableDiffusionInpaintPipeline + ckpt_path = 
"https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt" + original_config = "https://raw.githubusercontent.com/runwayml/stable-diffusion/main/configs/stable-diffusion/v1-inpainting-inference.yaml" + repo_id = "runwayml/stable-diffusion-inpainting" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + init_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_inpaint/input_bench_image.png" + ) + mask_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_inpaint/input_bench_mask.png" + ) + inputs = { + "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", + "image": init_image, + "mask_image": mask_image, + "generator": generator, + "num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3) + + def test_single_file_loading_4_channel_unet(self): + # Test loading single file inpaint with a 4 channel UNet + ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" + pipe = self.pipeline_class.from_single_file(ckpt_path) + + assert pipe.unet.config.in_channels == 4 + + +@slow +@require_torch_gpu +class StableDiffusion21InpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): + pipeline_class = StableDiffusionInpaintPipeline + ckpt_path = ( + "https://huggingface.co/stabilityai/stable-diffusion-2-inpainting/blob/main/512-inpainting-ema.safetensors" + ) + original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inpainting-inference.yaml" + repo_id = "stabilityai/stable-diffusion-2-inpainting" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + init_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_inpaint/input_bench_image.png" + ) + mask_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_inpaint/input_bench_mask.png" + ) + inputs = { + "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", + "image": init_image, + "mask_image": mask_image, + "generator": generator, + "num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3) diff --git a/tests/single_file/test_stable_diffusion_single_file.py b/tests/single_file/test_stable_diffusion_single_file.py new file mode 100644 index 000000000000..99c884fae06b --- /dev/null +++ b/tests/single_file/test_stable_diffusion_single_file.py @@ -0,0 +1,115 @@ +import gc +import tempfile 
+import unittest + +import torch + +from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline +from diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_gpu, + slow, +) + +from .single_file_testing_utils import ( + SDSingleFileTesterMixin, + download_original_config, + download_single_file_checkpoint, +) + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableDiffusionPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): + pipeline_class = StableDiffusionPipeline + ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" + original_config = ( + "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + ) + repo_id = "runwayml/stable-diffusion-v1-5" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + inputs = { + "prompt": "a fantasy landscape, concept art, high resolution", + "generator": generator, + "num_inference_steps": 2, + "strength": 0.75, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3) + + def test_single_file_legacy_scheduler_loading(self): + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_original_config = download_original_config(self.original_config, tmpdir) + + pipe = self.pipeline_class.from_single_file( + local_ckpt_path, + original_config=local_original_config, + cache_dir=tmpdir, + local_files_only=True, + scheduler_type="euler", + ) + + # Default is PNDM for this checkpoint + assert isinstance(pipe.scheduler, EulerDiscreteScheduler) + + def test_single_file_legacy_scaling_factor(self): + new_scaling_factor = 10.0 + init_pipe = self.pipeline_class.from_single_file(self.ckpt_path) + pipe = self.pipeline_class.from_single_file(self.ckpt_path, scaling_factor=new_scaling_factor) + + assert init_pipe.vae.config.scaling_factor != new_scaling_factor + assert pipe.vae.config.scaling_factor == new_scaling_factor + + +@slow +class StableDiffusion21PipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): + pipeline_class = StableDiffusionPipeline + ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors" + original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" + repo_id = "stabilityai/stable-diffusion-2-1" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + inputs = { + "prompt": "a fantasy landscape, concept art, high resolution", + "generator": generator, + "num_inference_steps": 2, + "strength": 0.75, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def 
test_single_file_format_inference_is_same_as_pretrained(self): + super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3) diff --git a/tests/single_file/test_stable_diffusion_upscale_single_file.py b/tests/single_file/test_stable_diffusion_upscale_single_file.py new file mode 100644 index 000000000000..3c26d001c2b0 --- /dev/null +++ b/tests/single_file/test_stable_diffusion_upscale_single_file.py @@ -0,0 +1,68 @@ +import gc +import unittest + +import torch + +from diffusers import ( + StableDiffusionUpscalePipeline, +) +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + numpy_cosine_similarity_distance, + require_torch_gpu, + slow, +) + +from .single_file_testing_utils import SDSingleFileTesterMixin + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableDiffusionUpscalePipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): + pipeline_class = StableDiffusionUpscalePipeline + ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/blob/main/x4-upscaler-ema.safetensors" + original_config = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml" + repo_id = "stabilityai/stable-diffusion-x4-upscaler" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_single_file_format_inference_is_same_as_pretrained(self): + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/sd2-upscale/low_res_cat.png" + ) + + prompt = "a cat sitting on a park bench" + pipe = StableDiffusionUpscalePipeline.from_pretrained(self.repo_id) + pipe.enable_model_cpu_offload() + + generator = torch.Generator("cpu").manual_seed(0) + output = pipe(prompt=prompt, image=image, generator=generator, output_type="np", num_inference_steps=3) + image_from_pretrained = output.images[0] + + pipe_from_single_file = StableDiffusionUpscalePipeline.from_single_file(self.ckpt_path) + pipe_from_single_file.enable_model_cpu_offload() + + generator = torch.Generator("cpu").manual_seed(0) + output_from_single_file = pipe_from_single_file( + prompt=prompt, image=image, generator=generator, output_type="np", num_inference_steps=3 + ) + image_from_single_file = output_from_single_file.images[0] + + assert image_from_pretrained.shape == (512, 512, 3) + assert image_from_single_file.shape == (512, 512, 3) + assert ( + numpy_cosine_similarity_distance(image_from_pretrained.flatten(), image_from_single_file.flatten()) < 1e-3 + ) diff --git a/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py b/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py new file mode 100644 index 000000000000..43881914d3c0 --- /dev/null +++ b/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py @@ -0,0 +1,202 @@ +import gc +import tempfile +import unittest + +import torch + +from diffusers import ( + StableDiffusionXLAdapterPipeline, + T2IAdapter, +) +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + numpy_cosine_similarity_distance, + require_torch_gpu, + slow, +) + +from .single_file_testing_utils import ( + SDXLSingleFileTesterMixin, + download_diffusers_config, + download_original_config, + download_single_file_checkpoint, +) + + +enable_full_determinism() + + +@slow +@require_torch_gpu 
+class StableDiffusionXLAdapterPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): + pipeline_class = StableDiffusionXLAdapterPipeline + ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" + repo_id = "stabilityai/stable-diffusion-xl-base-1.0" + original_config = ( + "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" + ) + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self): + prompt = "toy" + generator = torch.Generator(device="cpu").manual_seed(0) + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/t2i_adapter/toy_canny.png" + ) + + inputs = { + "prompt": prompt, + "image": image, + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 7.5, + "output_type": "np", + } + + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16) + pipe_single_file = StableDiffusionXLAdapterPipeline.from_single_file( + self.ckpt_path, + adapter=adapter, + torch_dtype=torch.float16, + safety_checker=None, + ) + pipe_single_file.enable_model_cpu_offload() + pipe_single_file.set_progress_bar_config(disable=None) + + inputs = self.get_inputs() + images_single_file = pipe_single_file(**inputs).images[0] + + pipe = StableDiffusionXLAdapterPipeline.from_pretrained( + self.repo_id, + adapter=adapter, + torch_dtype=torch.float16, + safety_checker=None, + ) + pipe.enable_model_cpu_offload() + + inputs = self.get_inputs() + images = pipe(**inputs).images[0] + + assert images_single_file.shape == (768, 512, 3) + assert images.shape == (768, 512, 3) + + max_diff = numpy_cosine_similarity_distance(images.flatten(), images_single_file.flatten()) + assert max_diff < 5e-3 + + def test_single_file_components(self): + adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + variant="fp16", + adapter=adapter, + torch_dtype=torch.float16, + ) + + pipe_single_file = self.pipeline_class.from_single_file(self.ckpt_path, safety_checker=None, adapter=adapter) + super().test_single_file_components(pipe, pipe_single_file) + + def test_single_file_components_local_files_only(self): + adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + variant="fp16", + adapter=adapter, + torch_dtype=torch.float16, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + + single_file_pipe = self.pipeline_class.from_single_file( + local_ckpt_path, adapter=adapter, safety_checker=None, local_files_only=True + ) + + self._compare_component_configs(pipe, single_file_pipe) + + def test_single_file_components_with_diffusers_config(self): + adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + variant="fp16", + adapter=adapter, + torch_dtype=torch.float16, + safety_checker=None, + ) + + 
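+        # Passing config=self.repo_id below asks from_single_file to take the component configs from the
+        # Diffusers-format repo rather than inferring them from the checkpoint, which is what
+        # _compare_component_configs then verifies against the from_pretrained pipeline.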
pipe_single_file = self.pipeline_class.from_single_file(self.ckpt_path, config=self.repo_id, adapter=adapter) + self._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_diffusers_config_local_files_only(self): + adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + variant="fp16", + adapter=adapter, + torch_dtype=torch.float16, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, + config=local_diffusers_config, + adapter=adapter, + safety_checker=None, + local_files_only=True, + ) + self._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_original_config(self): + adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + variant="fp16", + adapter=adapter, + torch_dtype=torch.float16, + safety_checker=None, + ) + + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, original_config=self.original_config, adapter=adapter + ) + self._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_original_config_local_files_only(self): + adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-lineart-sdxl-1.0", torch_dtype=torch.float16) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + variant="fp16", + adapter=adapter, + torch_dtype=torch.float16, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_original_config = download_original_config(self.original_config, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, + original_config=local_original_config, + adapter=adapter, + safety_checker=None, + local_files_only=True, + ) + self._compare_component_configs(pipe, pipe_single_file) diff --git a/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py b/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py new file mode 100644 index 000000000000..6aebc2b01999 --- /dev/null +++ b/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py @@ -0,0 +1,197 @@ +import gc +import tempfile +import unittest + +import torch + +from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + numpy_cosine_similarity_distance, + require_torch_gpu, + slow, + torch_device, +) + +from .single_file_testing_utils import ( + SDXLSingleFileTesterMixin, + download_diffusers_config, + download_single_file_checkpoint, +) + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableDiffusionXLControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): + pipeline_class = StableDiffusionXLControlNetPipeline + ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" + repo_id = "stabilityai/stable-diffusion-xl-base-1.0" + 
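+    # original_config points at the upstream SDXL base inference YAML and is only used by the
+    # *_with_original_config tests below.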
original_config = ( + "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" + ) + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" + ) + inputs = { + "prompt": "Stormtrooper's lecture", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "strength": 0.75, + "guidance_scale": 7.5, + "output_type": "np", + } + + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, controlnet=controlnet, torch_dtype=torch.float16 + ) + pipe_single_file.unet.set_default_attn_processor() + pipe_single_file.enable_model_cpu_offload() + pipe_single_file.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + single_file_images = pipe_single_file(**inputs).images[0] + + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet, torch_dtype=torch.float16) + pipe.unet.set_default_attn_processor() + pipe.enable_model_cpu_offload() + + inputs = self.get_inputs(torch_device) + images = pipe(**inputs).images[0] + + assert images.shape == (512, 512, 3) + assert single_file_images.shape == (512, 512, 3) + + max_diff = numpy_cosine_similarity_distance(images[0].flatten(), single_file_images[0].flatten()) + assert max_diff < 5e-2 + + def test_single_file_components(self): + controlnet = ControlNetModel.from_pretrained( + "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + variant="fp16", + controlnet=controlnet, + torch_dtype=torch.float16, + ) + + pipe_single_file = self.pipeline_class.from_single_file(self.ckpt_path, controlnet=controlnet) + super().test_single_file_components(pipe, pipe_single_file) + + def test_single_file_components_local_files_only(self): + controlnet = ControlNetModel.from_pretrained( + "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + variant="fp16", + controlnet=controlnet, + torch_dtype=torch.float16, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + + single_file_pipe = self.pipeline_class.from_single_file( + local_ckpt_path, controlnet=controlnet, safety_checker=None, local_files_only=True + ) + + self._compare_component_configs(pipe, single_file_pipe) + + def test_single_file_components_with_original_config(self): + controlnet = ControlNetModel.from_pretrained( + "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + variant="fp16", + controlnet=controlnet, + torch_dtype=torch.float16, + ) + + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, + original_config=self.original_config, + 
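+            # the ControlNet weights are not part of the single-file checkpoint and must always be passed in explicitly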
controlnet=controlnet, + ) + self._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_original_config_local_files_only(self): + controlnet = ControlNetModel.from_pretrained( + "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + variant="fp16", + controlnet=controlnet, + torch_dtype=torch.float16, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, + safety_checker=None, + controlnet=controlnet, + local_files_only=True, + ) + self._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_diffusers_config(self): + controlnet = ControlNetModel.from_pretrained( + "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained(self.repo_id, controlnet=controlnet) + pipe_single_file = self.pipeline_class.from_single_file( + self.ckpt_path, controlnet=controlnet, config=self.repo_id + ) + + super()._compare_component_configs(pipe, pipe_single_file) + + def test_single_file_components_with_diffusers_config_local_files_only(self): + controlnet = ControlNetModel.from_pretrained( + "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16, variant="fp16" + ) + pipe = self.pipeline_class.from_pretrained( + self.repo_id, + controlnet=controlnet, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + ckpt_filename = self.ckpt_path.split("/")[-1] + local_ckpt_path = download_single_file_checkpoint(self.repo_id, ckpt_filename, tmpdir) + local_diffusers_config = download_diffusers_config(self.repo_id, tmpdir) + + pipe_single_file = self.pipeline_class.from_single_file( + local_ckpt_path, + config=local_diffusers_config, + safety_checker=None, + controlnet=controlnet, + local_files_only=True, + ) + super()._compare_component_configs(pipe, pipe_single_file) diff --git a/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py b/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py new file mode 100644 index 000000000000..71b57eb7c6c9 --- /dev/null +++ b/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py @@ -0,0 +1,105 @@ +import gc +import unittest + +import torch + +from diffusers import ( + DDIMScheduler, + StableDiffusionXLImg2ImgPipeline, +) +from diffusers.utils import load_image +from diffusers.utils.testing_utils import ( + enable_full_determinism, + numpy_cosine_similarity_distance, + require_torch_gpu, + slow, +) + +from .single_file_testing_utils import SDXLSingleFileTesterMixin + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableDiffusionXLImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): + pipeline_class = StableDiffusionXLImg2ImgPipeline + ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" + repo_id = "stabilityai/stable-diffusion-xl-base-1.0" + original_config = ( + "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" + ) + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, 
generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + init_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_img2img/sketch-mountains-input.png" + ) + inputs = { + "prompt": "a fantasy landscape, concept art, high resolution", + "image": init_image, + "generator": generator, + "num_inference_steps": 3, + "strength": 0.75, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3) + + +@slow +@require_torch_gpu +class StableDiffusionXLImg2ImgRefinerPipelineSingleFileSlowTests(unittest.TestCase): + pipeline_class = StableDiffusionXLImg2ImgPipeline + ckpt_path = ( + "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors" + ) + repo_id = "stabilityai/stable-diffusion-xl-refiner-1.0" + original_config = ( + "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml" + ) + + def test_single_file_format_inference_is_same_as_pretrained(self): + init_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_img2img/sketch-mountains-input.png" + ) + + pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.unet.set_default_attn_processor() + pipe.enable_model_cpu_offload() + + generator = torch.Generator(device="cpu").manual_seed(0) + image = pipe( + prompt="mountains", image=init_image, num_inference_steps=5, generator=generator, output_type="np" + ).images[0] + + pipe_single_file = self.pipeline_class.from_single_file(self.ckpt_path, torch_dtype=torch.float16) + pipe_single_file.scheduler = DDIMScheduler.from_config(pipe_single_file.scheduler.config) + pipe_single_file.unet.set_default_attn_processor() + pipe_single_file.enable_model_cpu_offload() + + generator = torch.Generator(device="cpu").manual_seed(0) + image_single_file = pipe_single_file( + prompt="mountains", image=init_image, num_inference_steps=5, generator=generator, output_type="np" + ).images[0] + + max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten()) + + assert max_diff < 5e-4 diff --git a/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py b/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py new file mode 100644 index 000000000000..7ebddc8555bb --- /dev/null +++ b/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py @@ -0,0 +1,50 @@ +import gc +import unittest + +import torch + +from diffusers import StableDiffusionXLInstructPix2PixPipeline +from diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_gpu, + slow, +) + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableDiffusionXLInstructPix2PixPipeline(unittest.TestCase): + pipeline_class = StableDiffusionXLInstructPix2PixPipeline + ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors" + original_config = None + repo_id = "diffusers/sdxl-instructpix2pix-768" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, 
generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + inputs = { + "prompt": "a fantasy landscape, concept art, high resolution", + "generator": generator, + "num_inference_steps": 2, + "strength": 0.75, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def test_single_file_setting_cosxl_edit(self): + # Default is PNDM for this checkpoint + pipe = self.pipeline_class.from_single_file(self.ckpt_path, config=self.repo_id, is_cosxl_edit=True) + assert pipe.is_cosxl_edit is True diff --git a/tests/single_file/test_stable_diffusion_xl_single_file.py b/tests/single_file/test_stable_diffusion_xl_single_file.py new file mode 100644 index 000000000000..a143a35a2bbc --- /dev/null +++ b/tests/single_file/test_stable_diffusion_xl_single_file.py @@ -0,0 +1,54 @@ +import gc +import unittest + +import torch + +from diffusers import ( + StableDiffusionXLPipeline, +) +from diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_gpu, + slow, +) + +from .single_file_testing_utils import SDXLSingleFileTesterMixin + + +enable_full_determinism() + + +@slow +@require_torch_gpu +class StableDiffusionXLPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): + pipeline_class = StableDiffusionXLPipeline + ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" + repo_id = "stabilityai/stable-diffusion-xl-base-1.0" + original_config = ( + "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" + ) + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + inputs = { + "prompt": "a fantasy landscape, concept art, high resolution", + "generator": generator, + "num_inference_steps": 2, + "strength": 0.75, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def test_single_file_format_inference_is_same_as_pretrained(self): + super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3) diff --git a/utils/check_table.py b/utils/check_table.py index aa7554cbc352..80fd5660bb46 100644 --- a/utils/check_table.py +++ b/utils/check_table.py @@ -73,7 +73,7 @@ def _find_text_in_file(filename, start_prompt, end_prompt): # Thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python def camel_case_split(identifier): - "Split a camelcased `identifier` into words." + """Split a camelcased `identifier` into words.""" matches = re.finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier) return [m.group(0) for m in matches]