Merge changes #188

Merged
merged 20 commits on Nov 6, 2024
Changes from all commits (20 commits)
9a92b81
Allegro VAE fix (#9811)
a-r-r-o-w Oct 30, 2024
c1d4a0d
[CI] add new runner for testing (#9699)
sayakpaul Oct 31, 2024
09b8aeb
[training] fixes to the quantization training script and add AdEMAMix…
sayakpaul Oct 31, 2024
8ce37ab
[training] use the lr when using 8bit adam. (#9796)
sayakpaul Oct 31, 2024
4adf6af
[Tests] clean up and refactor gradient checkpointing tests (#9494)
sayakpaul Oct 31, 2024
ff182ad
[CI] add a big GPU marker to run memory-intensive tests separately on…
sayakpaul Oct 31, 2024
41e4779
[LoRA] fix: lora loading when using with a device_mapped model. (#9449)
sayakpaul Oct 31, 2024
d2e5cb3
Revert "[LoRA] fix: lora loading when using with a device_mapped mode…
yiyixuxu Oct 31, 2024
c754318
[Model Card] standardize advanced diffusion training sd15 lora (#7613)
chiral-carbon Oct 31, 2024
9dcac83
NPU Adaption for FLUX (#9751)
leisuzz Nov 1, 2024
f55f1f7
Fixes EMAModel "from_pretrained" method (#9779)
SahilCarterr Nov 1, 2024
7ffbc25
Update train_controlnet_flux.py,Fix size mismatch issue in validation…
ScilenceForest Nov 1, 2024
3deed72
Handling mixed precision for dreambooth flux lora training (#9565)
icsl-Jeon Nov 1, 2024
a98a839
Reduce Memory Cost in Flux Training (#9829)
leisuzz Nov 1, 2024
c10f875
Add Diffusion Policy for Reinforcement Learning (#9824)
DorsaRoh Nov 2, 2024
13e8fde
[feat] add `load_lora_adapter()` for compatible models (#9712)
sayakpaul Nov 2, 2024
a3cc641
Refac training utils.py (#9815)
RogerSinghChugh Nov 4, 2024
3f329a4
[core] Mochi T2V (#9769)
a-r-r-o-w Nov 5, 2024
08ac5cb
[Fix] Test of sd3 lora (#9843)
SahilCarterr Nov 5, 2024
a03bf4a
Fix: Remove duplicated comma in distributed_inference.md (#9868)
vahidaskari Nov 5, 2024
56 changes: 56 additions & 0 deletions .github/workflows/nightly_tests.yml
@@ -180,6 +180,62 @@ jobs:
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

run_big_gpu_torch_tests:
name: Torch tests on big GPU
strategy:
fail-fast: false
max-parallel: 2
runs-on:
group: aws-g6e-xlarge-plus
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host --gpus 0
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: NVIDIA-SMI
run: nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install peft@git+https://github.com/huggingface/peft.git
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
python -m uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
- name: Selected Torch CUDA Test on big GPU
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
BIG_GPU_MEMORY: 40
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-m "big_gpu_with_torch_cuda" \
--make-reports=tests_big_gpu_torch_cuda \
--report-log=tests_big_gpu_torch_cuda.log \
tests/
- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_big_gpu_torch_cuda_stats.txt
cat reports/tests_big_gpu_torch_cuda_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: torch_cuda_big_gpu_test_reports
path: reports
- name: Generate Report and Notify Channel
if: always()
run: |
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

run_flax_tpu_tests:
name: Nightly Flax TPU Tests
runs-on: docker-tpu
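The new job collects only tests selected by `-m "big_gpu_with_torch_cuda"`. As a minimal sketch of how a test would opt in — the decorator spelling mirrors the pytest invocation above, and registration of the marker in the project's pytest configuration is assumed:

```python
import pytest
import torch

# Sketch only: the marker name mirrors the `-m "big_gpu_with_torch_cuda"`
# filter in the workflow above; it must be registered in the project's
# pytest config to avoid PytestUnknownMarkWarning. The BIG_GPU_MEMORY env
# var (40 here) could additionally gate tests that need a minimum VRAM.
@pytest.mark.big_gpu_with_torch_cuda
def test_large_pipeline_forward():
    # Memory-intensive work would go here; this placeholder only checks
    # that a CUDA device is visible on the big-GPU runner.
    assert torch.cuda.is_available()
```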
3 changes: 2 additions & 1 deletion .github/workflows/ssh-runner.yml
@@ -4,12 +4,13 @@ on:
workflow_dispatch:
inputs:
runner_type:
-description: 'Type of runner to test (aws-g6-4xlarge-plus: a10 or aws-g4dn-2xlarge: t4)'
+description: 'Type of runner to test (aws-g6-4xlarge-plus: a10, aws-g4dn-2xlarge: t4, aws-g6e-xlarge-plus: L40)'
type: choice
required: true
options:
- aws-g6-4xlarge-plus
- aws-g4dn-2xlarge
- aws-g6e-xlarge-plus
docker_image:
description: 'Name of the Docker image'
required: true
6 changes: 6 additions & 0 deletions docs/source/en/_toctree.yml
@@ -270,6 +270,8 @@
title: LatteTransformer3DModel
- local: api/models/lumina_nextdit2d
title: LuminaNextDiT2DModel
- local: api/models/mochi_transformer3d
title: MochiTransformer3DModel
- local: api/models/pixart_transformer2d
title: PixArtTransformer2DModel
- local: api/models/prior_transformer
@@ -306,6 +308,8 @@
title: AutoencoderKLAllegro
- local: api/models/autoencoderkl_cogvideox
title: AutoencoderKLCogVideoX
- local: api/models/autoencoderkl_mochi
title: AutoencoderKLMochi
- local: api/models/asymmetricautoencoderkl
title: AsymmetricAutoencoderKL
- local: api/models/consistency_decoder_vae
@@ -400,6 +404,8 @@
title: Lumina-T2X
- local: api/pipelines/marigold
title: Marigold
- local: api/pipelines/mochi
title: Mochi
- local: api/pipelines/panorama
title: MultiDiffusion
- local: api/pipelines/musicldm
32 changes: 32 additions & 0 deletions docs/source/en/api/models/autoencoderkl_mochi.md
@@ -0,0 +1,32 @@
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->

# AutoencoderKLMochi

The 3D variational autoencoder (VAE) model with KL loss used in [Mochi](https://github.com/genmoai/models) was introduced in [Mochi 1 Preview](https://huggingface.co/genmo/mochi-1-preview) by Genmo.

The model can be loaded with the following code snippet.

```python
import torch
from diffusers import AutoencoderKLMochi

vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to("cuda")
```

## AutoencoderKLMochi

[[autodoc]] AutoencoderKLMochi
- decode
- all

## DecoderOutput

[[autodoc]] models.autoencoders.vae.DecoderOutput
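
Decoding then follows the usual diffusers VAE pattern — a minimal sketch in which the latent shape is purely illustrative (the real layout depends on the VAE's temporal/spatial compression factors and latent channel count):

```python
import torch
from diffusers import AutoencoderKLMochi

vae = AutoencoderKLMochi.from_pretrained(
    "genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32
).to("cuda")

# Illustrative latents only -- not the model's actual latent layout.
latents = torch.randn(1, 12, 7, 60, 106, device="cuda", dtype=torch.float32)
with torch.no_grad():
    video = vae.decode(latents).sample  # a DecoderOutput, documented above
```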
30 changes: 30 additions & 0 deletions docs/source/en/api/models/mochi_transformer3d.md
@@ -0,0 +1,30 @@
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->

# MochiTransformer3DModel

A Diffusion Transformer model for 3D video-like data was introduced in [Mochi 1 Preview](https://huggingface.co/genmo/mochi-1-preview) by Genmo.

The model can be loaded with the following code snippet.

```python
import torch
from diffusers import MochiTransformer3DModel

transformer = MochiTransformer3DModel.from_pretrained("genmo/mochi-1-preview", subfolder="transformer", torch_dtype=torch.float16).to("cuda")
```

## MochiTransformer3DModel

[[autodoc]] MochiTransformer3DModel

## Transformer2DModelOutput

[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
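
In practice the transformer is usually consumed through the Mochi pipeline. A brief sketch of the standard diffusers component-override pattern (an assumption here, not documented Mochi-specific usage):

```python
import torch
from diffusers import MochiPipeline, MochiTransformer3DModel

transformer = MochiTransformer3DModel.from_pretrained(
    "genmo/mochi-1-preview", subfolder="transformer", torch_dtype=torch.bfloat16
)
# Reuse the separately loaded transformer when assembling the full pipeline.
pipe = MochiPipeline.from_pretrained(
    "genmo/mochi-1-preview", transformer=transformer, torch_dtype=torch.bfloat16
)
```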
36 changes: 36 additions & 0 deletions docs/source/en/api/pipelines/mochi.md
@@ -0,0 +1,36 @@
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-->

# Mochi

[Mochi 1 Preview](https://huggingface.co/genmo/mochi-1-preview) from Genmo.

*Mochi 1 preview is an open state-of-the-art video generation model with high-fidelity motion and strong prompt adherence in preliminary evaluation. This model dramatically closes the gap between closed and open video generation systems. The model is released under a permissive Apache 2.0 license.*

<Tip>

Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.

</Tip>

## MochiPipeline

[[autodoc]] MochiPipeline
- all
- __call__

## MochiPipelineOutput

[[autodoc]] pipelines.mochi.pipeline_output.MochiPipelineOutput
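
A minimal end-to-end sketch: the prompt, step count, and frame count below are placeholders, and `enable_model_cpu_offload` is assumed for GPUs with limited memory:

```python
import torch
from diffusers import MochiPipeline
from diffusers.utils import export_to_video

pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # trades speed for lower VRAM usage

frames = pipe(
    "A close-up of a chameleon walking along a branch, shallow depth of field",
    num_inference_steps=28,
    num_frames=85,
).frames[0]
export_to_video(frames, "mochi.mp4", fps=30)
```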
2 changes: 1 addition & 1 deletion docs/source/en/training/distributed_inference.md
@@ -183,7 +183,7 @@ Add the transformer model to the pipeline for denoising, but set the other model

```py
pipeline = FluxPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev", ,
"black-forest-labs/FLUX.1-dev",
text_encoder=None,
text_encoder_2=None,
tokenizer=None,
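Downstream of this snippet, the text-encoder-free pipeline is driven with prompt embeddings computed earlier, before the encoders were released — roughly as follows (argument names follow the standard Flux pipeline signature; the embeddings themselves are assumed to exist from the preceding step):

```py
# Assumes prompt_embeds / pooled_prompt_embeds were produced earlier with
# the text encoders before they were moved off-device.
image = pipeline(
    prompt_embeds=prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    num_inference_steps=28,
    guidance_scale=3.5,
).images[0]
```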
@@ -1778,15 +1778,10 @@ def load_model_hook(models, input_dir):
if not args.enable_t5_ti:
# pure textual inversion - only clip
if pure_textual_inversion:
-params_to_optimize = [
-text_parameters_one_with_lr,
-]
params_to_optimize = [text_parameters_one_with_lr]
te_idx = 0
else: # regular te training or regular pivotal for clip
-params_to_optimize = [
-transformer_parameters_with_lr,
-text_parameters_one_with_lr,
-]
params_to_optimize = [transformer_parameters_with_lr, text_parameters_one_with_lr]
te_idx = 1
elif args.enable_t5_ti:
# pivotal tuning of clip & t5
@@ -1809,9 +1804,7 @@ def load_model_hook(models, input_dir):
]
te_idx = 1
else:
-params_to_optimize = [
-transformer_parameters_with_lr,
-]
params_to_optimize = [transformer_parameters_with_lr]

# Optimizer creation
if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
@@ -1871,7 +1864,6 @@ def load_model_hook(models, input_dir):
params_to_optimize[-1]["lr"] = args.learning_rate
optimizer = optimizer_class(
params_to_optimize,
-lr=args.learning_rate,
betas=(args.adam_beta1, args.adam_beta2),
beta3=args.prodigy_beta3,
weight_decay=args.adam_weight_decay,
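The `lr=args.learning_rate` argument removed in these hunks was redundant: each entry in `params_to_optimize` already carries its own `"lr"` key, and the optimizer honors per-group learning rates. A minimal sketch, assuming the `prodigyopt` package and illustrative stand-in modules:

```python
import torch
from prodigyopt import Prodigy  # assumes the prodigyopt package is installed

transformer_head = torch.nn.Linear(16, 16)  # stand-ins for the real modules
text_encoder_head = torch.nn.Linear(16, 4)

# Each group carries its own lr, so no global lr kwarg is needed.
# Prodigy adapts step sizes itself and generally expects lr near 1.0.
params_to_optimize = [
    {"params": transformer_head.parameters(), "lr": 1.0},
    {"params": text_encoder_head.parameters(), "lr": 1.0},
]
optimizer = Prodigy(params_to_optimize, betas=(0.9, 0.99), weight_decay=1e-2)
```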
@@ -67,6 +67,7 @@
convert_state_dict_to_kohya,
is_wandb_available,
)
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
from diffusers.utils.import_utils import is_xformers_available


@@ -79,30 +80,27 @@
def save_model_card(
repo_id: str,
use_dora: bool,
-images=None,
-base_model=str,
images: list = None,
base_model: str = None,
train_text_encoder=False,
train_text_encoder_ti=False,
token_abstraction_dict=None,
-instance_prompt=str,
-validation_prompt=str,
instance_prompt=None,
validation_prompt=None,
repo_folder=None,
vae_path=None,
):
img_str = "widget:\n"
lora = "lora" if not use_dora else "dora"
-for i, image in enumerate(images):
-image.save(os.path.join(repo_folder, f"image_{i}.png"))
-img_str += f"""
-- text: '{validation_prompt if validation_prompt else ' ' }'
-output:
-url:
-"image_{i}.png"
-"""
-if not images:
-img_str += f"""
-- text: '{instance_prompt}'
-"""

widget_dict = []
if images is not None:
for i, image in enumerate(images):
image.save(os.path.join(repo_folder, f"image_{i}.png"))
widget_dict.append(
{"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
)
else:
widget_dict.append({"text": instance_prompt})
embeddings_filename = f"{repo_folder}_emb"
instance_prompt_webui = re.sub(r"<s\d+>", "", re.sub(r"<s\d+>", embeddings_filename, instance_prompt, count=1))
ti_keys = ", ".join(f'"{match}"' for match in re.findall(r"<s\d+>", instance_prompt))
@@ -137,24 +135,7 @@ def save_model_card(
trigger_str += f"""
to trigger concept `{key}` → use `{tokens}` in your prompt \n
"""

-yaml = f"""---
-tags:
-- stable-diffusion
-- stable-diffusion-diffusers
-- diffusers-training
-- text-to-image
-- diffusers
-- {lora}
-- template:sd-lora
-{img_str}
-base_model: {base_model}
-instance_prompt: {instance_prompt}
-license: openrail++
----
-"""

model_card = f"""
model_description = f"""
# SD1.5 LoRA DreamBooth - {repo_id}

<Gallery />
@@ -202,8 +183,28 @@ def save_model_card(
Special VAE used for training: {vae_path}.

"""
-with open(os.path.join(repo_folder, "README.md"), "w") as f:
-f.write(yaml + model_card)
model_card = load_or_create_model_card(
repo_id_or_path=repo_id,
from_training=True,
license="openrail++",
base_model=base_model,
prompt=instance_prompt,
model_description=model_description,
inference=True,
widget=widget_dict,
)

tags = [
"text-to-image",
"diffusers",
"diffusers-training",
lora,
"template:sd-lora" "stable-diffusion",
"stable-diffusion-diffusers",
]
model_card = populate_model_card(model_card, tags=tags)

model_card.save(os.path.join(repo_folder, "README.md"))


def import_model_class_from_model_name_or_path(
@@ -1358,10 +1359,7 @@ def load_model_hook(models, input_dir):
else args.adam_weight_decay,
"lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
}
-params_to_optimize = [
-unet_lora_parameters_with_lr,
-text_lora_parameters_one_with_lr,
-]
params_to_optimize = [unet_lora_parameters_with_lr, text_lora_parameters_one_with_lr]
else:
params_to_optimize = [unet_lora_parameters_with_lr]

@@ -1423,7 +1421,6 @@ def load_model_hook(models, input_dir):

optimizer = optimizer_class(
params_to_optimize,
-lr=args.learning_rate,
betas=(args.adam_beta1, args.adam_beta2),
beta3=args.prodigy_beta3,
weight_decay=args.adam_weight_decay,
@@ -1794,7 +1794,6 @@ def load_model_hook(models, input_dir):

optimizer = optimizer_class(
params_to_optimize,
-lr=args.learning_rate,
betas=(args.adam_beta1, args.adam_beta2),
beta3=args.prodigy_beta3,
weight_decay=args.adam_weight_decay,
1 change: 0 additions & 1 deletion examples/cogvideo/train_cogvideox_image_to_video_lora.py
@@ -947,7 +947,6 @@ def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):

optimizer = optimizer_class(
params_to_optimize,
-lr=args.learning_rate,
betas=(args.adam_beta1, args.adam_beta2),
beta3=args.prodigy_beta3,
weight_decay=args.adam_weight_decay,