From 1bd7a69b1a0e9c9d6a98790d7d31999f08892c60 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Fri, 13 Sep 2024 15:13:29 +0800
Subject: [PATCH 01/12] feat: Add github workflow

---
 .github/workflows/build.yml | 22 ++++++++++++++++++++++
 examples/run.sh             |  6 +++---
 setup.py                    |  3 +++
 3 files changed, 28 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/build.yml

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 00000000..6bd46d48
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,22 @@
+name: GitHub Actions Demo
+run-name: Pushed by ${{ github.actor }} 🚀
+on: [push]
+jobs:
+  test-xfuser:
+    name: Test xfuser
+    runs-on: [self-hosted, linux, x64]
+    strategy:
+      fail-fast: false
+      max-parallel: 5
+      matrix:
+        python-versions: [3.10]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Uninstall xfuser
+        run: pip uninstall -y xfuser
+      - name: Install xfuser
+        run: pip install -e .
+      - name: Test xfuser
+        run: sh ./examples/run.sh
+      - name: Uninstall xfuser
+        run: pip uninstall -y xfuser
diff --git a/examples/run.sh b/examples/run.sh
index ab9e8d53..5ac205a9 100644
--- a/examples/run.sh
+++ b/examples/run.sh
@@ -19,7 +19,7 @@ export PYTHONPATH=$PWD:$PYTHONPATH
 # or you can simply use the model's ID on Hugging Face,
 # which will then be downloaded to the default cache path on Hugging Face.
 
-export MODEL_TYPE="Sd3"
+export MODEL_TYPE="CogVideoX"
 # Configuration for different model types
 # script, model_id, inference_step
 declare -A MODEL_CONFIGS=(
@@ -28,7 +28,7 @@ declare -A MODEL_CONFIGS=(
     ["Sd3"]="sd3_example.py /cfs/dit/stable-diffusion-3-medium-diffusers 20"
     ["Flux"]="flux_example.py /cfs/dit/FLUX.1-schnell 4"
     ["HunyuanDiT"]="hunyuandit_example.py /mnt/models/SD/HunyuanDiT-v1.2-Diffusers 50"
-    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 1"
+    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 9"
 )
 
 if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
@@ -43,7 +43,7 @@ mkdir -p ./results
 
 for HEIGHT in 1024
 do
-for N_GPUS in 8;
+for N_GPUS in 1;
 do
 
diff --git a/setup.py b/setup.py
index bd19931d..1e106570 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,9 @@ def get_cuda_version():
         "yunchang==0.3",
         "pytest",
         "flask",
+        "protobuf",  # for SD3
+        "imageio",  # for CogVideoX
+        "imageio-ffmpeg"  # for CogVideoX
     ],
     extras_require={
         "[flash_attn]": [
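[Editor's note: in the workflow above, the unquoted matrix entry python-versions: [3.10] is read by YAML as the float 3.1, not the string "3.10", a classic GitHub Actions pitfall; later patches in this series move to 3.11, where the truncation happens not to matter. A minimal PyYAML sketch of the behavior:]

    import yaml

    # Unquoted 3.10 is a YAML float and silently drops its trailing zero;
    # quoting the version keeps it a string.
    print(yaml.safe_load("python-versions: [3.10]"))    # {'python-versions': [3.1]}
    print(yaml.safe_load('python-versions: ["3.10"]'))  # {'python-versions': ['3.10']}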
From 99f09dfd169fe1e6738a9facf9f493bc59d029a9 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Sat, 14 Sep 2024 20:27:53 +0800
Subject: [PATCH 02/12] feat: add a Docker container for GitHub Actions

---
 .github/workflows/build.yml | 19 ++++++++++++-------
 examples/run.sh             | 10 ++++------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6bd46d48..ebef5e8d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -9,14 +9,19 @@ jobs:
       fail-fast: false
       max-parallel: 5
       matrix:
-        python-versions: [3.10]
+        python-versions: [3.11]
     steps:
       - uses: actions/checkout@v4
-      - name: Uninstall xfuser
-        run: pip uninstall -y xfuser
+
+      - name: Setup docker
+        run: docker run --rm --name mycuda -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code nvidia/cuda:12.6.1-cudnn-devel-rockylinux9 /bin/bash
+      - run: docker exec mycuda dnf update -y
+      - run: docker exec mycuda dnf install git python${{matrix.python-versions}} -y
+      - run: docker exec mycuda python${{matrix.python-versions}} -m ensurepip --upgrade
+      - run: docker exec mycuda pip${{matrix.python-versions}} install packaging wheel torch
       - name: Install xfuser
-        run: pip install -e .
+        run: docker exec -w /code mycuda pip${{matrix.python-versions}} install -e .
       - name: Test xfuser
-        run: sh ./examples/run.sh
-      - name: Uninstall xfuser
-        run: pip uninstall -y xfuser
+        run: docker exec -w /code mycuda sh -c "torchrun --nproc_per_node=4 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog'"
+      - name: Destroy docker
+        run: docker stop mycuda
diff --git a/examples/run.sh b/examples/run.sh
index 5ac205a9..b305ff55 100644
--- a/examples/run.sh
+++ b/examples/run.sh
@@ -19,7 +19,7 @@ export PYTHONPATH=$PWD:$PYTHONPATH
 # or you can simply use the model's ID on Hugging Face,
 # which will then be downloaded to the default cache path on Hugging Face.
 
-export MODEL_TYPE="CogVideoX"
+export MODEL_TYPE="Sd3"
 # Configuration for different model types
 # script, model_id, inference_step
 declare -A MODEL_CONFIGS=(
@@ -28,7 +28,7 @@ declare -A MODEL_CONFIGS=(
     ["Sd3"]="sd3_example.py /cfs/dit/stable-diffusion-3-medium-diffusers 20"
     ["Flux"]="flux_example.py /cfs/dit/FLUX.1-schnell 4"
     ["HunyuanDiT"]="hunyuandit_example.py /mnt/models/SD/HunyuanDiT-v1.2-Diffusers 50"
-    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 9"
+    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 1"
 )
 
 if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
@@ -43,7 +43,7 @@ mkdir -p ./results
 
 for HEIGHT in 1024
 do
-for N_GPUS in 1;
+for N_GPUS in 8;
 do
 
@@ -94,6 +94,4 @@ $PARALLLEL_VAE \
 $COMPILE_FLAG
 
 done
-done
-
-
+done
\ No newline at end of file
From 211535ae0992b4ff0622f2c3752c4d5b0f02c987 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Sun, 15 Sep 2024 11:30:50 +0800
Subject: [PATCH 03/12] feat: pre-install dependencies in docker

---
 .github/workflows/build.yml | 13 ++++++++-----
 setup.py                    | 24 +++++++++++++-----------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ebef5e8d..6055f729 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,15 +10,18 @@ jobs:
       max-parallel: 5
       matrix:
         python-versions: [3.11]
+        torch-versions: [4.2.1]
+        include:
+          - python-versions: 3.11
+            python-versions-full: 3_11
+          - torch-versions: 4.2.1
+            torch-versions-full: 4_2_1
     steps:
       - uses: actions/checkout@v4
 
       - name: Setup docker
-        run: docker run --rm --name mycuda -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code nvidia/cuda:12.6.1-cudnn-devel-rockylinux9 /bin/bash
-      - run: docker exec mycuda dnf update -y
-      - run: docker exec mycuda dnf install git python${{matrix.python-versions}} -y
-      - run: docker exec mycuda python${{matrix.python-versions}} -m ensurepip --upgrade
-      - run: docker exec mycuda pip${{matrix.python-versions}} install packaging wheel torch
+        run: docker run --rm --name mycuda -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code xfusertest/test-py${{matrix.python-versions-full}}-torch${{matrix.torch-versions-full}} /bin/bash
+      - run: docker exec mycuda pip${{matrix.python-versions}} install torch
       - name: Install xfuser
         run: docker exec -w /code mycuda pip${{matrix.python-versions}} install -e .
       - name: Test xfuser
diff --git a/setup.py b/setup.py
index 1e106570..9d5e49bb 100644
--- a/setup.py
+++ b/setup.py
@@ -1,14 +1,18 @@
 from setuptools import find_packages, setup
 import subprocess
 
+
 def get_cuda_version():
     try:
         nvcc_version = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
-        version_line = [line for line in nvcc_version.split('\n') if "release" in line][0]
-        cuda_version = version_line.split(' ')[-2].replace(',', '')
-        return 'cu' + cuda_version.replace('.', '')
+        version_line = [line for line in nvcc_version.split("\n") if "release" in line][
+            0
+        ]
+        cuda_version = version_line.split(" ")[-2].replace(",", "")
+        return "cu" + cuda_version.replace(".", "")
     except Exception as e:
-        return 'no_cuda'
+        return "no_cuda"
+
 
 if __name__ == "__main__":
     with open("README.md", "r") as f:
@@ -22,19 +26,17 @@ def get_cuda_version():
         author_email="fangjiarui123@gmail.com",
         packages=find_packages(),
         install_requires=[
-            "torch>=2.3.0",
-            "accelerate==0.33.0",
-            "diffusers==0.30.2",
+            "torch>=2.1.0",
+            "accelerate>=0.33.0",
+            "diffusers @ git+https://github.com/huggingface/diffusers.git",
             "transformers>=4.39.1",
             "sentencepiece>=0.1.99",
             "beautifulsoup4>=4.12.3",
             "distvae",
-            "yunchang==0.3",
+            "yunchang>=0.3.0",
             "pytest",
             "flask",
-            "protobuf",  # for SD3
-            "imageio",  # for CogVideoX
-            "imageio-ffmpeg"  # for CogVideoX
+            "opencv-python",
         ],
         extras_require={
             "[flash_attn]": [
From 6c3ae751e3515cbf8f092fba76c29e37a33092b7 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 16 Sep 2024 10:54:42 +0800
Subject: [PATCH 04/12] fix: fix typos and rename containers

---
 .github/workflows/build.yml | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6055f729..9979d7ec 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,21 +10,20 @@ jobs:
       max-parallel: 5
       matrix:
         python-versions: [3.11]
-        torch-versions: [4.2.1]
+        torch-versions: [2.4.1]
         include:
           - python-versions: 3.11
             python-versions-full: 3_11
-          - torch-versions: 4.2.1
-            torch-versions-full: 4_2_1
+          - torch-versions: 2.4.1
+            torch-versions-full: 2_4_1
     steps:
       - uses: actions/checkout@v4
 
       - name: Setup docker
-        run: docker run --rm --name mycuda -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code xfusertest/test-py${{matrix.python-versions-full}}-torch${{matrix.torch-versions-full}} /bin/bash
-      - run: docker exec mycuda pip${{matrix.python-versions}} install torch
+        run: docker run --rm --name xfuser_test_docker -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code xfuser_cicd/test-py_${{matrix.python-versions-full}}-torch_${{matrix.torch-versions-full}} /bin/bash
       - name: Install xfuser
-        run: docker exec -w /code mycuda pip${{matrix.python-versions}} install -e .
+        run: docker exec -w /code xfuser_test_docker pip${{matrix.python-versions}} install -e .
       - name: Test xfuser
-        run: docker exec -w /code mycuda sh -c "torchrun --nproc_per_node=4 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog'"
+        run: docker exec -w /code xfuser_test_docker sh -c "torchrun --nproc_per_node=4 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog'"
       - name: Destroy docker
-        run: docker stop mycuda
+        run: docker stop xfuser_test_docker
From 97927009ce4d125f46e4353b9a6bca654833e191 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Wed, 18 Sep 2024 16:05:04 +0800
Subject: [PATCH 05/12] fix: change a runner machine

---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9979d7ec..0b822441 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -20,10 +20,10 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: Setup docker
-        run: docker run --rm --name xfuser_test_docker -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code xfuser_cicd/test-py_${{matrix.python-versions-full}}-torch_${{matrix.torch-versions-full}} /bin/bash
+        run: docker run --rm --name xfuser_test_docker -d -i -t --runtime=nvidia --gpus all -v ~/actions-runner/_work/xDiT/xDiT:/code xfuser_cicd/test-py_${{matrix.python-versions-full}}-torch_${{matrix.torch-versions-full}} /bin/bash
       - name: Install xfuser
         run: docker exec -w /code xfuser_test_docker pip${{matrix.python-versions}} install -e .
       - name: Test xfuser
-        run: docker exec -w /code xfuser_test_docker sh -c "torchrun --nproc_per_node=4 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog'"
+        run: docker exec -w /code xfuser_test_docker sh -c "torchrun --nproc_per_node=8 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog' --use_cfg_parallel"
       - name: Destroy docker
         run: docker stop xfuser_test_docker
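[Editor's note: the test now launches 8 ranks because xDiT expects the parallel degrees to multiply up to the world size; here pipefusion (2) x ulysses (2) x ring (1) x cfg (2 when --use_cfg_parallel is set) = 8. A minimal sanity check of that constraint; the helper and argument names are illustrative, not part of the codebase:]

    def check_world_size(nproc_per_node: int, pipefusion: int, ulysses: int,
                         ring: int, use_cfg_parallel: bool) -> None:
        # Classifier-free-guidance parallelism doubles the rank count when enabled.
        cfg = 2 if use_cfg_parallel else 1
        expected = pipefusion * ulysses * ring * cfg
        assert nproc_per_node == expected, (
            f"torchrun --nproc_per_node={nproc_per_node} does not match "
            f"pipefusion * ulysses * ring * cfg = {expected}"
        )

    # The configuration used by the workflow above:
    check_world_size(8, pipefusion=2, ulysses=2, ring=1, use_cfg_parallel=True)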
From e2a1f6c606300caf41ada90d0bd62d92767d8a6a Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Fri, 20 Sep 2024 21:14:18 +0800
Subject: [PATCH 06/12] fix: use xFuserJointLongContextAttention as the
 underlying SP method for CogVideoX

---
 .../long_ctx_attention/hybrid/attn_layer.py | 17 ++++++++++-
 .../layers/attention_processor.py           | 30 +++++++++++++++----
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/xfuser/core/long_ctx_attention/hybrid/attn_layer.py b/xfuser/core/long_ctx_attention/hybrid/attn_layer.py
index 60d71264..eb467ba0 100644
--- a/xfuser/core/long_ctx_attention/hybrid/attn_layer.py
+++ b/xfuser/core/long_ctx_attention/hybrid/attn_layer.py
@@ -146,7 +146,22 @@ def forward(
     ):
         # 3 X (bs, seq_len/N, head_cnt, head_size) -> 3 X (bs, seq_len, head_cnt/N, head_size)
         # scatter 2, gather 1
-        query = torch.cat([query, joint_tensor_query], dim=1)
+        supported_joint_strategy = ["none", "front", "rear"]
+        if joint_strategy not in supported_joint_strategy:
+            raise ValueError(
+                f"joint_strategy: {joint_strategy} not supported. Supported joint strategies: {supported_joint_strategy}"
+            )
+        elif joint_strategy != "none" and joint_tensor_query is None:
+            raise ValueError(
+                "joint_tensor_query must not be None when joint_strategy is not 'none'"
+            )
+        elif joint_strategy == "rear":
+            query = torch.cat([query, joint_tensor_query], dim=1)
+        elif joint_strategy == "front":
+            query = torch.cat([joint_tensor_query, query], dim=1)
+        else:
+            pass
+
         ulysses_world_size = torch.distributed.get_world_size(self.ulysses_pg)
         ulysses_rank = torch.distributed.get_rank(self.ulysses_pg)
         attn_heads_per_ulysses_rank = joint_tensor_key.shape[-2] // ulysses_world_size
diff --git a/xfuser/model_executor/layers/attention_processor.py b/xfuser/model_executor/layers/attention_processor.py
index 6a81f390..d23d6a0b 100644
--- a/xfuser/model_executor/layers/attention_processor.py
+++ b/xfuser/model_executor/layers/attention_processor.py
@@ -121,7 +121,7 @@ def __init__(
         assert (to_k.bias is None) == (to_v.bias is None)
         assert to_k.weight.shape == to_v.weight.shape
 
-        in_size, out_size = to_k.in_features, to_k.out_features
+        '''in_size, out_size = to_k.in_features, to_k.out_features
         to_kv = nn.Linear(
             in_size,
             out_size * 2,
             bias=to_k.bias is not None,
             device=to_k.weight.device,
             dtype=to_k.weight.dtype,
         )
@@ -137,7 +137,7 @@ def __init__(
             to_kv.bias.data[:out_size].copy_(to_k.bias.data)
             to_kv.bias.data[out_size:].copy_(to_v.bias.data)
 
-        self.to_kv = to_kv
+        self.to_kv = to_kv'''
@@ -1013,12 +1013,12 @@ def __init__(self):
             )
         if HAS_LONG_CTX_ATTN and get_sequence_parallel_world_size() > 1:
             from xfuser.core.long_ctx_attention import (
-                xFuserLongContextAttention,
+                xFuserJointLongContextAttention,
                 xFuserUlyssesAttention,
             )
 
             if HAS_FLASH_ATTN:
-                self.hybrid_seq_parallel_attn = xFuserLongContextAttention(
+                self.hybrid_seq_parallel_attn = xFuserJointLongContextAttention(
                     use_kv_cache=self.use_long_ctx_attn_kvcache
                 )
             else:
@@ -1040,6 +1040,7 @@ def __call__(
         **kwargs,
     ) -> torch.Tensor:
         text_seq_length = encoder_hidden_states.size(1)
+        latent_seq_length = hidden_states.size(1)
 
         hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
 
@@ -1095,9 +1096,20 @@ def __call__(
         #! ---------------------------------------- ATTENTION ----------------------------------------
         if HAS_LONG_CTX_ATTN and get_sequence_parallel_world_size() > 1:
+            encoder_query = query[:, :, :text_seq_length, :]
+            query = query[:, :, text_seq_length:, :]
+            encoder_key = key[:, :, :text_seq_length, :]
+            key = key[:, :, text_seq_length:, :]
+            encoder_value = value[:, :, :text_seq_length, :]
+            value = value[:, :, text_seq_length:, :]
+
             query = query.transpose(1, 2)
             key = key.transpose(1, 2)
             value = value.transpose(1, 2)
+            encoder_query = encoder_query.transpose(1, 2)
+            encoder_key = encoder_key.transpose(1, 2)
+            encoder_value = encoder_value.transpose(1, 2)
+
             hidden_states = self.hybrid_seq_parallel_attn(
                 attn,
                 query,
                 key,
                 value,
                 dropout_p=0.0,
                 causal=False,
-                joint_strategy="none",
+                joint_tensor_query=encoder_query,
+                joint_tensor_key=encoder_key,
+                joint_tensor_value=encoder_value,
+                joint_strategy="front",
             )
+
             hidden_states = hidden_states.reshape(
                 batch_size, -1, attn.heads * head_dim
             )
@@ -1141,12 +1157,14 @@ def __call__(
             # hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
 
         #! ---------------------------------------- ATTENTION ----------------------------------------
+        assert text_seq_length + latent_seq_length == hidden_states.shape[1]
         # linear proj
         hidden_states = attn.to_out[0](hidden_states)
         # dropout
         hidden_states = attn.to_out[1](hidden_states)
+
         encoder_hidden_states, hidden_states = hidden_states.split(
-            [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+            [text_seq_length, latent_seq_length], dim=1
         )
 
         return hidden_states, encoder_hidden_states
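[Editor's note: the patch above stops pre-concatenating the text tokens into the query and instead passes them through the joint_tensor_* arguments, so only the latent tokens stay sharded across sequence-parallel ranks while the replicated text tokens are attached per the chosen strategy. A single-process sketch of the "front" concatenation it selects for CogVideoX; the function name is illustrative:]

    import torch

    def join_front(query: torch.Tensor, joint_tensor_query: torch.Tensor) -> torch.Tensor:
        # "front": the replicated text tokens are prepended to this rank's
        # latent shard along the sequence dimension (dim=1), matching the
        # (bs, seq_len, head_cnt, head_size) layout noted in attn_layer.py.
        return torch.cat([joint_tensor_query, query], dim=1)

    latent_shard = torch.randn(1, 16, 8, 64)  # local slice of the latent sequence
    text_tokens = torch.randn(1, 4, 8, 64)    # full text sequence, same on every rank
    assert join_front(latent_shard, text_tokens).shape == (1, 20, 8, 64)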
From 4e283109c7791fcd8dc7a369b5e96d5e35d1563f Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 23 Sep 2024 09:56:24 +0800
Subject: [PATCH 07/12] fix: update the comment format and update github
 actions

---
 .github/workflows/build.yml       | 53 ++++++++++++-------
 .../layers/attention_processor.py | 20 -------
 2 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0b822441..0a220142 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,29 +1,42 @@
 name: GitHub Actions Demo
 run-name: Pushed by ${{ github.actor }} 🚀
-on: [push]
+on: [push, pull_request_target]
 jobs:
-  test-xfuser:
-    name: Test xfuser
+  upload:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Run a multi-line script
+        run: git archive --format zip HEAD > xDiT.zip
+      - name: copy file via ssh password
+        uses: appleboy/scp-action@v0.1.7
+        with:
+          host: ${{ secrets.SSH_HOST }}
+          port: ${{ secrets.SSH_PORT }}
+          username: ${{ secrets.SSH_USER }}
+          password: ${{ secrets.SSH_KEY }}
+          source: "xDiT.zip"
+          target: "~/"
+  setup-env-and-test:
+    needs: upload
     runs-on: [self-hosted, linux, x64]
-    strategy:
-      fail-fast: false
-      max-parallel: 5
-      matrix:
-        python-versions: [3.11]
-        torch-versions: [2.4.1]
-        include:
-          - python-versions: 3.11
-            python-versions-full: 3_11
-          - torch-versions: 2.4.1
-            torch-versions-full: 2_4_1
+    continue-on-error: true
     steps:
-      - uses: actions/checkout@v4
-
+      - name: unzip
+        run: rm -rf ~/xDiT
+      - run: mkdir ~/xDiT
+      - run: unzip ~/xDiT.zip -d ~/xDiT
       - name: Setup docker
-        run: docker run --rm --name xfuser_test_docker -d -i -t --runtime=nvidia --gpus all -v ~/actions-runner/_work/xDiT/xDiT:/code xfuser_cicd/test-py_${{matrix.python-versions-full}}-torch_${{matrix.torch-versions-full}} /bin/bash
+        run: docker run --rm --name xfuser_test_docker_${{github.repository_owner_id}} -d -i -t --runtime=nvidia --gpus all -v /cfs:/cfs -v /mnt:/mnt -v ~/xDiT:/code xfuser_cicd/test-py_3_11-torch_2_4_1 /bin/bash
       - name: Install xfuser
-        run: docker exec -w /code xfuser_test_docker pip${{matrix.python-versions}} install -e .
+        run: docker exec -w /code xfuser_test_docker_${{github.repository_owner_id}} pip3.11 install -e .
       - name: Test xfuser
-        run: docker exec -w /code xfuser_test_docker sh -c "torchrun --nproc_per_node=8 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog' --use_cfg_parallel"
+        run: docker exec -w /code xfuser_test_docker_${{github.repository_owner_id}} sh -c "torchrun --nproc_per_node=8 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog' --use_cfg_parallel"
+  clear-env:
+    needs: setup-env-and-test
+    runs-on: [self-hosted, linux, x64]
+    steps:
+      - name: Remove Files
+        run: docker exec -w /code xfuser_test_docker_${{github.repository_owner_id}} sh -c "rm -r *"
       - name: Destroy docker
-        run: docker stop xfuser_test_docker
+        run: docker stop xfuser_test_docker_${{github.repository_owner_id}}
diff --git a/xfuser/model_executor/layers/attention_processor.py b/xfuser/model_executor/layers/attention_processor.py
index d23d6a0b..cb08f953 100644
--- a/xfuser/model_executor/layers/attention_processor.py
+++ b/xfuser/model_executor/layers/attention_processor.py
@@ -121,25 +121,6 @@ def __init__(
         assert (to_k.bias is None) == (to_v.bias is None)
         assert to_k.weight.shape == to_v.weight.shape
 
-        '''in_size, out_size = to_k.in_features, to_k.out_features
-        to_kv = nn.Linear(
-            in_size,
-            out_size * 2,
-            bias=to_k.bias is not None,
-            device=to_k.weight.device,
-            dtype=to_k.weight.dtype,
-        )
-        to_kv.weight.data[:out_size].copy_(to_k.weight.data)
-        to_kv.weight.data[out_size:].copy_(to_v.weight.data)
-
-        if to_k.bias is not None:
-            assert to_v.bias is not None
-            to_kv.bias.data[:out_size].copy_(to_k.bias.data)
-            to_kv.bias.data[out_size:].copy_(to_v.bias.data)
-
-        self.to_kv = to_kv'''
-
-
 class xFuserAttentionProcessorRegister:
     _XFUSER_ATTENTION_PROCESSOR_MAPPING = {}
@@ -878,7 +859,6 @@ def __call__(
                 encoder_hidden_states
             )
 
-        # kv = attn.to_kv(encoder_hidden_states)
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)
From afea6decdb663bd06b8489fd5b18289188c55907 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 23 Sep 2024 16:08:31 +0800
Subject: [PATCH 08/12] fix: patch embedding for CogVideoX

---
 examples/run.sh                              |  2 +-
 xfuser/model_executor/layers/embeddings.py   | 60 +++++++++++++------
 .../transformers/cogvideox_transformer_3d.py |  2 +-
 3 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/examples/run.sh b/examples/run.sh
index ca96f394..e71744bd 100644
--- a/examples/run.sh
+++ b/examples/run.sh
@@ -28,7 +28,7 @@ declare -A MODEL_CONFIGS=(
     ["Sd3"]="sd3_example.py /cfs/dit/stable-diffusion-3-medium-diffusers 20"
     ["Flux"]="flux_example.py /cfs/dit/FLUX.1-schnell 4"
     ["HunyuanDiT"]="hunyuandit_example.py /mnt/models/SD/HunyuanDiT-v1.2-Diffusers 50"
-    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 1"
+    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 20"
 )
 
 if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
diff --git a/xfuser/model_executor/layers/embeddings.py b/xfuser/model_executor/layers/embeddings.py
index 6f6d67dc..5d8e05ac 100644
--- a/xfuser/model_executor/layers/embeddings.py
+++ b/xfuser/model_executor/layers/embeddings.py
@@ -115,6 +115,7 @@ def __init__(
         super().__init__(
             module=patch_embedding,
         )
+        self.module: CogVideoXPatchEmbed
 
     def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
         r"""
@@ -125,37 +126,60 @@ def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
         Args:
             text_embeds (`torch.Tensor`):
                 Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
            image_embeds (`torch.Tensor`):
                 Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
         """
+        sum_height = (
+            get_runtime_state().input_config.height
+            // get_runtime_state().vae_scale_factor_spatial
+        )
         text_embeds = self.text_proj(text_embeds)
 
         batch, num_frames, channels, height, width = image_embeds.shape
         image_embeds = image_embeds.view(batch, num_frames, *image_embeds.shape[1:])
         image_embeds = image_embeds.flatten(3).transpose(2, 3)  # [batch, num_frames, height x width, channels]
         image_embeds = image_embeds.flatten(1, 2)  # [batch, num_frames x height x width, channels]
-
-        if get_runtime_state().patch_mode:
-            start, end = get_runtime_state().pp_patches_token_start_end_idx_global[
-                get_runtime_state().pipeline_patch_idx
-            ]
-            image_embeds = image_embeds[
-                :,
-                start:end,
-                :,
-            ]
-        else:
-            image_embeds_list = [
-                image_embeds[
+
+        if self.use_positional_embeddings or self.use_learned_positional_embeddings:
+            if self.use_learned_positional_embeddings and (self.sample_width != width or self.sample_height != sum_height):
+                raise ValueError(
+                    "It is currently not possible to generate videos at a different resolution than the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'. "
+                    "If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues."
+                )
+
+            pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
+
+            if (
+                self.sample_height != sum_height
+                or self.sample_width != width
+                or self.sample_frames != pre_time_compression_frames
+            ):
+                pos_embedding = self._get_positional_embeddings(sum_height, width, pre_time_compression_frames)
+                pos_embedding = pos_embedding.to(image_embeds.device, dtype=image_embeds.dtype)
+            else:
+                pos_embedding = self.pos_embedding
+
+            # extract the image part of the positional embedding
+            pos_embedding = pos_embedding[:, self.max_text_seq_length :]
+
+            # slice the positional embedding
+            post_patch_height = sum_height // self.patch_size
+            post_patch_width = width // self.patch_size
+            post_time_compression_frames = (pre_time_compression_frames - 1) // self.temporal_compression_ratio + 1
+
+            pos_embed_list = [
+                pos_embedding[
                     :,
-                    get_runtime_state()
-                    .pp_patches_token_start_end_idx_global[i][0] : get_runtime_state()
-                    .pp_patches_token_start_end_idx_global[i][1],
+                    post_patch_height * post_patch_width * i + get_runtime_state().pp_patches_token_start_end_idx_global[0][0]:
+                    post_patch_height * post_patch_width * i + get_runtime_state().pp_patches_token_start_end_idx_global[0][1],
                     :,
                 ]
-                for i in range(get_runtime_state().num_pipeline_patch)
+                for i in range(post_time_compression_frames)
             ]
-            image_embeds = torch.cat(image_embeds_list, dim=1)
+            pos_embedding = torch.cat(pos_embed_list, dim=1)
+
+            image_embeds = image_embeds + pos_embedding
+
         embeds = torch.cat(
             [text_embeds, image_embeds], dim=1
         ).contiguous()  # [batch, seq_length + num_frames x height x width, channels]
diff --git a/xfuser/model_executor/models/transformers/cogvideox_transformer_3d.py b/xfuser/model_executor/models/transformers/cogvideox_transformer_3d.py
index 48793152..51a74a66 100644
--- a/xfuser/model_executor/models/transformers/cogvideox_transformer_3d.py
+++ b/xfuser/model_executor/models/transformers/cogvideox_transformer_3d.py
@@ -41,7 +41,7 @@ def __init__(
     ):
         super().__init__(
             transformer=transformer,
-            submodule_classes_to_wrap=[nn.Conv2d],
+            submodule_classes_to_wrap=[nn.Conv2d, CogVideoXPatchEmbed],
             submodule_name_to_wrap=["attn1"]
         )
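[Editor's note: the rewritten forward keeps diffusers' positional-embedding logic but adds only the slice of the image positional embedding that matches this rank's height patches, repeating the same per-frame token window for every compressed frame. A simplified, self-contained sketch of that indexing; the function name is illustrative:]

    import torch

    def slice_frame_pos_embedding(pos_embedding: torch.Tensor, tokens_per_frame: int,
                                  num_frames: int, start: int, end: int) -> torch.Tensor:
        # pos_embedding: (1, num_frames * tokens_per_frame, dim), image part only.
        # (start, end) is this rank's token window within one frame, playing the
        # role of pp_patches_token_start_end_idx_global[0] in the patch above.
        slices = [
            pos_embedding[:, i * tokens_per_frame + start : i * tokens_per_frame + end, :]
            for i in range(num_frames)
        ]
        return torch.cat(slices, dim=1)

    pos = torch.arange(12, dtype=torch.float32).reshape(1, 12, 1)  # 2 frames x 6 tokens
    out = slice_frame_pos_embedding(pos, tokens_per_frame=6, num_frames=2, start=2, end=4)
    assert out.squeeze().tolist() == [2.0, 3.0, 8.0, 9.0]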
From 62a3ed3ee62c18c8cc0dd4f50462b9e7cffc1636 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 23 Sep 2024 20:29:00 +0800
Subject: [PATCH 09/12] feat: add CFG support to CogVideoX

---
 .../pipelines/pipeline_cogvideox.py | 35 ++++++++-----------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/xfuser/model_executor/pipelines/pipeline_cogvideox.py b/xfuser/model_executor/pipelines/pipeline_cogvideox.py
index 4ca79c07..4cb2be6f 100644
--- a/xfuser/model_executor/pipelines/pipeline_cogvideox.py
+++ b/xfuser/model_executor/pipelines/pipeline_cogvideox.py
@@ -226,8 +226,7 @@ def __call__(
             max_sequence_length=max_sequence_length,
             device=device,
         )
-        if do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+        prompt_embeds = self._process_cfg_split_batch_latte(prompt_embeds, negative_prompt_embeds)
 
         # 4. Prepare timesteps
         timesteps, num_inference_steps = retrieve_timesteps(
@@ -272,9 +271,11 @@ def __call__(
                 if self.interrupt:
                     continue
 
-                latent_model_input = (
-                    torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                )
+                if do_classifier_free_guidance:
+                    latent_model_input = torch.cat(
+                        [latents] * (2 // get_classifier_free_guidance_world_size())
+                    )
+
                 latent_model_input = self.scheduler.scale_model_input(
                     latent_model_input, t
                 )
@@ -295,21 +296,15 @@ def __call__(
                 # perform guidance
                 if use_dynamic_cfg:
                     self._guidance_scale = 1 + guidance_scale * (
-                        (
-                            1
-                            - math.cos(
-                                math.pi
-                                * (
-                                    (num_inference_steps - t.item())
-                                    / num_inference_steps
-                                )
-                                ** 5.0
-                            )
-                        )
-                        / 2
+                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
                     )
                 if do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    if get_classifier_free_guidance_world_size() == 1:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    elif get_classifier_free_guidance_world_size() == 2:
+                        noise_pred_uncond, noise_pred_text = get_cfg_group().all_gather(
+                            noise_pred, separate_tensors=True
+                        )
                     noise_pred = noise_pred_uncond + self.guidance_scale * (
                         noise_pred_text - noise_pred_uncond
                     )
@@ -344,9 +339,7 @@ def __call__(
                         "negative_prompt_embeds", negative_prompt_embeds
                     )
 
-                if i == len(timesteps) - 1 or (
-                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
-                ):
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
 
             if get_sequence_parallel_world_size() > 1:
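[Editor's note: with CFG parallelism the unconditional and text-conditioned branches are denoised on different ranks, so the two predictions must be brought together (the all_gather above) before applying the usual guidance formula. A single-process sketch of the combine step, with the distributed gather stubbed out; the helper name is illustrative:]

    import torch

    def apply_cfg(noise_pred_uncond: torch.Tensor, noise_pred_text: torch.Tensor,
                  guidance_scale: float) -> torch.Tensor:
        # Standard classifier-free guidance: extrapolate from the unconditional
        # prediction toward the text-conditioned one.
        return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    # In the patch these two tensors come from noise_pred.chunk(2) (cfg degree 1) or
    # get_cfg_group().all_gather(noise_pred, separate_tensors=True) (cfg degree 2).
    uncond, text = torch.zeros(1, 4), torch.ones(1, 4)
    assert torch.allclose(apply_cfg(uncond, text, guidance_scale=6.0),
                          torch.full((1, 4), 6.0))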
From 4ab5d3668063a0ea4b10100802aa007fa55d40e8 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 14 Oct 2024 11:26:01 +0800
Subject: [PATCH 10/12] support VAE tiling/slicing and sequential/model
 offloading in CogVideoX

---
 examples/cogvideox_example.py | 12 +++++++++++-
 examples/run_cogvideo.sh      |  2 ++
 xfuser/config/args.py         | 15 +++++++++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/examples/cogvideox_example.py b/examples/cogvideox_example.py
index 86abfa7d..4ee43bf5 100644
--- a/examples/cogvideox_example.py
+++ b/examples/cogvideox_example.py
@@ -1,3 +1,4 @@
+import logging
 import time
 import torch
 import torch.distributed
@@ -35,12 +36,21 @@ def main():
         torch_dtype=torch.bfloat16,
     )
     if args.enable_sequential_cpu_offload:
+        pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+        logging.info(f"rank {local_rank} sequential CPU offload enabled")
+    elif args.enable_model_cpu_offload:
         pipe.enable_model_cpu_offload(gpu_id=local_rank)
-        pipe.vae.enable_tiling()
+        logging.info(f"rank {local_rank} model CPU offload enabled")
     else:
         device = torch.device(f"cuda:{local_rank}")
         pipe = pipe.to(device)
 
+    if args.enable_tiling:
+        pipe.vae.enable_tiling()
+
+    if args.enable_slicing:
+        pipe.vae.enable_slicing()
+
     torch.cuda.reset_peak_memory_stats()
     start_time = time.time()
diff --git a/examples/run_cogvideo.sh b/examples/run_cogvideo.sh
index 4b7a771d..0e46894e 100644
--- a/examples/run_cogvideo.sh
+++ b/examples/run_cogvideo.sh
@@ -22,6 +22,7 @@ CFG_ARGS="--use_cfg_parallel"
 # PIPEFUSION_ARGS="--num_pipeline_patch 8"
 # OUTPUT_ARGS="--output_type latent"
 # PARALLLEL_VAE="--use_parallel_vae"
+ENABLE_TILING="--enable_tiling"
 # COMPILE_FLAG="--use_torch_compile"
 
 torchrun --nproc_per_node=$N_GPUS ./examples/$SCRIPT \
@@ -35,4 +36,5 @@ $OUTPUT_ARGS \
 --prompt "A small dog" \
 $CFG_ARGS \
 $PARALLLEL_VAE \
+$ENABLE_TILING \
 $COMPILE_FLAG
\ No newline at end of file
diff --git a/xfuser/config/args.py b/xfuser/config/args.py
index 2ef59a02..88c39f6a 100644
--- a/xfuser/config/args.py
+++ b/xfuser/config/args.py
@@ -247,6 +247,21 @@ def add_cli_args(parser: FlexibleArgumentParser):
             action="store_true",
             help="Offloading the weights to the CPU.",
         )
+        runtime_group.add_argument(
+            "--enable_model_cpu_offload",
+            action="store_true",
+            help="Offloading the weights to the CPU.",
+        )
+        runtime_group.add_argument(
+            "--enable_tiling",
+            action="store_true",
+            help="Making VAE decode a tile at a time to save GPU memory.",
+        )
+        runtime_group.add_argument(
+            "--enable_slicing",
+            action="store_true",
+            help="Making VAE decode one slice at a time to save GPU memory.",
+        )
 
         # DiTFastAttn arguments
         fast_attn_group = parser.add_argument_group("DiTFastAttn Options")
From a29817537a3b5d051209a555f0e38cde26194e75 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 14 Oct 2024 19:08:30 +0800
Subject: [PATCH 11/12] fix xFuserArgs members

---
 xfuser/config/args.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/xfuser/config/args.py b/xfuser/config/args.py
index 88c39f6a..d3c5d96f 100644
--- a/xfuser/config/args.py
+++ b/xfuser/config/args.py
@@ -94,7 +94,10 @@ class xFuserArgs:
     no_use_resolution_binning: bool = False
     seed: int = 42
     output_type: str = "pil"
+    enable_model_cpu_offload: bool = False
     enable_sequential_cpu_offload: bool = False
+    enable_tiling: bool = False
+    enable_slicing: bool = False
     # DiTFastAttn arguments
     use_fast_attn: bool = False
     n_calib: int = 8
From 70fdd6320aa9694680c9eb6dcda31dba512b31c0 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Fri, 18 Oct 2024 20:01:18 +0800
Subject: [PATCH 12/12] fix bugs in allgather

---
 examples/run_cogvideo.sh            |  6 +++---
 .../pipelines/pipeline_cogvideox.py | 17 +----------------
 2 files changed, 4 insertions(+), 19 deletions(-)

diff --git a/examples/run_cogvideo.sh b/examples/run_cogvideo.sh
index 0e46894e..2b10caa9 100644
--- a/examples/run_cogvideo.sh
+++ b/examples/run_cogvideo.sh
@@ -14,9 +14,9 @@ mkdir -p ./results
 TASK_ARGS="--height 480 --width 720 --num_frames 9"
 
 # CogVideoX parallel configuration
-N_GPUS=4
-PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 1"
-CFG_ARGS="--use_cfg_parallel"
+N_GPUS=6
+PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 3"
+#CFG_ARGS="--use_cfg_parallel"
 
 # Uncomment and modify these as needed
 # PIPEFUSION_ARGS="--num_pipeline_patch 8"
 # OUTPUT_ARGS="--output_type latent"
 # PARALLLEL_VAE="--use_parallel_vae"
diff --git a/xfuser/model_executor/pipelines/pipeline_cogvideox.py b/xfuser/model_executor/pipelines/pipeline_cogvideox.py
index 4cb2be6f..5d04352c 100644
--- a/xfuser/model_executor/pipelines/pipeline_cogvideox.py
+++ b/xfuser/model_executor/pipelines/pipeline_cogvideox.py
@@ -343,22 +343,7 @@ def __call__(
                     progress_bar.update()
 
             if get_sequence_parallel_world_size() > 1:
-                sp_degree = get_sequence_parallel_world_size()
-                sp_latents_list = get_sp_group().all_gather(latents, separate_tensors=True)
-                latents_list = []
-                for pp_patch_idx in range(get_runtime_state().num_pipeline_patch):
-                    latents_list += [
-                        sp_latents_list[sp_patch_idx][
-                            :,
-                            :,
-                            get_runtime_state()
-                            .pp_patches_start_idx_local[pp_patch_idx] : get_runtime_state()
-                            .pp_patches_start_idx_local[pp_patch_idx + 1],
-                            :,
-                        ]
-                        for sp_patch_idx in range(sp_degree)
-                    ]
-                latents = torch.cat(latents_list, dim=-2)
+                latents = get_sp_group().all_gather(latents, dim=-2)
 
             if is_dp_last_group():
                 if not (output_type == "latents" or output_type == "latent"):
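[Editor's note: the one-line replacement above assumes the sequence-parallel group's all_gather accepts a dim argument, gathering each rank's latent shard and concatenating along the sequence axis. A minimal torch.distributed equivalent of that operation, assuming equally sized shards per rank; the helper name is illustrative:]

    import torch
    import torch.distributed as dist

    def all_gather_along_dim(local: torch.Tensor, group=None, dim: int = -2) -> torch.Tensor:
        # Collect every rank's shard, then concatenate along `dim` so the result
        # matches the unsharded tensor.
        world_size = dist.get_world_size(group=group)
        shards = [torch.empty_like(local) for _ in range(world_size)]
        dist.all_gather(shards, local.contiguous(), group=group)
        return torch.cat(shards, dim=dim)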