From 1bd7a69b1a0e9c9d6a98790d7d31999f08892c60 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Fri, 13 Sep 2024 15:13:29 +0800
Subject: [PATCH 01/12] feat: Add github workflow

---
 .github/workflows/build.yml | 22 ++++++++++++++++++++++
 examples/run.sh             |  6 +++---
 setup.py                    |  3 +++
 3 files changed, 28 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/build.yml

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 00000000..6bd46d48
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,22 @@
+name: GitHub Actions Demo
+run-name: Pushed by ${{ github.actor }} 🚀
+on: [push]
+jobs:
+  test-xfuser:
+    name: Test xfuser
+    runs-on: [self-hosted, linux, x64]
+    strategy:
+      fail-fast: false
+      max-parallel: 5
+      matrix:
+        python-versions: [3.10]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Uninstall xfuser
+        run: pip uninstall -y xfuser
+      - name: Install xfuser
+        run: pip install -e .
+      - name: Test xfuser
+        run: sh ./examples/run.sh
+      - name: Uninstall xfuser
+        run: pip uninstall -y xfuser
diff --git a/examples/run.sh b/examples/run.sh
index ab9e8d53..5ac205a9 100644
--- a/examples/run.sh
+++ b/examples/run.sh
@@ -19,7 +19,7 @@ export PYTHONPATH=$PWD:$PYTHONPATH
 # or you can simply use the model's ID on Hugging Face,
 # which will then be downloaded to the default cache path on Hugging Face.
 
-export MODEL_TYPE="Sd3"
+export MODEL_TYPE="CogVideoX"
 # Configuration for different model types
 # script, model_id, inference_step
 declare -A MODEL_CONFIGS=(
@@ -28,7 +28,7 @@ declare -A MODEL_CONFIGS=(
     ["Sd3"]="sd3_example.py /cfs/dit/stable-diffusion-3-medium-diffusers 20"
     ["Flux"]="flux_example.py /cfs/dit/FLUX.1-schnell 4"
     ["HunyuanDiT"]="hunyuandit_example.py /mnt/models/SD/HunyuanDiT-v1.2-Diffusers 50"
-    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 1"
+    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 9"
 )
 
 if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
@@ -43,7 +43,7 @@ mkdir -p ./results
 
 for HEIGHT in 1024
 do
-for N_GPUS in 8;
+for N_GPUS in 1;
 do
 
diff --git a/setup.py b/setup.py
index bd19931d..1e106570 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,9 @@ def get_cuda_version():
         "yunchang==0.3",
         "pytest",
         "flask",
+        "protobuf",  # for SD3
+        "imageio",  # for CogVideoX
+        "imageio-ffmpeg"  # for CogVideoX
     ],
     extras_require={
         "[flash_attn]": [
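[Editor's note: in the workflow above, the unquoted matrix entry python-versions: [3.10] is read by YAML as the float 3.1, not the string "3.10", a classic GitHub Actions pitfall; later patches in this series move to 3.11, where the truncation happens not to matter. A minimal PyYAML sketch of the behavior:]

    import yaml

    # Unquoted 3.10 is a YAML float and silently drops its trailing zero;
    # quoting the version keeps it a string.
    print(yaml.safe_load("python-versions: [3.10]"))    # {'python-versions': [3.1]}
    print(yaml.safe_load('python-versions: ["3.10"]'))  # {'python-versions': ['3.10']}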
From 99f09dfd169fe1e6738a9facf9f493bc59d029a9 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Sat, 14 Sep 2024 20:27:53 +0800
Subject: [PATCH 02/12] feat: add a Docker container for GitHub Actions

---
 .github/workflows/build.yml | 19 ++++++++++++-------
 examples/run.sh             | 10 ++++------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6bd46d48..ebef5e8d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -9,14 +9,19 @@ jobs:
       fail-fast: false
       max-parallel: 5
       matrix:
-        python-versions: [3.10]
+        python-versions: [3.11]
     steps:
       - uses: actions/checkout@v4
-      - name: Uninstall xfuser
-        run: pip uninstall -y xfuser
+
+      - name: Setup docker
+        run: docker run --rm --name mycuda -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code nvidia/cuda:12.6.1-cudnn-devel-rockylinux9 /bin/bash
+      - run: docker exec mycuda dnf update -y
+      - run: docker exec mycuda dnf install git python${{matrix.python-versions}} -y
+      - run: docker exec mycuda python${{matrix.python-versions}} -m ensurepip --upgrade
+      - run: docker exec mycuda pip${{matrix.python-versions}} install packaging wheel torch
       - name: Install xfuser
-        run: pip install -e .
+        run: docker exec -w /code mycuda pip${{matrix.python-versions}} install -e .
       - name: Test xfuser
-        run: sh ./examples/run.sh
-      - name: Uninstall xfuser
-        run: pip uninstall -y xfuser
+        run: docker exec -w /code mycuda sh -c "torchrun --nproc_per_node=4 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog'"
+      - name: Destroy docker
+        run: docker stop mycuda
diff --git a/examples/run.sh b/examples/run.sh
index 5ac205a9..b305ff55 100644
--- a/examples/run.sh
+++ b/examples/run.sh
@@ -19,7 +19,7 @@ export PYTHONPATH=$PWD:$PYTHONPATH
 # or you can simply use the model's ID on Hugging Face,
 # which will then be downloaded to the default cache path on Hugging Face.
 
-export MODEL_TYPE="CogVideoX"
+export MODEL_TYPE="Sd3"
 # Configuration for different model types
 # script, model_id, inference_step
 declare -A MODEL_CONFIGS=(
@@ -28,7 +28,7 @@ declare -A MODEL_CONFIGS=(
     ["Sd3"]="sd3_example.py /cfs/dit/stable-diffusion-3-medium-diffusers 20"
     ["Flux"]="flux_example.py /cfs/dit/FLUX.1-schnell 4"
     ["HunyuanDiT"]="hunyuandit_example.py /mnt/models/SD/HunyuanDiT-v1.2-Diffusers 50"
-    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 9"
+    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 1"
 )
 
 if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
@@ -43,7 +43,7 @@ mkdir -p ./results
 
 for HEIGHT in 1024
 do
-for N_GPUS in 1;
+for N_GPUS in 8;
 do
 
@@ -94,6 +94,4 @@ $PARALLLEL_VAE \
 $COMPILE_FLAG
 
 done
-done
-
-
+done
\ No newline at end of file
From 211535ae0992b4ff0622f2c3752c4d5b0f02c987 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Sun, 15 Sep 2024 11:30:50 +0800
Subject: [PATCH 03/12] feat: pre-install dependencies in docker

---
 .github/workflows/build.yml | 13 ++++++++-----
 setup.py                    | 24 +++++++++++++-----------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ebef5e8d..6055f729 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,15 +10,18 @@ jobs:
       max-parallel: 5
       matrix:
         python-versions: [3.11]
+        torch-versions: [4.2.1]
+        include:
+          - python-versions: 3.11
+            python-versions-full: 3_11
+          - torch-versions: 4.2.1
+            torch-versions-full: 4_2_1
     steps:
       - uses: actions/checkout@v4
 
       - name: Setup docker
-        run: docker run --rm --name mycuda -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code nvidia/cuda:12.6.1-cudnn-devel-rockylinux9 /bin/bash
-      - run: docker exec mycuda dnf update -y
-      - run: docker exec mycuda dnf install git python${{matrix.python-versions}} -y
-      - run: docker exec mycuda python${{matrix.python-versions}} -m ensurepip --upgrade
-      - run: docker exec mycuda pip${{matrix.python-versions}} install packaging wheel torch
+        run: docker run --rm --name mycuda -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code xfusertest/test-py${{matrix.python-versions-full}}-torch${{matrix.torch-versions-full}} /bin/bash
+      - run: docker exec mycuda pip${{matrix.python-versions}} install torch
       - name: Install xfuser
         run: docker exec -w /code mycuda pip${{matrix.python-versions}} install -e .
       - name: Test xfuser
diff --git a/setup.py b/setup.py
index 1e106570..9d5e49bb 100644
--- a/setup.py
+++ b/setup.py
@@ -1,14 +1,18 @@
 from setuptools import find_packages, setup
 import subprocess
 
+
 def get_cuda_version():
     try:
         nvcc_version = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
-        version_line = [line for line in nvcc_version.split('\n') if "release" in line][0]
-        cuda_version = version_line.split(' ')[-2].replace(',', '')
-        return 'cu' + cuda_version.replace('.', '')
+        version_line = [line for line in nvcc_version.split("\n") if "release" in line][
+            0
+        ]
+        cuda_version = version_line.split(" ")[-2].replace(",", "")
+        return "cu" + cuda_version.replace(".", "")
     except Exception as e:
-        return 'no_cuda'
+        return "no_cuda"
+
 
 if __name__ == "__main__":
     with open("README.md", "r") as f:
@@ -22,19 +26,17 @@ def get_cuda_version():
         author_email="fangjiarui123@gmail.com",
         packages=find_packages(),
         install_requires=[
-            "torch>=2.3.0",
-            "accelerate==0.33.0",
-            "diffusers==0.30.2",
+            "torch>=2.1.0",
+            "accelerate>=0.33.0",
+            "diffusers @ git+https://github.com/huggingface/diffusers.git",
             "transformers>=4.39.1",
             "sentencepiece>=0.1.99",
             "beautifulsoup4>=4.12.3",
             "distvae",
-            "yunchang==0.3",
+            "yunchang>=0.3.0",
             "pytest",
             "flask",
-            "protobuf",  # for SD3
-            "imageio",  # for CogVideoX
-            "imageio-ffmpeg"  # for CogVideoX
+            "opencv-python",
         ],
         extras_require={
             "[flash_attn]": [
From 6c3ae751e3515cbf8f092fba76c29e37a33092b7 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 16 Sep 2024 10:54:42 +0800
Subject: [PATCH 04/12] fix: fix typos and rename containers

---
 .github/workflows/build.yml | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6055f729..9979d7ec 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,21 +10,20 @@ jobs:
       max-parallel: 5
       matrix:
         python-versions: [3.11]
-        torch-versions: [4.2.1]
+        torch-versions: [2.4.1]
         include:
           - python-versions: 3.11
             python-versions-full: 3_11
-          - torch-versions: 4.2.1
-            torch-versions-full: 4_2_1
+          - torch-versions: 2.4.1
+            torch-versions-full: 2_4_1
     steps:
       - uses: actions/checkout@v4
 
       - name: Setup docker
-        run: docker run --rm --name mycuda -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code xfusertest/test-py${{matrix.python-versions-full}}-torch${{matrix.torch-versions-full}} /bin/bash
-      - run: docker exec mycuda pip${{matrix.python-versions}} install torch
+        run: docker run --rm --name xfuser_test_docker -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code xfuser_cicd/test-py_${{matrix.python-versions-full}}-torch_${{matrix.torch-versions-full}} /bin/bash
       - name: Install xfuser
-        run: docker exec -w /code mycuda pip${{matrix.python-versions}} install -e .
+        run: docker exec -w /code xfuser_test_docker pip${{matrix.python-versions}} install -e .
       - name: Test xfuser
-        run: docker exec -w /code mycuda sh -c "torchrun --nproc_per_node=4 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog'"
+        run: docker exec -w /code xfuser_test_docker sh -c "torchrun --nproc_per_node=4 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog'"
       - name: Destroy docker
-        run: docker stop mycuda
+        run: docker stop xfuser_test_docker
From 97927009ce4d125f46e4353b9a6bca654833e191 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Wed, 18 Sep 2024 16:05:04 +0800
Subject: [PATCH 05/12] fix: change a runner machine

---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9979d7ec..0b822441 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -20,10 +20,10 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: Setup docker
-        run: docker run --rm --name xfuser_test_docker -d -i -t --runtime=nvidia --gpus all -v /data/models/:/cfs/dit/ -v /home/github/actions-runner/_work/xDiT/xDiT:/code xfuser_cicd/test-py_${{matrix.python-versions-full}}-torch_${{matrix.torch-versions-full}} /bin/bash
+        run: docker run --rm --name xfuser_test_docker -d -i -t --runtime=nvidia --gpus all -v ~/actions-runner/_work/xDiT/xDiT:/code xfuser_cicd/test-py_${{matrix.python-versions-full}}-torch_${{matrix.torch-versions-full}} /bin/bash
       - name: Install xfuser
         run: docker exec -w /code xfuser_test_docker pip${{matrix.python-versions}} install -e .
       - name: Test xfuser
-        run: docker exec -w /code xfuser_test_docker sh -c "torchrun --nproc_per_node=4 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog'"
+        run: docker exec -w /code xfuser_test_docker sh -c "torchrun --nproc_per_node=8 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog' --use_cfg_parallel"
       - name: Destroy docker
         run: docker stop xfuser_test_docker
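[Editor's note: the test now launches 8 ranks because xDiT expects the parallel degrees to multiply up to the world size; here pipefusion (2) x ulysses (2) x ring (1) x cfg (2 when --use_cfg_parallel is set) = 8. A minimal sanity check of that constraint; the helper and argument names are illustrative, not part of the codebase:]

    def check_world_size(nproc_per_node: int, pipefusion: int, ulysses: int,
                         ring: int, use_cfg_parallel: bool) -> None:
        # Classifier-free-guidance parallelism doubles the rank count when enabled.
        cfg = 2 if use_cfg_parallel else 1
        expected = pipefusion * ulysses * ring * cfg
        assert nproc_per_node == expected, (
            f"torchrun --nproc_per_node={nproc_per_node} does not match "
            f"pipefusion * ulysses * ring * cfg = {expected}"
        )

    # The configuration used by the workflow above:
    check_world_size(8, pipefusion=2, ulysses=2, ring=1, use_cfg_parallel=True)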
From e2a1f6c606300caf41ada90d0bd62d92767d8a6a Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Fri, 20 Sep 2024 21:14:18 +0800
Subject: [PATCH 06/12] fix: use xFuserJointLongContextAttention as the
 underlying SP method for CogVideoX

---
 .../long_ctx_attention/hybrid/attn_layer.py | 17 ++++++++++-
 .../layers/attention_processor.py           | 30 +++++++++++++++----
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/xfuser/core/long_ctx_attention/hybrid/attn_layer.py b/xfuser/core/long_ctx_attention/hybrid/attn_layer.py
index 60d71264..eb467ba0 100644
--- a/xfuser/core/long_ctx_attention/hybrid/attn_layer.py
+++ b/xfuser/core/long_ctx_attention/hybrid/attn_layer.py
@@ -146,7 +146,22 @@ def forward(
     ):
         # 3 X (bs, seq_len/N, head_cnt, head_size) -> 3 X (bs, seq_len, head_cnt/N, head_size)
         # scatter 2, gather 1
-        query = torch.cat([query, joint_tensor_query], dim=1)
+        supported_joint_strategy = ["none", "front", "rear"]
+        if joint_strategy not in supported_joint_strategy:
+            raise ValueError(
+                f"joint_strategy: {joint_strategy} not supported. Supported joint strategies: {supported_joint_strategy}"
+            )
+        elif joint_strategy != "none" and joint_tensor_query is None:
+            raise ValueError(
+                "joint_tensor_query must not be None when joint_strategy is not 'none'"
+            )
+        elif joint_strategy == "rear":
+            query = torch.cat([query, joint_tensor_query], dim=1)
+        elif joint_strategy == "front":
+            query = torch.cat([joint_tensor_query, query], dim=1)
+        else:
+            pass
+
         ulysses_world_size = torch.distributed.get_world_size(self.ulysses_pg)
         ulysses_rank = torch.distributed.get_rank(self.ulysses_pg)
         attn_heads_per_ulysses_rank = joint_tensor_key.shape[-2] // ulysses_world_size
diff --git a/xfuser/model_executor/layers/attention_processor.py b/xfuser/model_executor/layers/attention_processor.py
index 6a81f390..d23d6a0b 100644
--- a/xfuser/model_executor/layers/attention_processor.py
+++ b/xfuser/model_executor/layers/attention_processor.py
@@ -121,7 +121,7 @@ def __init__(
         assert (to_k.bias is None) == (to_v.bias is None)
         assert to_k.weight.shape == to_v.weight.shape
 
-        in_size, out_size = to_k.in_features, to_k.out_features
+        '''in_size, out_size = to_k.in_features, to_k.out_features
         to_kv = nn.Linear(
             in_size,
             out_size * 2,
             bias=to_k.bias is not None,
             device=to_k.weight.device,
             dtype=to_k.weight.dtype,
         )
@@ -137,7 +137,7 @@ def __init__(
             to_kv.bias.data[:out_size].copy_(to_k.bias.data)
             to_kv.bias.data[out_size:].copy_(to_v.bias.data)
 
-        self.to_kv = to_kv
+        self.to_kv = to_kv'''
@@ -1013,12 +1013,12 @@ def __init__(self):
             )
         if HAS_LONG_CTX_ATTN and get_sequence_parallel_world_size() > 1:
             from xfuser.core.long_ctx_attention import (
-                xFuserLongContextAttention,
+                xFuserJointLongContextAttention,
                 xFuserUlyssesAttention,
             )
 
             if HAS_FLASH_ATTN:
-                self.hybrid_seq_parallel_attn = xFuserLongContextAttention(
+                self.hybrid_seq_parallel_attn = xFuserJointLongContextAttention(
                     use_kv_cache=self.use_long_ctx_attn_kvcache
                 )
             else:
@@ -1040,6 +1040,7 @@ def __call__(
         **kwargs,
     ) -> torch.Tensor:
         text_seq_length = encoder_hidden_states.size(1)
+        latent_seq_length = hidden_states.size(1)
 
         hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
 
@@ -1095,9 +1096,20 @@ def __call__(
         #! ---------------------------------------- ATTENTION ----------------------------------------
         if HAS_LONG_CTX_ATTN and get_sequence_parallel_world_size() > 1:
+            encoder_query = query[:, :, :text_seq_length, :]
+            query = query[:, :, text_seq_length:, :]
+            encoder_key = key[:, :, :text_seq_length, :]
+            key = key[:, :, text_seq_length:, :]
+            encoder_value = value[:, :, :text_seq_length, :]
+            value = value[:, :, text_seq_length:, :]
+
             query = query.transpose(1, 2)
             key = key.transpose(1, 2)
             value = value.transpose(1, 2)
+            encoder_query = encoder_query.transpose(1, 2)
+            encoder_key = encoder_key.transpose(1, 2)
+            encoder_value = encoder_value.transpose(1, 2)
+
             hidden_states = self.hybrid_seq_parallel_attn(
                 attn,
                 query,
                 key,
                 value,
                 dropout_p=0.0,
                 causal=False,
-                joint_strategy="none",
+                joint_tensor_query=encoder_query,
+                joint_tensor_key=encoder_key,
+                joint_tensor_value=encoder_value,
+                joint_strategy="front",
             )
+
             hidden_states = hidden_states.reshape(
                 batch_size, -1, attn.heads * head_dim
             )
@@ -1141,12 +1157,14 @@ def __call__(
             # hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
 
         #! ---------------------------------------- ATTENTION ----------------------------------------
+        assert text_seq_length + latent_seq_length == hidden_states.shape[1]
         # linear proj
         hidden_states = attn.to_out[0](hidden_states)
         # dropout
         hidden_states = attn.to_out[1](hidden_states)
+
         encoder_hidden_states, hidden_states = hidden_states.split(
-            [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+            [text_seq_length, latent_seq_length], dim=1
         )
 
         return hidden_states, encoder_hidden_states
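[Editor's note: the patch above stops pre-concatenating the text tokens into the query and instead passes them through the joint_tensor_* arguments, so only the latent tokens stay sharded across sequence-parallel ranks while the replicated text tokens are attached per the chosen strategy. A single-process sketch of the "front" concatenation it selects for CogVideoX; the function name is illustrative:]

    import torch

    def join_front(query: torch.Tensor, joint_tensor_query: torch.Tensor) -> torch.Tensor:
        # "front": the replicated text tokens are prepended to this rank's
        # latent shard along the sequence dimension (dim=1), matching the
        # (bs, seq_len, head_cnt, head_size) layout noted in attn_layer.py.
        return torch.cat([joint_tensor_query, query], dim=1)

    latent_shard = torch.randn(1, 16, 8, 64)  # local slice of the latent sequence
    text_tokens = torch.randn(1, 4, 8, 64)    # full text sequence, same on every rank
    assert join_front(latent_shard, text_tokens).shape == (1, 20, 8, 64)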
From 4e283109c7791fcd8dc7a369b5e96d5e35d1563f Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 23 Sep 2024 09:56:24 +0800
Subject: [PATCH 07/12] fix: update the comment format and update github
 actions

---
 .github/workflows/build.yml       | 53 ++++++++++++-------
 .../layers/attention_processor.py | 20 -------
 2 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0b822441..0a220142 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,29 +1,42 @@
 name: GitHub Actions Demo
 run-name: Pushed by ${{ github.actor }} 🚀
-on: [push]
+on: [push, pull_request_target]
 jobs:
-  test-xfuser:
-    name: Test xfuser
+  upload:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Run a multi-line script
+        run: git archive --format zip HEAD > xDiT.zip
+      - name: copy file via ssh password
+        uses: appleboy/scp-action@v0.1.7
+        with:
+          host: ${{ secrets.SSH_HOST }}
+          port: ${{ secrets.SSH_PORT }}
+          username: ${{ secrets.SSH_USER }}
+          password: ${{ secrets.SSH_KEY }}
+          source: "xDiT.zip"
+          target: "~/"
+  setup-env-and-test:
+    needs: upload
     runs-on: [self-hosted, linux, x64]
-    strategy:
-      fail-fast: false
-      max-parallel: 5
-      matrix:
-        python-versions: [3.11]
-        torch-versions: [2.4.1]
-        include:
-          - python-versions: 3.11
-            python-versions-full: 3_11
-          - torch-versions: 2.4.1
-            torch-versions-full: 2_4_1
+    continue-on-error: true
     steps:
-      - uses: actions/checkout@v4
-
+      - name: unzip
+        run: rm -rf ~/xDiT
+      - run: mkdir ~/xDiT
+      - run: unzip ~/xDiT.zip -d ~/xDiT
       - name: Setup docker
-        run: docker run --rm --name xfuser_test_docker -d -i -t --runtime=nvidia --gpus all -v ~/actions-runner/_work/xDiT/xDiT:/code xfuser_cicd/test-py_${{matrix.python-versions-full}}-torch_${{matrix.torch-versions-full}} /bin/bash
+        run: docker run --rm --name xfuser_test_docker_${{github.repository_owner_id}} -d -i -t --runtime=nvidia --gpus all -v /cfs:/cfs -v /mnt:/mnt -v ~/xDiT:/code xfuser_cicd/test-py_3_11-torch_2_4_1 /bin/bash
       - name: Install xfuser
-        run: docker exec -w /code xfuser_test_docker pip${{matrix.python-versions}} install -e .
+        run: docker exec -w /code xfuser_test_docker_${{github.repository_owner_id}} pip3.11 install -e .
       - name: Test xfuser
-        run: docker exec -w /code xfuser_test_docker sh -c "torchrun --nproc_per_node=8 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog' --use_cfg_parallel"
+        run: docker exec -w /code xfuser_test_docker_${{github.repository_owner_id}} sh -c "torchrun --nproc_per_node=8 ./examples/sd3_example.py --model /cfs/dit/stable-diffusion-3-medium-diffusers --pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 1 --height 1024 --width 1024 --no_use_resolution_binning --num_inference_steps 20 --warmup_steps 0 --prompt 'A small dog' --use_cfg_parallel"
+  clear-env:
+    needs: setup-env-and-test
+    runs-on: [self-hosted, linux, x64]
+    steps:
+      - name: Remove Files
+        run: docker exec -w /code xfuser_test_docker_${{github.repository_owner_id}} sh -c "rm -r *"
       - name: Destroy docker
-        run: docker stop xfuser_test_docker
+        run: docker stop xfuser_test_docker_${{github.repository_owner_id}}
diff --git a/xfuser/model_executor/layers/attention_processor.py b/xfuser/model_executor/layers/attention_processor.py
index d23d6a0b..cb08f953 100644
--- a/xfuser/model_executor/layers/attention_processor.py
+++ b/xfuser/model_executor/layers/attention_processor.py
@@ -121,25 +121,6 @@ def __init__(
         assert (to_k.bias is None) == (to_v.bias is None)
         assert to_k.weight.shape == to_v.weight.shape
 
-        '''in_size, out_size = to_k.in_features, to_k.out_features
-        to_kv = nn.Linear(
-            in_size,
-            out_size * 2,
-            bias=to_k.bias is not None,
-            device=to_k.weight.device,
-            dtype=to_k.weight.dtype,
-        )
-        to_kv.weight.data[:out_size].copy_(to_k.weight.data)
-        to_kv.weight.data[out_size:].copy_(to_v.weight.data)
-
-        if to_k.bias is not None:
-            assert to_v.bias is not None
-            to_kv.bias.data[:out_size].copy_(to_k.bias.data)
-            to_kv.bias.data[out_size:].copy_(to_v.bias.data)
-
-        self.to_kv = to_kv'''
-
-
 class xFuserAttentionProcessorRegister:
     _XFUSER_ATTENTION_PROCESSOR_MAPPING = {}
@@ -878,7 +859,6 @@ def __call__(
                 encoder_hidden_states
             )
 
-        # kv = attn.to_kv(encoder_hidden_states)
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)
From afea6decdb663bd06b8489fd5b18289188c55907 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 23 Sep 2024 16:08:31 +0800
Subject: [PATCH 08/12] fix: patch embedding for CogVideoX

---
 examples/run.sh                              |  2 +-
 xfuser/model_executor/layers/embeddings.py   | 60 +++++++++++++------
 .../transformers/cogvideox_transformer_3d.py |  2 +-
 3 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/examples/run.sh b/examples/run.sh
index ca96f394..e71744bd 100644
--- a/examples/run.sh
+++ b/examples/run.sh
@@ -28,7 +28,7 @@ declare -A MODEL_CONFIGS=(
     ["Sd3"]="sd3_example.py /cfs/dit/stable-diffusion-3-medium-diffusers 20"
     ["Flux"]="flux_example.py /cfs/dit/FLUX.1-schnell 4"
     ["HunyuanDiT"]="hunyuandit_example.py /mnt/models/SD/HunyuanDiT-v1.2-Diffusers 50"
-    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 1"
+    ["CogVideoX"]="cogvideox_example.py /cfs/dit/CogVideoX-2b 20"
 )
 
 if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
diff --git a/xfuser/model_executor/layers/embeddings.py b/xfuser/model_executor/layers/embeddings.py
index 6f6d67dc..5d8e05ac 100644
--- a/xfuser/model_executor/layers/embeddings.py
+++ b/xfuser/model_executor/layers/embeddings.py
@@ -115,6 +115,7 @@ def __init__(
         super().__init__(
             module=patch_embedding,
         )
+        self.module: CogVideoXPatchEmbed
 
     def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
         r"""
@@ -125,37 +126,60 @@ def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
         Args:
             text_embeds (`torch.Tensor`):
                 Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
            image_embeds (`torch.Tensor`):
                 Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
         """
+        sum_height = (
+            get_runtime_state().input_config.height
+            // get_runtime_state().vae_scale_factor_spatial
+        )
         text_embeds = self.text_proj(text_embeds)
 
         batch, num_frames, channels, height, width = image_embeds.shape
         image_embeds = image_embeds.view(batch, num_frames, *image_embeds.shape[1:])
         image_embeds = image_embeds.flatten(3).transpose(2, 3)  # [batch, num_frames, height x width, channels]
         image_embeds = image_embeds.flatten(1, 2)  # [batch, num_frames x height x width, channels]
-
-        if get_runtime_state().patch_mode:
-            start, end = get_runtime_state().pp_patches_token_start_end_idx_global[
-                get_runtime_state().pipeline_patch_idx
-            ]
-            image_embeds = image_embeds[
-                :,
-                start:end,
-                :,
-            ]
-        else:
-            image_embeds_list = [
-                image_embeds[
+
+        if self.use_positional_embeddings or self.use_learned_positional_embeddings:
+            if self.use_learned_positional_embeddings and (self.sample_width != width or self.sample_height != sum_height):
+                raise ValueError(
+                    "It is currently not possible to generate videos at a different resolution than the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'. "
+                    "If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues."
+                )
+
+            pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
+
+            if (
+                self.sample_height != sum_height
+                or self.sample_width != width
+                or self.sample_frames != pre_time_compression_frames
+            ):
+                pos_embedding = self._get_positional_embeddings(sum_height, width, pre_time_compression_frames)
+                pos_embedding = pos_embedding.to(image_embeds.device, dtype=image_embeds.dtype)
+            else:
+                pos_embedding = self.pos_embedding
+
+            # extract the image part of the positional embedding
+            pos_embedding = pos_embedding[:, self.max_text_seq_length :]
+
+            # slice the positional embedding
+            post_patch_height = sum_height // self.patch_size
+            post_patch_width = width // self.patch_size
+            post_time_compression_frames = (pre_time_compression_frames - 1) // self.temporal_compression_ratio + 1
+
+            pos_embed_list = [
+                pos_embedding[
                     :,
-                    get_runtime_state()
-                    .pp_patches_token_start_end_idx_global[i][0] : get_runtime_state()
-                    .pp_patches_token_start_end_idx_global[i][1],
+                    post_patch_height * post_patch_width * i + get_runtime_state().pp_patches_token_start_end_idx_global[0][0]:
+                    post_patch_height * post_patch_width * i + get_runtime_state().pp_patches_token_start_end_idx_global[0][1],
                     :,
                 ]
-                for i in range(get_runtime_state().num_pipeline_patch)
+                for i in range(post_time_compression_frames)
             ]
-            image_embeds = torch.cat(image_embeds_list, dim=1)
+            pos_embedding = torch.cat(pos_embed_list, dim=1)
+
+            image_embeds = image_embeds + pos_embedding
+
         embeds = torch.cat(
             [text_embeds, image_embeds], dim=1
         ).contiguous()  # [batch, seq_length + num_frames x height x width, channels]
diff --git a/xfuser/model_executor/models/transformers/cogvideox_transformer_3d.py b/xfuser/model_executor/models/transformers/cogvideox_transformer_3d.py
index 48793152..51a74a66 100644
--- a/xfuser/model_executor/models/transformers/cogvideox_transformer_3d.py
+++ b/xfuser/model_executor/models/transformers/cogvideox_transformer_3d.py
@@ -41,7 +41,7 @@ def __init__(
     ):
         super().__init__(
             transformer=transformer,
-            submodule_classes_to_wrap=[nn.Conv2d],
+            submodule_classes_to_wrap=[nn.Conv2d, CogVideoXPatchEmbed],
             submodule_name_to_wrap=["attn1"]
         )
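[Editor's note: the rewritten forward keeps diffusers' positional-embedding logic but adds only the slice of the image positional embedding that matches this rank's height patches, repeating the same per-frame token window for every compressed frame. A simplified, self-contained sketch of that indexing; the function name is illustrative:]

    import torch

    def slice_frame_pos_embedding(pos_embedding: torch.Tensor, tokens_per_frame: int,
                                  num_frames: int, start: int, end: int) -> torch.Tensor:
        # pos_embedding: (1, num_frames * tokens_per_frame, dim), image part only.
        # (start, end) is this rank's token window within one frame, playing the
        # role of pp_patches_token_start_end_idx_global[0] in the patch above.
        slices = [
            pos_embedding[:, i * tokens_per_frame + start : i * tokens_per_frame + end, :]
            for i in range(num_frames)
        ]
        return torch.cat(slices, dim=1)

    pos = torch.arange(12, dtype=torch.float32).reshape(1, 12, 1)  # 2 frames x 6 tokens
    out = slice_frame_pos_embedding(pos, tokens_per_frame=6, num_frames=2, start=2, end=4)
    assert out.squeeze().tolist() == [2.0, 3.0, 8.0, 9.0]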
From 62a3ed3ee62c18c8cc0dd4f50462b9e7cffc1636 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 23 Sep 2024 20:29:00 +0800
Subject: [PATCH 09/12] feat: add CFG support to CogVideoX

---
 .../pipelines/pipeline_cogvideox.py | 35 ++++++++-----------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/xfuser/model_executor/pipelines/pipeline_cogvideox.py b/xfuser/model_executor/pipelines/pipeline_cogvideox.py
index 4ca79c07..4cb2be6f 100644
--- a/xfuser/model_executor/pipelines/pipeline_cogvideox.py
+++ b/xfuser/model_executor/pipelines/pipeline_cogvideox.py
@@ -226,8 +226,7 @@ def __call__(
             max_sequence_length=max_sequence_length,
             device=device,
         )
-        if do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+        prompt_embeds = self._process_cfg_split_batch_latte(prompt_embeds, negative_prompt_embeds)
 
         # 4. Prepare timesteps
         timesteps, num_inference_steps = retrieve_timesteps(
@@ -272,9 +271,11 @@ def __call__(
                 if self.interrupt:
                     continue
 
-                latent_model_input = (
-                    torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                )
+                if do_classifier_free_guidance:
+                    latent_model_input = torch.cat(
+                        [latents] * (2 // get_classifier_free_guidance_world_size())
+                    )
+
                 latent_model_input = self.scheduler.scale_model_input(
                     latent_model_input, t
                 )
@@ -295,21 +296,15 @@ def __call__(
                 # perform guidance
                 if use_dynamic_cfg:
                     self._guidance_scale = 1 + guidance_scale * (
-                        (
-                            1
-                            - math.cos(
-                                math.pi
-                                * (
-                                    (num_inference_steps - t.item())
-                                    / num_inference_steps
-                                )
-                                ** 5.0
-                            )
-                        )
-                        / 2
+                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
                     )
                 if do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    if get_classifier_free_guidance_world_size() == 1:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    elif get_classifier_free_guidance_world_size() == 2:
+                        noise_pred_uncond, noise_pred_text = get_cfg_group().all_gather(
+                            noise_pred, separate_tensors=True
+                        )
                     noise_pred = noise_pred_uncond + self.guidance_scale * (
                         noise_pred_text - noise_pred_uncond
                     )
@@ -344,9 +339,7 @@ def __call__(
                         "negative_prompt_embeds", negative_prompt_embeds
                     )
 
-                if i == len(timesteps) - 1 or (
-                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
-                ):
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
 
             if get_sequence_parallel_world_size() > 1:
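[Editor's note: with CFG parallelism the unconditional and text-conditioned branches are denoised on different ranks, so the two predictions must be brought together (the all_gather above) before applying the usual guidance formula. A single-process sketch of the combine step, with the distributed gather stubbed out; the helper name is illustrative:]

    import torch

    def apply_cfg(noise_pred_uncond: torch.Tensor, noise_pred_text: torch.Tensor,
                  guidance_scale: float) -> torch.Tensor:
        # Standard classifier-free guidance: extrapolate from the unconditional
        # prediction toward the text-conditioned one.
        return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    # In the patch these two tensors come from noise_pred.chunk(2) (cfg degree 1) or
    # get_cfg_group().all_gather(noise_pred, separate_tensors=True) (cfg degree 2).
    uncond, text = torch.zeros(1, 4), torch.ones(1, 4)
    assert torch.allclose(apply_cfg(uncond, text, guidance_scale=6.0),
                          torch.full((1, 4), 6.0))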
From 4ab5d3668063a0ea4b10100802aa007fa55d40e8 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 14 Oct 2024 11:26:01 +0800
Subject: [PATCH 10/12] support VAE tiling/slicing and sequential/model
 offloading in CogVideoX

---
 examples/cogvideox_example.py | 12 +++++++++++-
 examples/run_cogvideo.sh      |  2 ++
 xfuser/config/args.py         | 15 +++++++++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/examples/cogvideox_example.py b/examples/cogvideox_example.py
index 86abfa7d..4ee43bf5 100644
--- a/examples/cogvideox_example.py
+++ b/examples/cogvideox_example.py
@@ -1,3 +1,4 @@
+import logging
 import time
 import torch
 import torch.distributed
@@ -35,12 +36,21 @@ def main():
         torch_dtype=torch.bfloat16,
     )
     if args.enable_sequential_cpu_offload:
+        pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+        logging.info(f"rank {local_rank} sequential CPU offload enabled")
+    elif args.enable_model_cpu_offload:
         pipe.enable_model_cpu_offload(gpu_id=local_rank)
-        pipe.vae.enable_tiling()
+        logging.info(f"rank {local_rank} model CPU offload enabled")
     else:
         device = torch.device(f"cuda:{local_rank}")
         pipe = pipe.to(device)
 
+    if args.enable_tiling:
+        pipe.vae.enable_tiling()
+
+    if args.enable_slicing:
+        pipe.vae.enable_slicing()
+
     torch.cuda.reset_peak_memory_stats()
     start_time = time.time()
diff --git a/examples/run_cogvideo.sh b/examples/run_cogvideo.sh
index 4b7a771d..0e46894e 100644
--- a/examples/run_cogvideo.sh
+++ b/examples/run_cogvideo.sh
@@ -22,6 +22,7 @@ CFG_ARGS="--use_cfg_parallel"
 # PIPEFUSION_ARGS="--num_pipeline_patch 8"
 # OUTPUT_ARGS="--output_type latent"
 # PARALLLEL_VAE="--use_parallel_vae"
+ENABLE_TILING="--enable_tiling"
 # COMPILE_FLAG="--use_torch_compile"
 
 torchrun --nproc_per_node=$N_GPUS ./examples/$SCRIPT \
@@ -35,4 +36,5 @@ $OUTPUT_ARGS \
 --prompt "A small dog" \
 $CFG_ARGS \
 $PARALLLEL_VAE \
+$ENABLE_TILING \
 $COMPILE_FLAG
\ No newline at end of file
diff --git a/xfuser/config/args.py b/xfuser/config/args.py
index 2ef59a02..88c39f6a 100644
--- a/xfuser/config/args.py
+++ b/xfuser/config/args.py
@@ -247,6 +247,21 @@ def add_cli_args(parser: FlexibleArgumentParser):
             action="store_true",
             help="Offloading the weights to the CPU.",
         )
+        runtime_group.add_argument(
+            "--enable_model_cpu_offload",
+            action="store_true",
+            help="Offloading the weights to the CPU.",
+        )
+        runtime_group.add_argument(
+            "--enable_tiling",
+            action="store_true",
+            help="Making VAE decode a tile at a time to save GPU memory.",
+        )
+        runtime_group.add_argument(
+            "--enable_slicing",
+            action="store_true",
+            help="Making VAE decode one slice at a time to save GPU memory.",
+        )
 
         # DiTFastAttn arguments
         fast_attn_group = parser.add_argument_group("DiTFastAttn Options")
From a29817537a3b5d051209a555f0e38cde26194e75 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Mon, 14 Oct 2024 19:08:30 +0800
Subject: [PATCH 11/12] fix xFuserArgs members

---
 xfuser/config/args.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/xfuser/config/args.py b/xfuser/config/args.py
index 88c39f6a..d3c5d96f 100644
--- a/xfuser/config/args.py
+++ b/xfuser/config/args.py
@@ -94,7 +94,10 @@ class xFuserArgs:
     no_use_resolution_binning: bool = False
     seed: int = 42
     output_type: str = "pil"
+    enable_model_cpu_offload: bool = False
     enable_sequential_cpu_offload: bool = False
+    enable_tiling: bool = False
+    enable_slicing: bool = False
     # DiTFastAttn arguments
     use_fast_attn: bool = False
     n_calib: int = 8
From 70fdd6320aa9694680c9eb6dcda31dba512b31c0 Mon Sep 17 00:00:00 2001
From: Xibo Sun
Date: Fri, 18 Oct 2024 20:01:18 +0800
Subject: [PATCH 12/12] fix bugs in allgather

---
 examples/run_cogvideo.sh            |  6 +++---
 .../pipelines/pipeline_cogvideox.py | 17 +----------------
 2 files changed, 4 insertions(+), 19 deletions(-)

diff --git a/examples/run_cogvideo.sh b/examples/run_cogvideo.sh
index 0e46894e..2b10caa9 100644
--- a/examples/run_cogvideo.sh
+++ b/examples/run_cogvideo.sh
@@ -14,9 +14,9 @@ mkdir -p ./results
 TASK_ARGS="--height 480 --width 720 --num_frames 9"
 
 # CogVideoX parallel configuration
-N_GPUS=4
-PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 1"
-CFG_ARGS="--use_cfg_parallel"
+N_GPUS=6
+PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 3"
+#CFG_ARGS="--use_cfg_parallel"
 
 # Uncomment and modify these as needed
 # PIPEFUSION_ARGS="--num_pipeline_patch 8"
 # OUTPUT_ARGS="--output_type latent"
 # PARALLLEL_VAE="--use_parallel_vae"
diff --git a/xfuser/model_executor/pipelines/pipeline_cogvideox.py b/xfuser/model_executor/pipelines/pipeline_cogvideox.py
index 4cb2be6f..5d04352c 100644
--- a/xfuser/model_executor/pipelines/pipeline_cogvideox.py
+++ b/xfuser/model_executor/pipelines/pipeline_cogvideox.py
@@ -343,22 +343,7 @@ def __call__(
                     progress_bar.update()
 
             if get_sequence_parallel_world_size() > 1:
-                sp_degree = get_sequence_parallel_world_size()
-                sp_latents_list = get_sp_group().all_gather(latents, separate_tensors=True)
-                latents_list = []
-                for pp_patch_idx in range(get_runtime_state().num_pipeline_patch):
-                    latents_list += [
-                        sp_latents_list[sp_patch_idx][
-                            :,
-                            :,
-                            get_runtime_state()
-                            .pp_patches_start_idx_local[pp_patch_idx] : get_runtime_state()
-                            .pp_patches_start_idx_local[pp_patch_idx + 1],
-                            :,
-                        ]
-                        for sp_patch_idx in range(sp_degree)
-                    ]
-                latents = torch.cat(latents_list, dim=-2)
+                latents = get_sp_group().all_gather(latents, dim=-2)
 
             if is_dp_last_group():
                 if not (output_type == "latents" or output_type == "latent"):
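[Editor's note: the one-line replacement above assumes the sequence-parallel group's all_gather accepts a dim argument, gathering each rank's latent shard and concatenating along the sequence axis. A minimal torch.distributed equivalent of that operation, assuming equally sized shards per rank; the helper name is illustrative:]

    import torch
    import torch.distributed as dist

    def all_gather_along_dim(local: torch.Tensor, group=None, dim: int = -2) -> torch.Tensor:
        # Collect every rank's shard, then concatenate along `dim` so the result
        # matches the unsharded tensor.
        world_size = dist.get_world_size(group=group)
        shards = [torch.empty_like(local) for _ in range(world_size)]
        dist.all_gather(shards, local.contiguous(), group=group)
        return torch.cat(shards, dim=dim)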