fffiloni committed on
Commit 2d438a0 · verified · 1 Parent(s): feff0f7

Migrated from GitHub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full set.
Files changed (50)
  1. .gitattributes +14 -0
  2. LICENSE.txt +201 -0
  3. ORIGINAL_README.md +322 -0
  4. assets/efficiency.png +3 -0
  5. assets/logo.png +3 -0
  6. assets/logo2.jpeg +3 -0
  7. assets/pipe.png +3 -0
  8. examples/multi/1/1.WAV +3 -0
  9. examples/multi/1/2.WAV +3 -0
  10. examples/multi/1/multi1.png +3 -0
  11. examples/multi/2/1.wav +3 -0
  12. examples/multi/2/multi2.png +3 -0
  13. examples/multi/3/1-man.WAV +3 -0
  14. examples/multi/3/1-woman.WAV +3 -0
  15. examples/multi/3/multi3.png +3 -0
  16. examples/multitalk_example_1.json +13 -0
  17. examples/multitalk_example_2.json +9 -0
  18. examples/multitalk_example_3.json +9 -0
  19. examples/single/1.wav +3 -0
  20. examples/single/single1.png +3 -0
  21. examples/single_example_1.json +7 -0
  22. generate_multitalk.py +500 -0
  23. requirements.txt +15 -0
  24. src/audio_analysis/torch_utils.py +20 -0
  25. src/audio_analysis/wav2vec2.py +125 -0
  26. src/utils.py +60 -0
  27. src/vram_management/__init__.py +1 -0
  28. src/vram_management/layers.py +179 -0
  29. wan/__init__.py +6 -0
  30. wan/configs/__init__.py +58 -0
  31. wan/configs/shared_config.py +19 -0
  32. wan/configs/wan_i2v_14B.py +24 -0
  33. wan/configs/wan_multitalk_14B.py +36 -0
  34. wan/configs/wan_t2v_14B.py +29 -0
  35. wan/configs/wan_t2v_1_3B.py +29 -0
  36. wan/distributed/__init__.py +0 -0
  37. wan/distributed/fsdp.py +43 -0
  38. wan/distributed/xdit_context_parallel.py +550 -0
  39. wan/first_last_frame2video.py +377 -0
  40. wan/image2video.py +350 -0
  41. wan/modules/__init__.py +18 -0
  42. wan/modules/attention.py +393 -0
  43. wan/modules/clip.py +542 -0
  44. wan/modules/model.py +631 -0
  45. wan/modules/multitalk_model.py +799 -0
  46. wan/modules/t5.py +513 -0
  47. wan/modules/tokenizers.py +82 -0
  48. wan/modules/vace_model.py +250 -0
  49. wan/modules/vae.py +663 -0
  50. wan/modules/xlm_roberta.py +170 -0
.gitattributes CHANGED
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/efficiency.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/logo.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/logo2.jpeg filter=lfs diff=lfs merge=lfs -text
39
+ assets/pipe.png filter=lfs diff=lfs merge=lfs -text
40
+ examples/multi/1/1.WAV filter=lfs diff=lfs merge=lfs -text
41
+ examples/multi/1/2.WAV filter=lfs diff=lfs merge=lfs -text
42
+ examples/multi/1/multi1.png filter=lfs diff=lfs merge=lfs -text
43
+ examples/multi/2/1.wav filter=lfs diff=lfs merge=lfs -text
44
+ examples/multi/2/multi2.png filter=lfs diff=lfs merge=lfs -text
45
+ examples/multi/3/1-man.WAV filter=lfs diff=lfs merge=lfs -text
46
+ examples/multi/3/1-woman.WAV filter=lfs diff=lfs merge=lfs -text
47
+ examples/multi/3/multi3.png filter=lfs diff=lfs merge=lfs -text
48
+ examples/single/1.wav filter=lfs diff=lfs merge=lfs -text
49
+ examples/single/single1.png filter=lfs diff=lfs merge=lfs -text
LICENSE.txt ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
ORIGINAL_README.md ADDED
@@ -0,0 +1,322 @@
1
+ <div align="center">
2
+
3
+ <p align="center">
4
+ <img src="assets/logo2.jpeg" alt="MultiTalk" width="240"/>
5
+ </p>
6
+
7
+ <h1>Let Them Talk: Audio-Driven Multi-Person Conversational Video Generation</h1>
8
+
9
+
10
+ [Zhe Kong*](https://scholar.google.com/citations?user=4X3yLwsAAAAJ&hl=zh-CN) · [Feng Gao*](https://scholar.google.com/citations?user=lFkCeoYAAAAJ) ·[Yong Zhang](https://yzhang2016.github.io/)<sup>&#9993;</sup> · [Zhuoliang Kang](https://scholar.google.com/citations?user=W1ZXjMkAAAAJ&hl=en) · [Xiaoming Wei](https://scholar.google.com/citations?user=JXV5yrZxj5MC&hl=zh-CN) · [Xunliang Cai](https://openreview.net/profile?id=~Xunliang_Cai1)
11
+
12
+ [Guanying Chen](https://guanyingc.github.io/) · [Wenhan Luo](https://whluo.github.io/)<sup>&#9993;</sup>
13
+
14
+ <sup>*</sup>Equal Contribution
15
+ <sup>&#9993;</sup>Corresponding Authors
16
+
17
+
18
+ <a href='https://meigen-ai.github.io/multi-talk/'><img src='https://img.shields.io/badge/Project-Page-green'></a>
19
+ <a href='https://arxiv.org/abs/2505.22647'><img src='https://img.shields.io/badge/Technique-Report-red'></a>
20
+ <a href='https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
21
+ </div>
22
+
23
+ > **TL;DR:** MultiTalk is an audio-driven framework for multi-person conversational video generation. It enables the creation of videos featuring multi-person conversation 💬, singing 🎤, interaction control 👬, and cartoon characters 🙊.
24
+
25
+ <p align="center">
26
+ <img src="assets/pipe.png">
27
+ </p>
28
+
29
+ ## Video Demos
30
+
31
+ <table border="0" style="width: 100%; text-align: left; margin-top: 20px;">
32
+ <tr>
33
+ <td>
34
+ <video src="https://github.com/user-attachments/assets/e55952e6-e1b2-44a5-9887-a89307a378da" width="320" controls loop></video>
35
+ </td>
36
+ <td>
37
+ <video src="https://github.com/user-attachments/assets/f0396c19-d459-42aa-9d78-34fdea10de18" width="320" controls loop></video>
38
+ </td>
39
+ <td>
40
+ <video src="https://github.com/user-attachments/assets/3576fd04-3e5f-4933-ac7b-1c4e6a601379" width="320" controls loop></video>
41
+ </td>
42
+ </tr>
43
+ <tr>
44
+ <td>
45
+ <video src="https://github.com/user-attachments/assets/5589056e-3202-442d-a62a-2cad7a7ecb19" width="320" controls loop></video>
46
+ </td>
47
+ <td>
48
+ <video src="https://github.com/user-attachments/assets/554bfbe7-0090-492c-94be-329f5e39e175" width="320" controls loop></video>
49
+ </td>
50
+ <td>
51
+ <video src="https://github.com/user-attachments/assets/9e961f35-9413-4846-a806-8186d54061da" width="320" controls loop></video>
52
+ </td>
53
+ </tr>
54
+ <tr>
55
+ <td>
56
+ <video src="https://github.com/user-attachments/assets/342595ab-cf75-4872-8182-f20fe8c95611" width="320" controls loop></video>
57
+ </td>
58
+ <td>
59
+ <video src="https://github.com/user-attachments/assets/6476f9f0-35e0-4484-91a4-8aa646aa994a" width="320" controls loop></video>
60
+ </td>
61
+ <td>
62
+ <video src="https://github.com/user-attachments/assets/d8fc8e94-0cba-4c25-9f3a-a8d7e0a785e1" width="320" controls loop></video>
63
+ </td>
64
+ </tr>
65
+ </table>
66
+
67
+
68
+
69
+
70
+
71
+ ## ✨ Key Features
72
+
73
+ We propose **MultiTalk**, a novel framework for audio-driven multi-person conversational video generation. Given a multi-stream audio input, a reference image, and a prompt, MultiTalk generates a video containing interactions that follow the prompt, with consistent lip motions aligned with the audio.
74
+
75
+ > - 💬 **Realistic Conversations** - Supports single- and multi-person generation
76
+ > - 👥 **Interactive Character Control** - Direct virtual humans via prompts
77
+ > - 🎤 **Generalization Performance** - Supports the generation of cartoon characters and singing
78
+ > - 📺 **Resolution Flexibility** - 480p & 720p output at arbitrary aspect ratios
79
+ > - ⏱️ **Long Video Generation** - Supports video generation up to 15 seconds
80
+
81
+ ## 🔥 Latest News
82
+
83
+ * June 14, 2025: 🔥🔥 We release `MultiTalk` with support for `multi-GPU inference`, `teacache acceleration`, `APG` and `low-VRAM inference` (enabling 480P video generation on a single RTX 4090). [APG](https://arxiv.org/abs/2410.02416) is used to alleviate color error accumulation in long video generation. TeaCache can increase speed by approximately 2–3×.
84
+ * June 9, 2025: 🔥🔥 We release the [weights](https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk) and inference code of **MultiTalk**
85
+ * May 29, 2025: We release the [Technique-Report](https://arxiv.org/abs/2505.22647) of **MultiTalk**
86
+ * May 29, 2025: We release the [project page](https://meigen-ai.github.io/multi-talk/) of **MultiTalk**
87
+
88
+ ## 🌐 Community Works
89
+ - [ComfyUI](https://github.com/kijai/ComfyUI-WanVideoWrapper/tree/multitalk): thanks to [kijai](https://github.com/kijai) for integrating MultiTalk into ComfyUI-WanVideoWrapper. [Rudra](https://github.com/Rudra-ai-coder) found that MultiTalk can be combined with Wanx T2V and VACE; see the [issue](https://github.com/kijai/ComfyUI-WanVideoWrapper/issues/635).
90
+ - [Google Colab example](https://colab.research.google.com/drive/185OyRIpJDlpnRjhBRb7FnaRlq11BLZTa?usp=sharing), an example of inference on an A100, provided by [Braffolk](https://github.com/Braffolk).
91
+
92
+ ## 📑 Todo List
93
+
94
+ - [x] Release the technical report
95
+ - [x] Inference
96
+ - [x] Checkpoints
97
+ - [x] Multi-GPU Inference
98
+ - [ ] Inference acceleration
99
+ - [x] TeaCache
100
+ - [ ] int8 quantization
101
+ - [ ] LCM distillation
102
+ - [ ] Sparse Attention
103
+ - [x] Run with very low VRAM
104
+ - [ ] TTS integration
105
+ - [ ] Gradio demo
106
+ - [ ] ComfyUI
107
+ - [ ] 1.3B model
108
+
109
+ ## Quick Start
110
+
111
+ ### 🛠️ Installation
112
+
113
+ #### 1. Create a conda environment and install PyTorch and xformers
114
+ ```
115
+ conda create -n multitalk python=3.10
116
+ conda activate multitalk
117
+ pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121
118
+ pip install -U xformers==0.0.28 --index-url https://download.pytorch.org/whl/cu121
119
+ ```
120
+ #### 2. Flash-attn installation
121
+ ```
122
+ pip install ninja
123
+ pip install psutil
124
+ pip install packaging
125
+ pip install flash_attn
126
+ ```
127
+
128
+ #### 3. Other dependencies
129
+ ```
130
+ pip install -r requirements.txt
131
+ conda install -c conda-forge librosa
132
+ ```
133
+
134
+ #### 4. FFmpeg installation
135
+ ```
136
+ conda install -c conda-forge ffmpeg
137
+ ```
138
+ or
139
+ ```
140
+ sudo yum install ffmpeg ffmpeg-devel
141
+ ```
142
+
143
+ ### 🧱 Model Preparation
144
+
145
+ #### 1. Model Download
146
+
147
+ | Models | Download Link | Notes |
148
+ | --------------|-------------------------------------------------------------------------------|-------------------------------|
149
+ | Wan2.1-I2V-14B-480P | 🤗 [Huggingface](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-480P) | Base model
150
+ | chinese-wav2vec2-base | 🤗 [Huggingface](https://huggingface.co/TencentGameMate/chinese-wav2vec2-base) | Audio encoder
151
+ | MeiGen-MultiTalk | 🤗 [Huggingface](https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk) | Our audio condition weights
152
+
153
+ Download models using huggingface-cli:
154
+ ``` sh
155
+ huggingface-cli download Wan-AI/Wan2.1-I2V-14B-480P --local-dir ./weights/Wan2.1-I2V-14B-480P
156
+ huggingface-cli download TencentGameMate/chinese-wav2vec2-base --local-dir ./weights/chinese-wav2vec2-base
157
+ huggingface-cli download TencentGameMate/chinese-wav2vec2-base model.safetensors --revision refs/pr/1 --local-dir ./weights/chinese-wav2vec2-base
158
+ huggingface-cli download MeiGen-AI/MeiGen-MultiTalk --local-dir ./weights/MeiGen-MultiTalk
159
+ ```
160
+
161
+ #### 2. Link or Copy MultiTalk Model to Wan2.1-I2V-14B-480P Directory
162
+
163
+ Link through:
164
+ ```
165
+ mv weights/Wan2.1-I2V-14B-480P/diffusion_pytorch_model.safetensors.index.json weights/Wan2.1-I2V-14B-480P/diffusion_pytorch_model.safetensors.index.json_old
166
+ sudo ln -s {Absolute path}/weights/MeiGen-MultiTalk/diffusion_pytorch_model.safetensors.index.json weights/Wan2.1-I2V-14B-480P/
167
+ sudo ln -s {Absolute path}/weights/MeiGen-MultiTalk/multitalk.safetensors weights/Wan2.1-I2V-14B-480P/
168
+ ```
169
+
170
+ Or, copy through:
171
+ ```
172
+ mv weights/Wan2.1-I2V-14B-480P/diffusion_pytorch_model.safetensors.index.json weights/Wan2.1-I2V-14B-480P/diffusion_pytorch_model.safetensors.index.json_old
173
+ cp weights/MeiGen-MultiTalk/diffusion_pytorch_model.safetensors.index.json weights/Wan2.1-I2V-14B-480P/
174
+ cp weights/MeiGen-MultiTalk/multitalk.safetensors weights/Wan2.1-I2V-14B-480P/
175
+ ```
176
+ ### 🔑 Quick Inference
177
+
178
+ Our model is compatible with both 480P and 720P resolutions. The current code only supports 480P inference. 720P inference requires multiple GPUs, and we will provide an update soon.
179
+ > Some tips
180
+ > - Lip synchronization accuracy:​​ Audio CFG works optimally between 3–5. Increase the audio CFG value for better synchronization.
181
+ > - ​​Video clip length:​​ The model was trained on 81-frame videos at 25 FPS. For optimal prompt following performance, generate clips at 81 frames. Generating up to 201 frames is possible, though longer clips might reduce prompt-following performance.
182
+ > - ​​Long video generation:​​ Audio CFG influences color tone consistency across segments. Set this value to 3 to alleviate tonal variations.
183
+ > - Sampling steps: If you want to generate a video quickly, you can reduce the sampling steps to as few as 10; this will not hurt lip-synchronization accuracy, but it does affect motion and visual quality. More sampling steps give better video quality.
184
+ > - TeaCache acceleration: The optimal range for `--teacache_thresh` is between 0.2 and 0.5. Increasing this value can further improve acceleration, but it may also degrade the quality of the generated video.
185
+
186
+ #### Usage of MultiTalk
187
+ ```
188
+ --mode streaming: long video generation.
189
+ --mode clip: generate short video with one chunk.
190
+ --use_teacache: run with TeaCache.
191
+ --size multitalk-480: generate 480P video.
192
+ --size multitalk-720: generate 720P video.
193
+ --use_apg: run with APG.
194
+ --teacache_thresh: A coefficient used for TeaCache acceleration
195
+ ```
196
+
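+ The `--input_json` meta file describes the conditioning inputs: a `prompt`, a reference `cond_image`, and one audio path per person under `cond_audio`; multi-person inputs additionally take `audio_type` (`para` or `add`) and, optionally, `bbox` (see `examples/single_example_1.json` and `examples/multitalk_example_1.json`). A minimal sketch of writing such a file from Python (the field names follow those examples; the output path `my_input.json` is only an illustration):
+
+ ```python
+ import json
+
+ # single-person meta file, mirroring examples/single_example_1.json
+ meta = {
+     "prompt": "A woman is singing into a microphone in a recording studio.",
+     "cond_image": "examples/single/single1.png",
+     "cond_audio": {"person1": "examples/single/1.wav"},
+ }
+
+ with open("my_input.json", "w", encoding="utf-8") as f:
+     json.dump(meta, f, indent=2)
+ ```
+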
197
+ #### 1. Single-Person
198
+
199
+ ##### 1) Run with single GPU
200
+
201
+
202
+ ```
203
+ python generate_multitalk.py \
204
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
205
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
206
+ --input_json examples/single_example_1.json \
207
+ --sample_steps 40 \
208
+ --mode streaming \
209
+ --use_teacache \
210
+ --save_file single_long_exp
211
+
212
+ ```
213
+
214
+ ##### 2) Run with very low VRAM
215
+
216
+ If you want to run with very low VRAM, set `--num_persistent_param_in_dit 0`:
217
+
218
+
219
+ ```
220
+ python generate_multitalk.py \
221
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
222
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
223
+ --input_json examples/single_example_1.json \
224
+ --sample_steps 40 \
225
+ --mode streaming \
226
+ --num_persistent_param_in_dit 0 \
227
+ --use_teacache \
228
+ --save_file single_long_lowvram_exp
229
+
230
+ ```
231
+
232
+ ##### 3) Multi-GPU inference
233
+
234
+ ```
235
+ GPU_NUM=8
236
+ torchrun --nproc_per_node=$GPU_NUM --standalone generate_multitalk.py \
237
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
238
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
239
+ --dit_fsdp --t5_fsdp \
240
+ --ulysses_size=$GPU_NUM \
241
+ --input_json examples/single_example_1.json \
242
+ --sample_steps 40 \
243
+ --mode streaming \
244
+ --use_teacache \
245
+ --save_file single_long_multigpu_exp
246
+
247
+ ```
248
+
249
+
250
+
251
+ #### 2. Multi-Person
252
+
253
+ ##### 1) Run with single GPU
254
+
255
+ ```
256
+ python generate_multitalk.py \
257
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
258
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
259
+ --input_json examples/multitalk_example_2.json \
260
+ --sample_steps 40 \
261
+ --mode streaming \
262
+ --use_teacache \
263
+ --save_file multi_long_exp
264
+ ```
265
+ ##### 2) Run with very low VRAM
266
+
267
+
268
+ ```
269
+ python generate_multitalk.py \
270
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
271
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
272
+ --input_json examples/multitalk_example_2.json \
273
+ --sample_steps 40 \
274
+ --mode streaming \
275
+ --num_persistent_param_in_dit 0 \
276
+ --use_teacache \
277
+ --save_file multi_long_lowvram_exp
278
+
279
+ ```
280
+
281
+ ##### 3) Multi-GPU inference
282
+
283
+ ```
284
+ GPU_NUM=8
285
+ torchrun --nproc_per_node=$GPU_NUM --standalone generate_multitalk.py \
286
+ --ckpt_dir weights/Wan2.1-I2V-14B-480P \
287
+ --wav2vec_dir 'weights/chinese-wav2vec2-base' \
288
+ --dit_fsdp --t5_fsdp --ulysses_size=$GPU_NUM \
289
+ --input_json examples/multitalk_example_2.json \
290
+ --sample_steps 40 \
291
+ --mode streaming --use_teacache \
292
+ --save_file multi_long_multigpu_exp
293
+
294
+ ```
295
+
296
+ ## 🚀 Computational Efficiency
297
+ The results are evaluated on A100 GPUs for multi-person generation. Single-person generation uses less memory and provides faster inference.
298
+ <p align="center">
299
+ <img src="assets/efficiency.png">
300
+ </p>
301
+ TeaCache can increase speed by approximately 2–3×.
302
+
303
+
304
+ ## 📚 Citation
305
+
306
+ If you find our work useful in your research, please consider citing:
307
+
308
+ ```
309
+ @article{kong2025let,
310
+ title={Let Them Talk: Audio-Driven Multi-Person Conversational Video Generation},
311
+ author={Kong, Zhe and Gao, Feng and Zhang, Yong and Kang, Zhuoliang and Wei, Xiaoming and Cai, Xunliang and Chen, Guanying and Luo, Wenhan},
312
+ journal={arXiv preprint arXiv:2505.22647},
313
+ year={2025}
314
+ }
315
+ ```
316
+
317
+ ## 📜 License
318
+ The models in this repository are licensed under the Apache 2.0 License. We claim no rights over your generated content,
319
+ granting you the freedom to use them while ensuring that your usage complies with the provisions of this license.
320
+ You are fully accountable for your use of the models, which must not involve sharing any content that violates applicable laws,
321
+ causes harm to individuals or groups, disseminates personal information intended for harm, spreads misinformation, or targets vulnerable populations.
322
+
assets/efficiency.png ADDED

Git LFS Details

  • SHA256: 619fd6d33e5719377eb4b446d448c22a1c92499952b69605f984d95369da8ecc
  • Pointer size: 131 Bytes
  • Size of remote file: 193 kB
assets/logo.png ADDED

Git LFS Details

  • SHA256: 2fb97620f1515b94de007f5b5cde23e51aaa84a5cdc1eb91c021bb46b4cae3f0
  • Pointer size: 132 Bytes
  • Size of remote file: 3.31 MB
assets/logo2.jpeg ADDED

Git LFS Details

  • SHA256: 984efa12db10f378f37ba0576be90517658ed5c4a4146f2483121e9ae8fbd800
  • Pointer size: 131 Bytes
  • Size of remote file: 446 kB
assets/pipe.png ADDED

Git LFS Details

  • SHA256: dca19575d5c512b93d0eab2359cc75878da2064d4ef0e1f44aaf6accc04d6e0a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
examples/multi/1/1.WAV ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8397a9b3c0add26384afe7e544e36cbc4806d8f2d7c705e11bb2897dc1bc993b
3
+ size 315436
examples/multi/1/2.WAV ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:753120ceadbdab3ce206423a1419f73018695682787414ca2f4613306be50bfc
3
+ size 544812
examples/multi/1/multi1.png ADDED

Git LFS Details

  • SHA256: 210b89972b810e760d15828323186771a56f1220e806b09fe06b0584a9f55537
  • Pointer size: 132 Bytes
  • Size of remote file: 3 MB
examples/multi/2/1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51eb6a408a8b5b33a732378e2a38e7412ba273186b85c324ec6a099d23fe38af
3
+ size 1273592
examples/multi/2/multi2.png ADDED

Git LFS Details

  • SHA256: 95bce20a7ed654515b33cdbda435fa152405215ae7e098ee16c8e04c4eac3f49
  • Pointer size: 132 Bytes
  • Size of remote file: 2.88 MB
examples/multi/3/1-man.WAV ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d304fd88850d6673649d1844db2894e03bf5a775123048eebcb01ab3b79bff5e
3
+ size 1503276
examples/multi/3/1-woman.WAV ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e1ebd7ae1587ebc7f0986f8b61e7fcc99c6fb57fbb15ab9373968e701afc8bf
3
+ size 1503276
examples/multi/3/multi3.png ADDED

Git LFS Details

  • SHA256: dbe21be842c62c26cba264630b96aa775cc739dfc3b02cbc8da44a9eb8b9671e
  • Pointer size: 132 Bytes
  • Size of remote file: 2.58 MB
examples/multitalk_example_1.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "prompt": "In a casual, intimate setting, a man and a woman are engaged in a heartfelt conversation inside a car. The man, sporting a denim jacket over a blue shirt, sits attentively with a seatbelt fastened, his gaze fixed on the woman beside him. The woman, wearing a black tank top and a denim jacket draped over her shoulders, smiles warmly, her eyes reflecting genuine interest and connection. The car's interior, with its beige seats and simple design, provides a backdrop that emphasizes their interaction. The scene captures a moment of shared understanding and connection, set against the soft, diffused light of an overcast day. A medium shot from a slightly angled perspective, focusing on their expressions and body language.",
3
+ "cond_image": "examples/multi/1/multi1.png",
4
+ "audio_type": "add",
5
+ "cond_audio": {
6
+ "person1": "examples/multi/1/1.WAV",
7
+ "person2": "examples/multi/1/2.WAV"
8
+ },
9
+ "bbox": {
10
+ "person1": [160, 120, 1280, 1080],
11
+ "person2": [160, 1320, 1280, 2280]
12
+ }
13
+ }
examples/multitalk_example_2.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "prompt": "In a cozy recording studio, a man and a woman are singing together with passion and emotion. The man, with short brown hair, wears a light gray button-up shirt, his expression filled with concentration and warmth. The woman, with long wavy brown hair, dons a sleeveless dress adorned with small polka dots, her eyes closed as she belts out a heartfelt melody. The studio is equipped with professional microphones, and the background features soundproofing panels, creating an intimate and focused atmosphere. A close-up shot captures their expressions and the intensity of their performance.",
3
+ "cond_image": "examples/multi/2/multi2.png",
4
+ "audio_type": "para",
5
+ "cond_audio": {
6
+ "person1": "examples/multi/2/1.wav",
7
+ "person2": "examples/multi/2/1.wav"
8
+ }
9
+ }
examples/multitalk_example_3.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "prompt": "In a cozy recording studio, a man and a woman are singing together. The man, with tousled brown hair, stands to the left, wearing a light green button-down shirt. His gaze is directed towards the woman, who is smiling warmly. She, with wavy dark hair, is dressed in a black floral dress and stands to the right, her eyes closed in enjoyment. Between them is a professional microphone, capturing their harmonious voices. The background features wooden panels and various audio equipment, creating an intimate and focused atmosphere. The lighting is soft and warm, highlighting their expressions and the intimate setting. A medium shot captures their interaction closely.",
3
+ "cond_image": "examples/multi/3/multi3.png",
4
+ "audio_type": "para",
5
+ "cond_audio": {
6
+ "person1": "examples/multi/3/1-man.WAV",
7
+ "person2": "examples/multi/3/1-woman.WAV"
8
+ }
9
+ }
examples/single/1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba2733897f561f747e6508734bff4eeee29d0a73638e5c39c0c0b806701d4e8b
3
+ size 1888320
examples/single/single1.png ADDED

Git LFS Details

  • SHA256: 5a47d458721c4a7419d3c8ef9a5c3d89cf161ab31de9451b9bb4f321a37bc705
  • Pointer size: 132 Bytes
  • Size of remote file: 2.79 MB
examples/single_example_1.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "prompt": "A woman is passionately singing into a professional microphone in a recording studio. She wears large black headphones and a dark cardigan over a gray top. Her long, wavy brown hair frames her face as she looks slightly upwards, her mouth open mid-song. The studio is equipped with various audio equipment, including a mixing console and a keyboard, with soundproofing panels on the walls. The lighting is warm and focused on her, creating a professional and intimate atmosphere. A close-up shot captures her expressive performance.",
3
+ "cond_image": "examples/single/single1.png",
4
+ "cond_audio": {
5
+ "person1": "examples/single/1.wav"
6
+ }
7
+ }
generate_multitalk.py ADDED
@@ -0,0 +1,500 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import argparse
3
+ import logging
4
+ import os
5
+ import sys
6
+ import json
7
+ import warnings
8
+ from datetime import datetime
9
+
10
+ warnings.filterwarnings('ignore')
11
+
12
+ import random
13
+
14
+ import torch
15
+ import torch.distributed as dist
16
+ from PIL import Image
17
+ import subprocess
18
+
19
+ import wan
20
+ from wan.configs import SIZE_CONFIGS, SUPPORTED_SIZES, WAN_CONFIGS
21
+ from wan.utils.utils import cache_image, cache_video, str2bool
22
+ from wan.utils.multitalk_utils import save_video_ffmpeg
23
+
24
+ from transformers import Wav2Vec2FeatureExtractor
25
+ from src.audio_analysis.wav2vec2 import Wav2Vec2Model
26
+
27
+ import librosa
28
+ import pyloudnorm as pyln
29
+ import numpy as np
30
+ from einops import rearrange
31
+ import soundfile as sf
32
+
33
+ def _validate_args(args):
34
+ # Basic check
35
+ assert args.ckpt_dir is not None, "Please specify the checkpoint directory."
36
+ assert args.task in WAN_CONFIGS, f"Unsupported task: {args.task}"
37
+
38
+ # The default sampling steps are 40 for image-to-video tasks and 50 for text-to-video tasks.
39
+ if args.sample_steps is None:
40
+ args.sample_steps = 40
41
+
42
+ if args.sample_shift is None:
43
+ if args.size == 'multitalk-480':
44
+ args.sample_shift = 7
45
+ elif args.size == 'multitalk-720':
46
+ args.sample_shift = 11
47
+ else:
48
+ raise NotImplementedError(f'Unsupported size: {args.size}')
49
+
50
+ args.base_seed = args.base_seed if args.base_seed >= 0 else random.randint(
51
+ 0, 99999999)
52
+ # Size check
53
+ assert args.size in SUPPORTED_SIZES[
54
+ args.
55
+ task], f"Unsupported size {args.size} for task {args.task}, supported sizes are: {', '.join(SUPPORTED_SIZES[args.task])}"
56
+
57
+
58
+ def _parse_args():
59
+ parser = argparse.ArgumentParser(
60
+ description="Generate an image or video from a text prompt or image using Wan"
61
+ )
62
+ parser.add_argument(
63
+ "--task",
64
+ type=str,
65
+ default="multitalk-14B",
66
+ choices=list(WAN_CONFIGS.keys()),
67
+ help="The task to run.")
68
+ parser.add_argument(
69
+ "--size",
70
+ type=str,
71
+ default="multitalk-480",
72
+ choices=list(SIZE_CONFIGS.keys()),
73
+ help="The bucket size of the generated video. The aspect ratio of the output video will follow that of the input image."
74
+ )
75
+ parser.add_argument(
76
+ "--frame_num",
77
+ type=int,
78
+ default=81,
79
+ help="How many frames to generate in one clip. The number should be 4n+1."
80
+ )
81
+ parser.add_argument(
82
+ "--ckpt_dir",
83
+ type=str,
84
+ default=None,
85
+ help="The path to the Wan checkpoint directory.")
86
+ parser.add_argument(
87
+ "--wav2vec_dir",
88
+ type=str,
89
+ default=None,
90
+ help="The path to the wav2vec checkpoint directory.")
91
+ parser.add_argument(
92
+ "--offload_model",
93
+ type=str2bool,
94
+ default=None,
95
+ help="Whether to offload the model to CPU after each model forward, reducing GPU memory usage."
96
+ )
97
+ parser.add_argument(
98
+ "--ulysses_size",
99
+ type=int,
100
+ default=1,
101
+ help="The size of the ulysses parallelism in DiT.")
102
+ parser.add_argument(
103
+ "--ring_size",
104
+ type=int,
105
+ default=1,
106
+ help="The size of the ring attention parallelism in DiT.")
107
+ parser.add_argument(
108
+ "--t5_fsdp",
109
+ action="store_true",
110
+ default=False,
111
+ help="Whether to use FSDP for T5.")
112
+ parser.add_argument(
113
+ "--t5_cpu",
114
+ action="store_true",
115
+ default=False,
116
+ help="Whether to place T5 model on CPU.")
117
+ parser.add_argument(
118
+ "--dit_fsdp",
119
+ action="store_true",
120
+ default=False,
121
+ help="Whether to use FSDP for DiT.")
122
+ parser.add_argument(
123
+ "--save_file",
124
+ type=str,
125
+ default=None,
126
+ help="The file to save the generated image or video to.")
127
+ parser.add_argument(
128
+ "--audio_save_dir",
129
+ type=str,
130
+ default='save_audio',
131
+ help="The path to save the audio embedding.")
132
+ parser.add_argument(
133
+ "--base_seed",
134
+ type=int,
135
+ default=42,
136
+ help="The seed to use for generating the image or video.")
137
+ parser.add_argument(
138
+ "--input_json",
139
+ type=str,
140
+ default='examples.json',
141
+ help="[meta file] The condition path to generate the video.")
142
+ parser.add_argument(
143
+ "--motion_frame",
144
+ type=int,
145
+ default=25,
146
+ help="Driven frame length used in long video generation mode.")
147
+ parser.add_argument(
148
+ "--mode",
149
+ type=str,
150
+ default="clip",
151
+ choices=['clip', 'streaming'],
152
+ help="clip: generate one video chunk, streaming: long video generation")
153
+ parser.add_argument(
154
+ "--sample_steps", type=int, default=None, help="The sampling steps.")
155
+ parser.add_argument(
156
+ "--sample_shift",
157
+ type=float,
158
+ default=None,
159
+ help="Sampling shift factor for flow matching schedulers.")
160
+ parser.add_argument(
161
+ "--sample_text_guide_scale",
162
+ type=float,
163
+ default=5.0,
164
+ help="Classifier free guidance scale for text control.")
165
+ parser.add_argument(
166
+ "--sample_audio_guide_scale",
167
+ type=float,
168
+ default=4.0,
169
+ help="Classifier free guidance scale for audio control.")
170
+ parser.add_argument(
171
+ "--num_persistent_param_in_dit",
172
+ type=int,
173
+ default=None,
174
+ required=False,
175
+ help="Maximum number of parameters kept resident in GPU memory; use a small number to reduce the VRAM required.",
176
+ )
177
+ parser.add_argument(
178
+ "--use_teacache",
179
+ action="store_true",
180
+ default=False,
181
+ help="Enable teacache for video generation."
182
+ )
183
+ parser.add_argument(
184
+ "--teacache_thresh",
185
+ type=float,
186
+ default=0.2,
187
+ help="Threshold for teacache."
188
+ )
189
+ parser.add_argument(
190
+ "--use_apg",
191
+ action="store_true",
192
+ default=False,
193
+ help="Enable adaptive projected guidance for video generation (APG)."
194
+ )
195
+ parser.add_argument(
196
+ "--apg_momentum",
197
+ type=float,
198
+ default=-0.75,
199
+ help="Momentum used in adaptive projected guidance (APG)."
200
+ )
201
+ parser.add_argument(
202
+ "--apg_norm_threshold",
203
+ type=float,
204
+ default=55,
205
+ help="Norm threshold used in adaptive projected guidance (APG)."
206
+ )
207
+
208
+
209
+ args = parser.parse_args()
210
+
211
+ _validate_args(args)
212
+
213
+ return args
214
+
215
+ def custom_init(device, wav2vec):
216
+ audio_encoder = Wav2Vec2Model.from_pretrained(wav2vec, local_files_only=True).to(device)
217
+ audio_encoder.feature_extractor._freeze_parameters()
218
+ wav2vec_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec, local_files_only=True)
219
+ return wav2vec_feature_extractor, audio_encoder
220
+
221
+ def loudness_norm(audio_array, sr=16000, lufs=-23):
222
+ meter = pyln.Meter(sr)
223
+ loudness = meter.integrated_loudness(audio_array)
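+ # an extreme integrated loudness (e.g. -inf for near-silent input) means normalization would be meaningless, so return the audio unchanged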
224
+ if abs(loudness) > 100:
225
+ return audio_array
226
+ normalized_audio = pyln.normalize.loudness(audio_array, loudness, lufs)
227
+ return normalized_audio
228
+
229
+ def audio_prepare_multi(left_path, right_path, audio_type, sample_rate=16000):
230
+
231
+ if not (left_path=='None' or right_path=='None'):
232
+ human_speech_array1 = audio_prepare_single(left_path)
233
+ human_speech_array2 = audio_prepare_single(right_path)
234
+ elif left_path=='None':
235
+ human_speech_array2 = audio_prepare_single(right_path)
236
+ human_speech_array1 = np.zeros(human_speech_array2.shape[0])
237
+ elif right_path=='None':
238
+ human_speech_array1 = audio_prepare_single(left_path)
239
+ human_speech_array2 = np.zeros(human_speech_array1.shape[0])
240
+
241
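+ # 'para': both streams speak simultaneously; 'add': stream 1 speaks first and stream 2 follows, each padded with silence during the other's turn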
+ if audio_type=='para':
242
+ new_human_speech1 = human_speech_array1
243
+ new_human_speech2 = human_speech_array2
244
+ elif audio_type=='add':
245
+ new_human_speech1 = np.concatenate([human_speech_array1[: human_speech_array1.shape[0]], np.zeros(human_speech_array2.shape[0])])
246
+ new_human_speech2 = np.concatenate([np.zeros(human_speech_array1.shape[0]), human_speech_array2[:human_speech_array2.shape[0]]])
247
+ sum_human_speechs = new_human_speech1 + new_human_speech2
248
+ return new_human_speech1, new_human_speech2, sum_human_speechs
249
+
250
+ def _init_logging(rank):
251
+ # logging
252
+ if rank == 0:
253
+ # set format
254
+ logging.basicConfig(
255
+ level=logging.INFO,
256
+ format="[%(asctime)s] %(levelname)s: %(message)s",
257
+ handlers=[logging.StreamHandler(stream=sys.stdout)])
258
+ else:
259
+ logging.basicConfig(level=logging.ERROR)
260
+
261
+ def get_embedding(speech_array, wav2vec_feature_extractor, audio_encoder, sr=16000, device='cpu'):
262
+ audio_duration = len(speech_array) / sr
263
+ video_length = audio_duration * 25 # Assume the video fps is 25
264
+
265
+ # wav2vec_feature_extractor
266
+ audio_feature = np.squeeze(
267
+ wav2vec_feature_extractor(speech_array, sampling_rate=sr).input_values
268
+ )
269
+ audio_feature = torch.from_numpy(audio_feature).float().to(device=device)
270
+ audio_feature = audio_feature.unsqueeze(0)
271
+
272
+ # audio encoder
273
+ with torch.no_grad():
274
+ embeddings = audio_encoder(audio_feature, seq_len=int(video_length), output_hidden_states=True)
275
+
276
+ if len(embeddings) == 0:
277
+ print("Fail to extract audio embedding")
278
+ return None
279
+
280
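+ # stack the hidden states of every encoder layer (skipping the first, pre-encoder state) and reorder to (seq_len, num_layers, dim)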
+ audio_emb = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
281
+ audio_emb = rearrange(audio_emb, "b s d -> s b d")
282
+
283
+ audio_emb = audio_emb.cpu().detach()
284
+ return audio_emb
285
+
286
+ def extract_audio_from_video(filename, sample_rate):
287
+ raw_audio_path = filename.split('/')[-1].split('.')[0]+'.wav'
288
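+ # use ffmpeg to drop the video stream and dump 16 kHz, 16-bit PCM audio to a temporary wav file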
+ ffmpeg_command = [
289
+ "ffmpeg",
290
+ "-y",
291
+ "-i",
292
+ str(filename),
293
+ "-vn",
294
+ "-acodec",
295
+ "pcm_s16le",
296
+ "-ar",
297
+ "16000",
298
+ "-ac",
299
+ "2",
300
+ str(raw_audio_path),
301
+ ]
302
+ subprocess.run(ffmpeg_command, check=True)
303
+ human_speech_array, sr = librosa.load(raw_audio_path, sr=sample_rate)
304
+ human_speech_array = loudness_norm(human_speech_array, sr)
305
+ os.remove(raw_audio_path)
306
+
307
+ return human_speech_array
308
+
309
+ def audio_prepare_single(audio_path, sample_rate=16000):
310
+ ext = os.path.splitext(audio_path)[1].lower()
311
+ if ext in ['.mp4', '.mov', '.avi', '.mkv']:
312
+ human_speech_array = extract_audio_from_video(audio_path, sample_rate)
313
+ return human_speech_array
314
+ else:
315
+ human_speech_array, sr = librosa.load(audio_path, sr=sample_rate)
316
+ human_speech_array = loudness_norm(human_speech_array, sr)
317
+ return human_speech_array
318
+
319
+ def generate(args):
320
+ rank = int(os.getenv("RANK", 0))
321
+ world_size = int(os.getenv("WORLD_SIZE", 1))
322
+ local_rank = int(os.getenv("LOCAL_RANK", 0))
323
+ device = local_rank
324
+ _init_logging(rank)
325
+
326
+ if args.offload_model is None:
327
+ args.offload_model = False if world_size > 1 else True
328
+ logging.info(
329
+ f"offload_model is not specified, set to {args.offload_model}.")
330
+ if world_size > 1:
331
+ torch.cuda.set_device(local_rank)
332
+ dist.init_process_group(
333
+ backend="nccl",
334
+ init_method="env://",
335
+ rank=rank,
336
+ world_size=world_size)
337
+ else:
338
+ assert not (
339
+ args.t5_fsdp or args.dit_fsdp
340
+ ), "t5_fsdp and dit_fsdp are not supported in non-distributed environments."
341
+ assert not (
342
+ args.ulysses_size > 1 or args.ring_size > 1
343
+ ), "context parallelism is not supported in non-distributed environments."
344
+
345
+ if args.ulysses_size > 1 or args.ring_size > 1:
346
+ assert args.ulysses_size * args.ring_size == world_size, "The product of ulysses_size and ring_size should equal the world size."
347
+ from xfuser.core.distributed import (
348
+ init_distributed_environment,
349
+ initialize_model_parallel,
350
+ )
351
+ init_distributed_environment(
352
+ rank=dist.get_rank(), world_size=dist.get_world_size())
353
+
354
+ initialize_model_parallel(
355
+ sequence_parallel_degree=dist.get_world_size(),
356
+ ring_degree=args.ring_size,
357
+ ulysses_degree=args.ulysses_size,
358
+ )
359
+
360
+ # TODO: use prompt refine
361
+ # if args.use_prompt_extend:
362
+ # if args.prompt_extend_method == "dashscope":
363
+ # prompt_expander = DashScopePromptExpander(
364
+ # model_name=args.prompt_extend_model,
365
+ # is_vl="i2v" in args.task or "flf2v" in args.task)
366
+ # elif args.prompt_extend_method == "local_qwen":
367
+ # prompt_expander = QwenPromptExpander(
368
+ # model_name=args.prompt_extend_model,
369
+ # is_vl="i2v" in args.task,
370
+ # device=rank)
371
+ # else:
372
+ # raise NotImplementedError(
373
+ # f"Unsupport prompt_extend_method: {args.prompt_extend_method}")
374
+
375
+ cfg = WAN_CONFIGS[args.task]
376
+ if args.ulysses_size > 1:
377
+ assert cfg.num_heads % args.ulysses_size == 0, f"`{cfg.num_heads=}` cannot be divided evenly by `{args.ulysses_size=}`."
378
+
379
+ logging.info(f"Generation job args: {args}")
380
+ logging.info(f"Generation model config: {cfg}")
381
+
382
+ if dist.is_initialized():
383
+ base_seed = [args.base_seed] if rank == 0 else [None]
384
+ dist.broadcast_object_list(base_seed, src=0)
385
+ args.base_seed = base_seed[0]
386
+
387
+ assert args.task == "multitalk-14B", 'You should choose multitalk in args.task.'
388
+
389
+
390
+ # TODO: add prompt refine
391
+ # img = Image.open(args.image).convert("RGB")
392
+ # if args.use_prompt_extend:
393
+ # logging.info("Extending prompt ...")
394
+ # if rank == 0:
395
+ # prompt_output = prompt_expander(
396
+ # args.prompt,
397
+ # tar_lang=args.prompt_extend_target_lang,
398
+ # image=img,
399
+ # seed=args.base_seed)
400
+ # if prompt_output.status == False:
401
+ # logging.info(
402
+ # f"Extending prompt failed: {prompt_output.message}")
403
+ # logging.info("Falling back to original prompt.")
404
+ # input_prompt = args.prompt
405
+ # else:
406
+ # input_prompt = prompt_output.prompt
407
+ # input_prompt = [input_prompt]
408
+ # else:
409
+ # input_prompt = [None]
410
+ # if dist.is_initialized():
411
+ # dist.broadcast_object_list(input_prompt, src=0)
412
+ # args.prompt = input_prompt[0]
413
+ # logging.info(f"Extended prompt: {args.prompt}")
414
+
415
+ # read input files
416
+
417
+
418
+
419
+ with open(args.input_json, 'r', encoding='utf-8') as f:
420
+ input_data = json.load(f)
421
+
422
+ wav2vec_feature_extractor, audio_encoder= custom_init('cpu', args.wav2vec_dir)
423
+ args.audio_save_dir = os.path.join(args.audio_save_dir, input_data['cond_image'].split('/')[-1].split('.')[0])
424
+ os.makedirs(args.audio_save_dir,exist_ok=True)
425
+
426
+ if len(input_data['cond_audio'])==2:
427
+ new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(input_data['cond_audio']['person1'], input_data['cond_audio']['person2'], input_data['audio_type'])
428
+ audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
429
+ audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
430
+ emb1_path = os.path.join(args.audio_save_dir, '1.pt')
431
+ emb2_path = os.path.join(args.audio_save_dir, '2.pt')
432
+ sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
433
+ sf.write(sum_audio, sum_human_speechs, 16000)
434
+ torch.save(audio_embedding_1, emb1_path)
435
+ torch.save(audio_embedding_2, emb2_path)
436
+ input_data['cond_audio']['person1'] = emb1_path
437
+ input_data['cond_audio']['person2'] = emb2_path
438
+ input_data['video_audio'] = sum_audio
439
+ elif len(input_data['cond_audio'])==1:
440
+ human_speech = audio_prepare_single(input_data['cond_audio']['person1'])
441
+ audio_embedding = get_embedding(human_speech, wav2vec_feature_extractor, audio_encoder)
442
+ emb_path = os.path.join(args.audio_save_dir, '1.pt')
443
+ sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
444
+ sf.write(sum_audio, human_speech, 16000)
445
+ torch.save(audio_embedding, emb_path)
446
+ input_data['cond_audio']['person1'] = emb_path
447
+ input_data['video_audio'] = sum_audio
448
+
449
+ logging.info("Creating MultiTalk pipeline.")
450
+ wan_i2v = wan.MultiTalkPipeline(
451
+ config=cfg,
452
+ checkpoint_dir=args.ckpt_dir,
453
+ device_id=device,
454
+ rank=rank,
455
+ t5_fsdp=args.t5_fsdp,
456
+ dit_fsdp=args.dit_fsdp,
457
+ use_usp=(args.ulysses_size > 1 or args.ring_size > 1),
458
+ t5_cpu=args.t5_cpu
459
+ )
460
+
461
+ if args.num_persistent_param_in_dit is not None:
462
+ wan_i2v.vram_management = True
463
+ wan_i2v.enable_vram_management(
464
+ num_persistent_param_in_dit=args.num_persistent_param_in_dit
465
+ )
466
+
467
+ logging.info("Generating video ...")
468
+ video = wan_i2v.generate(
469
+ input_data,
470
+ size_buckget=args.size,
471
+ motion_frame=args.motion_frame,
472
+ frame_num=args.frame_num,
473
+ shift=args.sample_shift,
474
+ sampling_steps=args.sample_steps,
475
+ text_guide_scale=args.sample_text_guide_scale,
476
+ audio_guide_scale=args.sample_audio_guide_scale,
477
+ seed=args.base_seed,
478
+ offload_model=args.offload_model,
479
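+ # 'clip' mode renders a single chunk of frame_num frames; 'streaming' mode allows long generation, capped here at 1000 frames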
+ max_frames_num=args.frame_num if args.mode == 'clip' else 1000,
480
+ extra_args=args,
481
+ )
482
+
483
+
484
+ if rank == 0:
485
+
486
+ if args.save_file is None:
487
+ formatted_time = datetime.now().strftime("%Y%m%d_%H%M%S")
488
+ formatted_prompt = input_data['prompt'].replace(" ", "_").replace("/",
489
+ "_")[:50]
490
+ args.save_file = f"{args.task}_{args.size.replace('*','x') if sys.platform=='win32' else args.size}_{args.ulysses_size}_{args.ring_size}_{formatted_prompt}_{formatted_time}"
491
+
492
+ logging.info(f"Saving generated video to {args.save_file}.mp4")
493
+ save_video_ffmpeg(video, args.save_file, [input_data['video_audio']])
494
+
495
+ logging.info("Finished.")
496
+
497
+
498
+ if __name__ == "__main__":
499
+ args = _parse_args()
500
+ generate(args)
requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ opencv-python>=4.9.0.80
2
+ diffusers>=0.31.0
3
+ transformers>=4.49.0
4
+ tokenizers>=0.20.3
5
+ accelerate>=1.1.1
6
+ tqdm
7
+ imageio
8
+ easydict
9
+ ftfy
10
+ dashscope
11
+ imageio-ffmpeg
12
+ gradio>=5.0.0
13
+ numpy>=1.23.5,<2
14
+ xfuser>=0.4.1
15
+ pyloudnorm
src/audio_analysis/torch_utils.py ADDED
@@ -0,0 +1,20 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+
5
+ def get_mask_from_lengths(lengths, max_len=None):
6
+ lengths = lengths.to(torch.long)
7
+ if max_len is None:
8
+ max_len = torch.max(lengths).item()
9
+
10
+ ids = torch.arange(0, max_len).unsqueeze(0).expand(lengths.shape[0], -1).to(lengths.device)
11
+ mask = ids < lengths.unsqueeze(1).expand(-1, max_len)
12
+
13
+ return mask
14
+
15
+
16
+ def linear_interpolation(features, seq_len):
17
+ features = features.transpose(1, 2)
18
+ output_features = F.interpolate(features, size=seq_len, align_corners=True, mode='linear')
19
+ return output_features.transpose(1, 2)
20
+
src/audio_analysis/wav2vec2.py ADDED
@@ -0,0 +1,125 @@
1
+ from transformers import Wav2Vec2Config, Wav2Vec2Model
2
+ from transformers.modeling_outputs import BaseModelOutput
3
+
4
+ from src.audio_analysis.torch_utils import linear_interpolation
5
+
6
+ # the implementation of Wav2Vec2Model is borrowed from
7
+ # https://github.com/huggingface/transformers/blob/HEAD/src/transformers/models/wav2vec2/modeling_wav2vec2.py
8
+ # initialize our encoder with the pre-trained wav2vec 2.0 weights.
9
+ class Wav2Vec2Model(Wav2Vec2Model):
10
+ def __init__(self, config: Wav2Vec2Config):
11
+ super().__init__(config)
12
+
13
+ def forward(
14
+ self,
15
+ input_values,
16
+ seq_len,
17
+ attention_mask=None,
18
+ mask_time_indices=None,
19
+ output_attentions=None,
20
+ output_hidden_states=None,
21
+ return_dict=None,
22
+ ):
23
+ self.config.output_attentions = True
24
+
25
+ output_hidden_states = (
26
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
27
+ )
28
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
29
+
30
+ extract_features = self.feature_extractor(input_values)
31
+ extract_features = extract_features.transpose(1, 2)
32
+ extract_features = linear_interpolation(extract_features, seq_len=seq_len)
33
+
34
+ if attention_mask is not None:
35
+ # compute reduced attention_mask corresponding to feature vectors
36
+ attention_mask = self._get_feature_vector_attention_mask(
37
+ extract_features.shape[1], attention_mask, add_adapter=False
38
+ )
39
+
40
+ hidden_states, extract_features = self.feature_projection(extract_features)
41
+ hidden_states = self._mask_hidden_states(
42
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
43
+ )
44
+
45
+ encoder_outputs = self.encoder(
46
+ hidden_states,
47
+ attention_mask=attention_mask,
48
+ output_attentions=output_attentions,
49
+ output_hidden_states=output_hidden_states,
50
+ return_dict=return_dict,
51
+ )
52
+
53
+ hidden_states = encoder_outputs[0]
54
+
55
+ if self.adapter is not None:
56
+ hidden_states = self.adapter(hidden_states)
57
+
58
+ if not return_dict:
59
+ return (hidden_states, ) + encoder_outputs[1:]
60
+ return BaseModelOutput(
61
+ last_hidden_state=hidden_states,
62
+ hidden_states=encoder_outputs.hidden_states,
63
+ attentions=encoder_outputs.attentions,
64
+ )
65
+
66
+
67
+ def feature_extract(
68
+ self,
69
+ input_values,
70
+ seq_len,
71
+ ):
72
+ extract_features = self.feature_extractor(input_values)
73
+ extract_features = extract_features.transpose(1, 2)
74
+ extract_features = linear_interpolation(extract_features, seq_len=seq_len)
75
+
76
+ return extract_features
77
+
78
+ def encode(
79
+ self,
80
+ extract_features,
81
+ attention_mask=None,
82
+ mask_time_indices=None,
83
+ output_attentions=None,
84
+ output_hidden_states=None,
85
+ return_dict=None,
86
+ ):
87
+ self.config.output_attentions = True
88
+
89
+ output_hidden_states = (
90
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
91
+ )
92
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
93
+
94
+ if attention_mask is not None:
95
+ # compute reduced attention_mask corresponding to feature vectors
96
+ attention_mask = self._get_feature_vector_attention_mask(
97
+ extract_features.shape[1], attention_mask, add_adapter=False
98
+ )
99
+
100
+
101
+ hidden_states, extract_features = self.feature_projection(extract_features)
102
+ hidden_states = self._mask_hidden_states(
103
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
104
+ )
105
+
106
+ encoder_outputs = self.encoder(
107
+ hidden_states,
108
+ attention_mask=attention_mask,
109
+ output_attentions=output_attentions,
110
+ output_hidden_states=output_hidden_states,
111
+ return_dict=return_dict,
112
+ )
113
+
114
+ hidden_states = encoder_outputs[0]
115
+
116
+ if self.adapter is not None:
117
+ hidden_states = self.adapter(hidden_states)
118
+
119
+ if not return_dict:
120
+ return (hidden_states, ) + encoder_outputs[1:]
121
+ return BaseModelOutput(
122
+ last_hidden_state=hidden_states,
123
+ hidden_states=encoder_outputs.hidden_states,
124
+ attentions=encoder_outputs.attentions,
125
+ )
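The subclass above keeps standard wav2vec 2.0 weights but linearly resamples the CNN features to a caller-chosen seq_len before encoding. A hedged usage sketch (the checkpoint id is an assumption; the pipeline wires up its own feature extractor and save paths):

import numpy as np
import torch
from transformers import Wav2Vec2FeatureExtractor

from src.audio_analysis.wav2vec2 import Wav2Vec2Model

model_id = "facebook/wav2vec2-base-960h"            # assumption: any wav2vec 2.0 checkpoint
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
audio_encoder = Wav2Vec2Model.from_pretrained(model_id).eval()

speech = np.zeros(16000 * 2, dtype=np.float32)      # 2 s of silent 16 kHz audio
inputs = feature_extractor(speech, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    # seq_len controls how many frames the features are interpolated to
    out = audio_encoder(inputs.input_values, seq_len=50, output_hidden_states=True)
print(out.last_hidden_state.shape)                  # torch.Size([1, 50, 768])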
src/utils.py ADDED
@@ -0,0 +1,60 @@
1
+ from contextlib import contextmanager
2
+
3
+ import torch
4
+
5
+ @contextmanager
6
+ def init_weights_on_device(device=torch.device("meta"), include_buffers: bool = False):
7
+ old_register_parameter = torch.nn.Module.register_parameter
8
+ if include_buffers:
9
+ old_register_buffer = torch.nn.Module.register_buffer
10
+
11
+ def register_empty_parameter(module, name, param):
12
+ old_register_parameter(module, name, param)
13
+ if param is not None:
14
+ param_cls = type(module._parameters[name])
15
+ kwargs = module._parameters[name].__dict__
16
+ kwargs["requires_grad"] = param.requires_grad
17
+ module._parameters[name] = param_cls(
18
+ module._parameters[name].to(device), **kwargs
19
+ )
20
+
21
+ def register_empty_buffer(module, name, buffer, persistent=True):
22
+ old_register_buffer(module, name, buffer, persistent=persistent)
23
+ if buffer is not None:
24
+ module._buffers[name] = module._buffers[name].to(device)
25
+
26
+ def patch_tensor_constructor(fn):
27
+ def wrapper(*args, **kwargs):
28
+ kwargs["device"] = device
29
+ return fn(*args, **kwargs)
30
+
31
+ return wrapper
32
+
33
+ if include_buffers:
34
+ tensor_constructors_to_patch = {
35
+ torch_function_name: getattr(torch, torch_function_name)
36
+ for torch_function_name in ["empty", "zeros", "ones", "full"]
37
+ }
38
+ else:
39
+ tensor_constructors_to_patch = {}
40
+
41
+ try:
42
+ torch.nn.Module.register_parameter = register_empty_parameter
43
+ if include_buffers:
44
+ torch.nn.Module.register_buffer = register_empty_buffer
45
+ for torch_function_name in tensor_constructors_to_patch.keys():
46
+ setattr(
47
+ torch,
48
+ torch_function_name,
49
+ patch_tensor_constructor(getattr(torch, torch_function_name)),
50
+ )
51
+ yield
52
+ finally:
53
+ torch.nn.Module.register_parameter = old_register_parameter
54
+ if include_buffers:
55
+ torch.nn.Module.register_buffer = old_register_buffer
56
+ for (
57
+ torch_function_name,
58
+ old_torch_function,
59
+ ) in tensor_constructors_to_patch.items():
60
+ setattr(torch, torch_function_name, old_torch_function)
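The context manager above temporarily patches parameter (and optionally buffer and tensor-constructor) registration so that modules are materialized on a target device. A small sketch using the meta device, which records shapes without allocating storage:

import torch

from src.utils import init_weights_on_device

# Build a large layer as a meta-device shell: useful for constructing a model
# skeleton before real weights are loaded from a checkpoint.
with init_weights_on_device(device=torch.device("meta")):
    layer = torch.nn.Linear(5120, 13824)

print(layer.weight.device)    # meta
print(layer.weight.shape)     # torch.Size([13824, 5120])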
src/vram_management/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .layers import *
src/vram_management/layers.py ADDED
@@ -0,0 +1,179 @@
1
+ import copy
2
+
3
+ import torch
4
+
5
+ from src.utils import init_weights_on_device
6
+
7
+
8
+ def cast_to(weight, dtype, device):
9
+ r = torch.empty_like(weight, dtype=dtype, device=device)
10
+ r.copy_(weight)
11
+ return r
12
+
13
+
14
+ class AutoWrappedModule(torch.nn.Module):
15
+ def __init__(
16
+ self,
17
+ module: torch.nn.Module,
18
+ offload_dtype,
19
+ offload_device,
20
+ onload_dtype,
21
+ onload_device,
22
+ computation_dtype,
23
+ computation_device,
24
+ ):
25
+ super().__init__()
26
+ self.module = module.to(dtype=offload_dtype, device=offload_device)
27
+ self.offload_dtype = offload_dtype
28
+ self.offload_device = offload_device
29
+ self.onload_dtype = onload_dtype
30
+ self.onload_device = onload_device
31
+ self.computation_dtype = computation_dtype
32
+ self.computation_device = computation_device
33
+ self.state = 0
34
+
35
+ def offload(self):
36
+ if self.state == 1 and (
37
+ self.offload_dtype != self.onload_dtype
38
+ or self.offload_device != self.onload_device
39
+ ):
40
+ self.module.to(dtype=self.offload_dtype, device=self.offload_device)
41
+ self.state = 0
42
+
43
+ def onload(self):
44
+ if self.state == 0 and (
45
+ self.offload_dtype != self.onload_dtype
46
+ or self.offload_device != self.onload_device
47
+ ):
48
+ self.module.to(dtype=self.onload_dtype, device=self.onload_device)
49
+ self.state = 1
50
+
51
+ def forward(self, *args, **kwargs):
52
+ if (
53
+ self.onload_dtype == self.computation_dtype
54
+ and self.onload_device == self.computation_device
55
+ ):
56
+ module = self.module
57
+ else:
58
+ module = copy.deepcopy(self.module).to(
59
+ dtype=self.computation_dtype, device=self.computation_device
60
+ )
61
+ return module(*args, **kwargs)
62
+
63
+
64
+ class AutoWrappedLinear(torch.nn.Linear):
65
+ def __init__(
66
+ self,
67
+ module: torch.nn.Linear,
68
+ offload_dtype,
69
+ offload_device,
70
+ onload_dtype,
71
+ onload_device,
72
+ computation_dtype,
73
+ computation_device,
74
+ ):
75
+ with init_weights_on_device(device=torch.device("meta")):
76
+ super().__init__(
77
+ in_features=module.in_features,
78
+ out_features=module.out_features,
79
+ bias=module.bias is not None,
80
+ dtype=offload_dtype,
81
+ device=offload_device,
82
+ )
83
+ self.weight = module.weight
84
+ self.bias = module.bias
85
+ self.offload_dtype = offload_dtype
86
+ self.offload_device = offload_device
87
+ self.onload_dtype = onload_dtype
88
+ self.onload_device = onload_device
89
+ self.computation_dtype = computation_dtype
90
+ self.computation_device = computation_device
91
+ self.state = 0
92
+
93
+ def offload(self):
94
+ if self.state == 1 and (
95
+ self.offload_dtype != self.onload_dtype
96
+ or self.offload_device != self.onload_device
97
+ ):
98
+ self.to(dtype=self.offload_dtype, device=self.offload_device)
99
+ self.state = 0
100
+
101
+ def onload(self):
102
+ if self.state == 0 and (
103
+ self.offload_dtype != self.onload_dtype
104
+ or self.offload_device != self.onload_device
105
+ ):
106
+ self.to(dtype=self.onload_dtype, device=self.onload_device)
107
+ self.state = 1
108
+
109
+ def forward(self, x, *args, **kwargs):
110
+ if (
111
+ self.onload_dtype == self.computation_dtype
112
+ and self.onload_device == self.computation_device
113
+ ):
114
+ weight, bias = self.weight, self.bias
115
+ else:
116
+ weight = cast_to(
117
+ self.weight, self.computation_dtype, self.computation_device
118
+ )
119
+ bias = (
120
+ None
121
+ if self.bias is None
122
+ else cast_to(self.bias, self.computation_dtype, self.computation_device)
123
+ )
124
+ return torch.nn.functional.linear(x, weight, bias)
125
+
126
+
127
+ def enable_vram_management_recursively(
128
+ model: torch.nn.Module,
129
+ module_map: dict,
130
+ module_config: dict,
131
+ max_num_param=None,
132
+ overflow_module_config: dict = None,
133
+ total_num_param=0,
134
+ ):
135
+ for name, module in model.named_children():
136
+ for source_module, target_module in module_map.items():
137
+ if isinstance(module, source_module):
138
+ num_param = sum(p.numel() for p in module.parameters())
139
+ # print(str(module) + ':' + str(num_param))
140
+ if (
141
+ max_num_param is not None
142
+ and total_num_param + num_param > max_num_param
143
+ ):
144
+ # print(str(module) + '-->\t\t num:' + str(num_param) + "\t total:" + str(total_num_param))
145
+ module_config_ = overflow_module_config
146
+ else:
147
+ module_config_ = module_config
148
+ module_ = target_module(module, **module_config_)
149
+ setattr(model, name, module_)
150
+ total_num_param += num_param
151
+ break
152
+ else:
153
+ total_num_param = enable_vram_management_recursively(
154
+ module,
155
+ module_map,
156
+ module_config,
157
+ max_num_param,
158
+ overflow_module_config,
159
+ total_num_param,
160
+ )
161
+ return total_num_param
162
+
163
+
164
+ def enable_vram_management(
165
+ model: torch.nn.Module,
166
+ module_map: dict,
167
+ module_config: dict,
168
+ max_num_param=None,
169
+ overflow_module_config: dict = None,
170
+ ):
171
+ enable_vram_management_recursively(
172
+ model,
173
+ module_map,
174
+ module_config,
175
+ max_num_param,
176
+ overflow_module_config,
177
+ total_num_param=0,
178
+ )
179
+ model.vram_management_enabled = True
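enable_vram_management walks the module tree, swaps every matched submodule for its wrapped counterpart, and routes anything beyond max_num_param to overflow_module_config. A hedged configuration sketch (the dtype/device split below is an illustrative choice, not the pipeline's exact settings):

import torch

from src.vram_management import AutoWrappedLinear, AutoWrappedModule, enable_vram_management

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.LayerNorm(64))

shared_config = dict(
    offload_dtype=torch.bfloat16, offload_device="cpu",           # where idle weights live
    onload_dtype=torch.bfloat16, onload_device="cpu",             # kept on CPU ...
    computation_dtype=torch.bfloat16, computation_device="cuda",  # ... cast to GPU per forward
)
enable_vram_management(
    model,
    module_map={torch.nn.Linear: AutoWrappedLinear, torch.nn.LayerNorm: AutoWrappedModule},
    module_config=shared_config,
    max_num_param=None,    # no parameter budget, so every matched module uses shared_config
)
# AutoWrappedLinear now casts weight/bias to the computation device inside forward().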
wan/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from . import configs, distributed, modules
2
+ from .first_last_frame2video import WanFLF2V
3
+ from .image2video import WanI2V
4
+ from .text2video import WanT2V
5
+ from .vace import WanVace, WanVaceMP
6
+ from .multitalk import MultiTalkPipeline
wan/configs/__init__.py ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import copy
3
+ import os
4
+
5
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
6
+
7
+ from .wan_i2v_14B import i2v_14B
8
+ from .wan_t2v_1_3B import t2v_1_3B
9
+ from .wan_t2v_14B import t2v_14B
10
+ from .wan_multitalk_14B import multitalk_14B
11
+
12
+ # the config of t2i_14B is the same as t2v_14B
13
+ t2i_14B = copy.deepcopy(t2v_14B)
14
+ t2i_14B.__name__ = 'Config: Wan T2I 14B'
15
+
16
+ # the config of flf2v_14B is the same as i2v_14B
17
+ flf2v_14B = copy.deepcopy(i2v_14B)
18
+ flf2v_14B.__name__ = 'Config: Wan FLF2V 14B'
19
+ flf2v_14B.sample_neg_prompt = "镜头切换," + flf2v_14B.sample_neg_prompt
20
+
21
+ WAN_CONFIGS = {
22
+ 't2v-14B': t2v_14B,
23
+ 't2v-1.3B': t2v_1_3B,
24
+ 'i2v-14B': i2v_14B,
25
+ 't2i-14B': t2i_14B,
26
+ 'flf2v-14B': flf2v_14B,
27
+ 'vace-1.3B': t2v_1_3B,
28
+ 'vace-14B': t2v_14B,
29
+ 'multitalk-14B': multitalk_14B,
30
+ }
31
+
32
+ SIZE_CONFIGS = {
33
+ '720*1280': (720, 1280),
34
+ '1280*720': (1280, 720),
35
+ '480*832': (480, 832),
36
+ '832*480': (832, 480),
37
+ '1024*1024': (1024, 1024),
38
+ 'multitalk-480': (640, 640),
39
+ 'multitalk-720': (960, 960),
40
+ }
41
+
42
+ MAX_AREA_CONFIGS = {
43
+ '720*1280': 720 * 1280,
44
+ '1280*720': 1280 * 720,
45
+ '480*832': 480 * 832,
46
+ '832*480': 832 * 480,
47
+ }
48
+
49
+ SUPPORTED_SIZES = {
50
+ 't2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
51
+ 't2v-1.3B': ('480*832', '832*480'),
52
+ 'i2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
53
+ 'flf2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
54
+ 't2i-14B': tuple(SIZE_CONFIGS.keys()),
55
+ 'vace-1.3B': ('480*832', '832*480'),
56
+ 'vace-14B': ('720*1280', '1280*720', '480*832', '832*480'),
57
+ 'multitalk-14B': ('multitalk-480', 'multitalk-720'),
58
+ }
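These tables are what a caller resolves its task and size choices against (generate_multitalk.py does so via args.task and args.size). For example:

from wan.configs import SIZE_CONFIGS, SUPPORTED_SIZES, WAN_CONFIGS

cfg = WAN_CONFIGS['multitalk-14B']
print(cfg.num_heads, cfg.num_layers)          # 40 40
print(SIZE_CONFIGS['multitalk-480'])          # (640, 640) generation bucket
print(SUPPORTED_SIZES['multitalk-14B'])       # ('multitalk-480', 'multitalk-720')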
wan/configs/shared_config.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
+ #------------------------ Wan shared config ------------------------#
6
+ wan_shared_cfg = EasyDict()
7
+
8
+ # t5
9
+ wan_shared_cfg.t5_model = 'umt5_xxl'
10
+ wan_shared_cfg.t5_dtype = torch.bfloat16
11
+ wan_shared_cfg.text_len = 512
12
+
13
+ # transformer
14
+ wan_shared_cfg.param_dtype = torch.bfloat16
15
+
16
+ # inference
17
+ wan_shared_cfg.num_train_timesteps = 1000
18
+ wan_shared_cfg.sample_fps = 16
19
+ wan_shared_cfg.sample_neg_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
wan/configs/wan_i2v_14B.py ADDED
@@ -0,0 +1,24 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
+ from .shared_config import wan_shared_cfg
6
+
7
+ #------------------------ Wan I2V 14B ------------------------#
8
+
9
+ i2v_14B = EasyDict(__name__='Config: Wan I2V 14B')
10
+ i2v_14B.update(wan_shared_cfg)
11
+ i2v_14B.sample_neg_prompt = "镜头晃动," + i2v_14B.sample_neg_prompt
12
+
13
+ i2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
14
+ i2v_14B.t5_tokenizer = 'google/umt5-xxl'
15
+
16
+ # clip
17
+ i2v_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
18
+ i2v_14B.clip_dtype = torch.float16
19
+ i2v_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
20
+ i2v_14B.clip_tokenizer = 'xlm-roberta-large'
21
+
22
+ # vae
23
+ i2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
24
+ i2v_14B.vae_stride = (4, 8, 8)
wan/configs/wan_multitalk_14B.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
+ from .shared_config import wan_shared_cfg
6
+
7
+ #------------------------ Wan MultiTalk 14B ------------------------#
8
+
9
+ multitalk_14B = EasyDict(__name__='Config: Wan MultiTalk AI2V 14B')
10
+ multitalk_14B.update(wan_shared_cfg)
11
+ multitalk_14B.sample_neg_prompt = 'bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards'
12
+
13
+ multitalk_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
14
+ multitalk_14B.t5_tokenizer = 'google/umt5-xxl'
15
+
16
+ # clip
17
+ multitalk_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
18
+ multitalk_14B.clip_dtype = torch.float16
19
+ multitalk_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
20
+ multitalk_14B.clip_tokenizer = 'xlm-roberta-large'
21
+
22
+ # vae
23
+ multitalk_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
24
+ multitalk_14B.vae_stride = (4, 8, 8)
25
+
26
+ # transformer
27
+ multitalk_14B.patch_size = (1, 2, 2)
28
+ multitalk_14B.dim = 5120
29
+ multitalk_14B.ffn_dim = 13824
30
+ multitalk_14B.freq_dim = 256
31
+ multitalk_14B.num_heads = 40
32
+ multitalk_14B.num_layers = 40
33
+ multitalk_14B.window_size = (-1, -1)
34
+ multitalk_14B.qk_norm = True
35
+ multitalk_14B.cross_attn_norm = True
36
+ multitalk_14B.eps = 1e-6
wan/configs/wan_t2v_14B.py ADDED
@@ -0,0 +1,29 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ from easydict import EasyDict
3
+
4
+ from .shared_config import wan_shared_cfg
5
+
6
+ #------------------------ Wan T2V 14B ------------------------#
7
+
8
+ t2v_14B = EasyDict(__name__='Config: Wan T2V 14B')
9
+ t2v_14B.update(wan_shared_cfg)
10
+
11
+ # t5
12
+ t2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
13
+ t2v_14B.t5_tokenizer = 'google/umt5-xxl'
14
+
15
+ # vae
16
+ t2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
17
+ t2v_14B.vae_stride = (4, 8, 8)
18
+
19
+ # transformer
20
+ t2v_14B.patch_size = (1, 2, 2)
21
+ t2v_14B.dim = 5120
22
+ t2v_14B.ffn_dim = 13824
23
+ t2v_14B.freq_dim = 256
24
+ t2v_14B.num_heads = 40
25
+ t2v_14B.num_layers = 40
26
+ t2v_14B.window_size = (-1, -1)
27
+ t2v_14B.qk_norm = True
28
+ t2v_14B.cross_attn_norm = True
29
+ t2v_14B.eps = 1e-6
wan/configs/wan_t2v_1_3B.py ADDED
@@ -0,0 +1,29 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ from easydict import EasyDict
3
+
4
+ from .shared_config import wan_shared_cfg
5
+
6
+ #------------------------ Wan T2V 1.3B ------------------------#
7
+
8
+ t2v_1_3B = EasyDict(__name__='Config: Wan T2V 1.3B')
9
+ t2v_1_3B.update(wan_shared_cfg)
10
+
11
+ # t5
12
+ t2v_1_3B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
13
+ t2v_1_3B.t5_tokenizer = 'google/umt5-xxl'
14
+
15
+ # vae
16
+ t2v_1_3B.vae_checkpoint = 'Wan2.1_VAE.pth'
17
+ t2v_1_3B.vae_stride = (4, 8, 8)
18
+
19
+ # transformer
20
+ t2v_1_3B.patch_size = (1, 2, 2)
21
+ t2v_1_3B.dim = 1536
22
+ t2v_1_3B.ffn_dim = 8960
23
+ t2v_1_3B.freq_dim = 256
24
+ t2v_1_3B.num_heads = 12
25
+ t2v_1_3B.num_layers = 30
26
+ t2v_1_3B.window_size = (-1, -1)
27
+ t2v_1_3B.qk_norm = True
28
+ t2v_1_3B.cross_attn_norm = True
29
+ t2v_1_3B.eps = 1e-6
wan/distributed/__init__.py ADDED
File without changes
wan/distributed/fsdp.py ADDED
@@ -0,0 +1,43 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import gc
3
+ from functools import partial
4
+
5
+ import torch
6
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
7
+ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
8
+ from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
9
+ from torch.distributed.utils import _free_storage
10
+
11
+
12
+ def shard_model(
13
+ model,
14
+ device_id,
15
+ param_dtype=torch.bfloat16,
16
+ reduce_dtype=torch.float32,
17
+ buffer_dtype=torch.float32,
18
+ process_group=None,
19
+ sharding_strategy=ShardingStrategy.FULL_SHARD,
20
+ sync_module_states=True,
21
+ ):
22
+ model = FSDP(
23
+ module=model,
24
+ process_group=process_group,
25
+ sharding_strategy=sharding_strategy,
26
+ auto_wrap_policy=partial(
27
+ lambda_auto_wrap_policy, lambda_fn=lambda m: m in model.blocks),
28
+ # mixed_precision=MixedPrecision(
29
+ # param_dtype=param_dtype,
30
+ # reduce_dtype=reduce_dtype,
31
+ # buffer_dtype=buffer_dtype),
32
+ device_id=device_id,
33
+ sync_module_states=sync_module_states)
34
+ return model
35
+
36
+
37
+ def free_model(model):
38
+ for m in model.modules():
39
+ if isinstance(m, FSDP):
40
+ _free_storage(m._handle.flat_param.data)
41
+ del model
42
+ gc.collect()
43
+ torch.cuda.empty_cache()
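shard_model wraps a model in FSDP with a lambda policy that makes each entry of model.blocks its own shard unit, so it assumes the Wan-style transformer layout. A hedged sketch of the call under torchrun (model construction itself is elided):

import torch
import torch.distributed as dist

from wan.distributed.fsdp import shard_model

# Assumes torchrun has set the usual env vars and that `model` exposes `.blocks`,
# as WanModel does.
dist.init_process_group(backend="nccl")
device_id = dist.get_rank() % torch.cuda.device_count()
model = build_model()   # placeholder: construct the Wan transformer elsewhere
sharded = shard_model(model, device_id=device_id)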
wan/distributed/xdit_context_parallel.py ADDED
@@ -0,0 +1,550 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.cuda.amp as amp
6
+ from xfuser.core.distributed import (
7
+ get_sequence_parallel_rank,
8
+ get_sequence_parallel_world_size,
9
+ get_sp_group,
10
+ )
11
+ from einops import rearrange
12
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
13
+ import xformers.ops
14
+
15
+ from ..modules.model import sinusoidal_embedding_1d
16
+ from ..utils.multitalk_utils import get_attn_map_with_target, split_token_counts_and_frame_ids, normalize_and_scale
17
+ from ..modules.attention import SingleStreamAttention, SingleStreamMutiAttention
18
+
19
+
20
+ def pad_freqs(original_tensor, target_len):
21
+ seq_len, s1, s2 = original_tensor.shape
22
+ pad_size = target_len - seq_len
23
+ padding_tensor = torch.ones(
24
+ pad_size,
25
+ s1,
26
+ s2,
27
+ dtype=original_tensor.dtype,
28
+ device=original_tensor.device)
29
+ padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
30
+ return padded_tensor
31
+
32
+
33
+ @amp.autocast(enabled=False)
34
+ def rope_apply(x, grid_sizes, freqs):
35
+ """
36
+ x: [B, L, N, C].
37
+ grid_sizes: [B, 3].
38
+ freqs: [M, C // 2].
39
+ """
40
+ s, n, c = x.size(1), x.size(2), x.size(3) // 2
41
+ # split freqs
42
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1) # [[N, head_dim/2], [N, head_dim/2], [N, head_dim/2]] # T H W polar-form frequencies
43
+
44
+ # loop over samples
45
+ output = []
46
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
47
+ seq_len = f * h * w
48
+
49
+ # precompute multipliers
50
+ x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(
51
+ s, n, -1, 2)) # [L, N, C/2] # complex (polar) form
52
+ freqs_i = torch.cat([
53
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
54
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
55
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
56
+ ],
57
+ dim=-1).reshape(seq_len, 1, -1) # seq_lens, 1, 3 * dim / 2 (T H W)
58
+
59
+ # apply rotary embedding
60
+ sp_size = get_sequence_parallel_world_size()
61
+ sp_rank = get_sequence_parallel_rank()
62
+ freqs_i = pad_freqs(freqs_i, s * sp_size)
63
+ s_per_rank = s
64
+ freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
65
+ s_per_rank), :, :]
66
+ x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
67
+ x_i = torch.cat([x_i, x[i, s:]])
68
+
69
+ # append to collection
70
+ output.append(x_i)
71
+ return torch.stack(output).float()
72
+
73
+
74
+ def usp_dit_forward_vace(self, x, vace_context, seq_len, kwargs):
75
+ # embeddings
76
+ c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
77
+ c = [u.flatten(2).transpose(1, 2) for u in c]
78
+ c = torch.cat([
79
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
80
+ for u in c
81
+ ])
82
+
83
+ # arguments
84
+ new_kwargs = dict(x=x)
85
+ new_kwargs.update(kwargs)
86
+
87
+ # Context Parallel
88
+ c = torch.chunk(
89
+ c, get_sequence_parallel_world_size(),
90
+ dim=1)[get_sequence_parallel_rank()]
91
+
92
+ hints = []
93
+ for block in self.vace_blocks:
94
+ c, c_skip = block(c, **new_kwargs)
95
+ hints.append(c_skip)
96
+ return hints
97
+
98
+
99
+ def usp_dit_forward(
100
+ self,
101
+ x,
102
+ t,
103
+ context,
104
+ seq_len,
105
+ vace_context=None,
106
+ vace_context_scale=1.0,
107
+ clip_fea=None,
108
+ y=None,
109
+ ):
110
+ """
111
+ x: A list of videos each with shape [C, T, H, W].
112
+ t: [B].
113
+ context: A list of text embeddings each with shape [L, C].
114
+ """
115
+ if self.model_type == 'i2v':
116
+ assert clip_fea is not None and y is not None
117
+ # params
118
+ device = self.patch_embedding.weight.device
119
+ if self.freqs.device != device:
120
+ self.freqs = self.freqs.to(device)
121
+
122
+ if self.model_type != 'vace' and y is not None:
123
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
124
+
125
+ # embeddings
126
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
127
+ grid_sizes = torch.stack(
128
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
129
+ x = [u.flatten(2).transpose(1, 2) for u in x]
130
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
131
+ assert seq_lens.max() <= seq_len
132
+ x = torch.cat([
133
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
134
+ for u in x
135
+ ])
136
+
137
+ # time embeddings
138
+ with amp.autocast(dtype=torch.float32):
139
+ e = self.time_embedding(
140
+ sinusoidal_embedding_1d(self.freq_dim, t).float())
141
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
142
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
143
+
144
+ # context
145
+ context_lens = None
146
+ context = self.text_embedding(
147
+ torch.stack([
148
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
149
+ for u in context
150
+ ]))
151
+
152
+ if self.model_type != 'vace' and clip_fea is not None:
153
+ context_clip = self.img_emb(clip_fea) # bs x 257 x dim
154
+ context = torch.concat([context_clip, context], dim=1)
155
+
156
+ # arguments
157
+ kwargs = dict(
158
+ e=e0,
159
+ seq_lens=seq_lens,
160
+ grid_sizes=grid_sizes,
161
+ freqs=self.freqs,
162
+ context=context,
163
+ context_lens=context_lens)
164
+
165
+ # Context Parallel
166
+ x = torch.chunk(
167
+ x, get_sequence_parallel_world_size(),
168
+ dim=1)[get_sequence_parallel_rank()]
169
+
170
+ for block in self.blocks:
171
+ x = block(x, **kwargs)
172
+
173
+ # head
174
+ x = self.head(x, e)
175
+
176
+ # Context Parallel
177
+ x = get_sp_group().all_gather(x, dim=1)
178
+
179
+ # unpatchify
180
+ x = self.unpatchify(x, grid_sizes)
181
+ return [u.float() for u in x]
182
+
183
+
184
+ def usp_attn_forward(self,
185
+ x,
186
+ seq_lens,
187
+ grid_sizes,
188
+ freqs,
189
+ dtype=torch.bfloat16):
190
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
191
+ half_dtypes = (torch.float16, torch.bfloat16)
192
+
193
+ def half(x):
194
+ return x if x.dtype in half_dtypes else x.to(dtype)
195
+
196
+ # query, key, value function
197
+ def qkv_fn(x):
198
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
199
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
200
+ v = self.v(x).view(b, s, n, d)
201
+ return q, k, v
202
+
203
+ q, k, v = qkv_fn(x)
204
+ q = rope_apply(q, grid_sizes, freqs)
205
+ k = rope_apply(k, grid_sizes, freqs)
206
+
207
+ # TODO: We should use unpadded q, k, v for attention.
208
+ # k_lens = seq_lens // get_sequence_parallel_world_size()
209
+ # if k_lens is not None:
210
+ # q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
211
+ # k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
212
+ # v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)
213
+
214
+ x = xFuserLongContextAttention()(
215
+ None,
216
+ query=half(q),
217
+ key=half(k),
218
+ value=half(v),
219
+ window_size=self.window_size)
220
+
221
+ # TODO: padding after attention.
222
+ # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)
223
+
224
+ # output
225
+ x = x.flatten(2)
226
+ x = self.o(x)
227
+ return x
228
+
229
+
230
+
231
+
232
+ def usp_dit_forward_multitalk(
233
+ self,
234
+ x,
235
+ t,
236
+ context,
237
+ seq_len,
238
+ clip_fea=None,
239
+ y=None,
240
+ audio=None,
241
+ ref_target_masks=None,
242
+ ):
243
+ """
244
+ x: A list of videos each with shape [C, T, H, W].
245
+ t: [B].
246
+ context: A list of text embeddings each with shape [L, C].
247
+ """
248
+
249
+ assert clip_fea is not None and y is not None
250
+ # params
251
+ device = self.patch_embedding.weight.device
252
+ if self.freqs.device != device:
253
+ self.freqs = self.freqs.to(device)
254
+
255
+ _, T, H, W = x[0].shape
256
+ N_t = T // self.patch_size[0]
257
+ N_h = H // self.patch_size[1]
258
+ N_w = W // self.patch_size[2]
259
+
260
+ if y is not None:
261
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
262
+ x[0] = x[0].to(context[0].dtype)
263
+
264
+ # embeddings
265
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
266
+ grid_sizes = torch.stack(
267
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
268
+ x = [u.flatten(2).transpose(1, 2) for u in x]
269
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
270
+ assert seq_lens.max() <= seq_len
271
+ x = torch.cat([
272
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
273
+ for u in x
274
+ ])
275
+
276
+ # time embeddings
277
+ with amp.autocast(dtype=torch.float32):
278
+ e = self.time_embedding(
279
+ sinusoidal_embedding_1d(self.freq_dim, t).float())
280
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
281
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
282
+
283
+ # context
284
+ context_lens = None
285
+ context = self.text_embedding(
286
+ torch.stack([
287
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
288
+ for u in context
289
+ ]))
290
+
291
+ if clip_fea is not None:
292
+ context_clip = self.img_emb(clip_fea)
293
+ context = torch.concat([context_clip, context], dim=1)
294
+
295
+ # get audio token
296
+ audio_cond = audio.to(device=x.device, dtype=x.dtype)
297
+ first_frame_audio_emb_s = audio_cond[:, :1, ...]
298
+ latter_frame_audio_emb = audio_cond[:, 1:, ...]
299
+ latter_frame_audio_emb = rearrange(latter_frame_audio_emb, "b (n_t n) w s c -> b n_t n w s c", n=self.vae_scale)
300
+ middle_index = self.audio_window // 2
301
+ latter_first_frame_audio_emb = latter_frame_audio_emb[:, :, :1, :middle_index+1, ...]
302
+ latter_first_frame_audio_emb = rearrange(latter_first_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
303
+ latter_last_frame_audio_emb = latter_frame_audio_emb[:, :, -1:, middle_index:, ...]
304
+ latter_last_frame_audio_emb = rearrange(latter_last_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
305
+ latter_middle_frame_audio_emb = latter_frame_audio_emb[:, :, 1:-1, middle_index:middle_index+1, ...]
306
+ latter_middle_frame_audio_emb = rearrange(latter_middle_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
307
+ latter_frame_audio_emb_s = torch.concat([latter_first_frame_audio_emb, latter_middle_frame_audio_emb, latter_last_frame_audio_emb], dim=2)
308
+ audio_embedding = self.audio_proj(first_frame_audio_emb_s, latter_frame_audio_emb_s)
309
+ human_num = len(audio_embedding)
310
+ audio_embedding = torch.concat(audio_embedding.split(1), dim=2).to(x.dtype)
311
+
312
+
313
+ # convert ref_target_masks to token_ref_target_masks
314
+ if ref_target_masks is not None:
315
+ ref_target_masks = ref_target_masks.unsqueeze(0).to(torch.float32)
316
+ token_ref_target_masks = nn.functional.interpolate(ref_target_masks, size=(N_h, N_w), mode='nearest')
317
+ token_ref_target_masks = token_ref_target_masks.squeeze(0)
318
+ token_ref_target_masks = (token_ref_target_masks > 0)
319
+ token_ref_target_masks = token_ref_target_masks.view(token_ref_target_masks.shape[0], -1)
320
+ token_ref_target_masks = token_ref_target_masks.to(x.dtype)
321
+
322
+ if self.enable_teacache:
323
+ modulated_inp = e0 if self.use_ret_steps else e
324
+ if self.cnt%3==0: # cond
325
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
326
+ should_calc_cond = True
327
+ self.accumulated_rel_l1_distance_cond = 0
328
+ else:
329
+ rescale_func = np.poly1d(self.coefficients)
330
+ self.accumulated_rel_l1_distance_cond += rescale_func(((modulated_inp-self.previous_e0_cond).abs().mean() / self.previous_e0_cond.abs().mean()).cpu().item())
331
+ # print("accumulated_rel_l1_distance_even", self.accumulated_rel_l1_distance_even)
332
+ if self.accumulated_rel_l1_distance_cond < self.teacache_thresh:
333
+ should_calc_cond = False
334
+ else:
335
+ should_calc_cond = True
336
+ self.accumulated_rel_l1_distance_cond = 0
337
+ self.previous_e0_cond = modulated_inp.clone()
338
+ elif self.cnt%3==1: # drop_text
339
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
340
+ should_calc_drop_text = True
341
+ self.accumulated_rel_l1_distance_drop_text = 0
342
+ else:
343
+ rescale_func = np.poly1d(self.coefficients)
344
+ self.accumulated_rel_l1_distance_drop_text += rescale_func(((modulated_inp-self.previous_e0_drop_text).abs().mean() / self.previous_e0_drop_text.abs().mean()).cpu().item())
345
+ if self.accumulated_rel_l1_distance_drop_text < self.teacache_thresh:
346
+ should_calc_drop_text = False
347
+ else:
348
+ should_calc_drop_text = True
349
+ self.accumulated_rel_l1_distance_drop_text = 0
350
+ self.previous_e0_drop_text = modulated_inp.clone()
351
+ else: # uncond
352
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
353
+ should_calc_uncond = True
354
+ self.accumulated_rel_l1_distance_uncond = 0
355
+ else:
356
+ rescale_func = np.poly1d(self.coefficients)
357
+ self.accumulated_rel_l1_distance_uncond += rescale_func(((modulated_inp-self.previous_e0_uncond).abs().mean() / self.previous_e0_uncond.abs().mean()).cpu().item())
358
+ if self.accumulated_rel_l1_distance_uncond < self.teacache_thresh:
359
+ should_calc_uncond = False
360
+ else:
361
+ should_calc_uncond = True
362
+ self.accumulated_rel_l1_distance_uncond = 0
363
+ self.previous_e0_uncond = modulated_inp.clone()
364
+
365
+ # Context Parallel
366
+ x = torch.chunk(
367
+ x, get_sequence_parallel_world_size(),
368
+ dim=1)[get_sequence_parallel_rank()]
369
+
370
+ # arguments
371
+ kwargs = dict(
372
+ e=e0,
373
+ seq_lens=seq_lens,
374
+ grid_sizes=grid_sizes,
375
+ freqs=self.freqs,
376
+ context=context,
377
+ context_lens=context_lens,
378
+ audio_embedding=audio_embedding,
379
+ ref_target_masks=token_ref_target_masks,
380
+ human_num=human_num,
381
+ )
382
+
383
+ if self.enable_teacache:
384
+ if self.cnt%3==0:
385
+ if not should_calc_cond:
386
+ x += self.previous_residual_cond
387
+ else:
388
+ ori_x = x.clone()
389
+ for block in self.blocks:
390
+ x = block(x, **kwargs)
391
+ self.previous_residual_cond = x - ori_x
392
+ elif self.cnt%3==1:
393
+ if not should_calc_drop_text:
394
+ x += self.previous_residual_drop_text
395
+ else:
396
+ ori_x = x.clone()
397
+ for block in self.blocks:
398
+ x = block(x, **kwargs)
399
+ self.previous_residual_drop_text = x - ori_x
400
+ else:
401
+ if not should_calc_uncond:
402
+ x += self.previous_residual_uncond
403
+ else:
404
+ ori_x = x.clone()
405
+ for block in self.blocks:
406
+ x = block(x, **kwargs)
407
+ self.previous_residual_uncond = x - ori_x
408
+ else:
409
+ for block in self.blocks:
410
+ x = block(x, **kwargs)
411
+
412
+ # head
413
+ x = self.head(x, e)
414
+
415
+ # Context Parallel
416
+ x = get_sp_group().all_gather(x, dim=1)
417
+
418
+ # unpatchify
419
+ x = self.unpatchify(x, grid_sizes)
420
+ if self.enable_teacache:
421
+ self.cnt += 1
422
+ if self.cnt >= self.num_steps:
423
+ self.cnt = 0
424
+
425
+ return torch.stack(x).float()
426
+
427
+
428
+ def usp_attn_forward_multitalk(self,
429
+ x,
430
+ seq_lens,
431
+ grid_sizes,
432
+ freqs,
433
+ dtype=torch.bfloat16,
434
+ ref_target_masks=None):
435
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
436
+ half_dtypes = (torch.float16, torch.bfloat16)
437
+
438
+ def half(x):
439
+ return x if x.dtype in half_dtypes else x.to(dtype)
440
+
441
+ # query, key, value function
442
+ def qkv_fn(x):
443
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
444
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
445
+ v = self.v(x).view(b, s, n, d)
446
+ return q, k, v
447
+
448
+ q, k, v = qkv_fn(x)
449
+ q = rope_apply(q, grid_sizes, freqs)
450
+ k = rope_apply(k, grid_sizes, freqs)
451
+
452
+
453
+ x = xFuserLongContextAttention()(
454
+ None,
455
+ query=half(q),
456
+ key=half(k),
457
+ value=half(v),
458
+ window_size=self.window_size)
459
+
460
+
461
+ # output
462
+ x = x.flatten(2)
463
+ x = self.o(x)
464
+
465
+ with torch.no_grad():
466
+ x_ref_attn_map = get_attn_map_with_target(q.type_as(x), k.type_as(x), grid_sizes[0],
467
+ ref_target_masks=ref_target_masks, enable_sp=True)
468
+
469
+ return x, x_ref_attn_map
470
+
471
+
472
+
473
+
474
+ def usp_crossattn_multi_forward_multitalk(self,
475
+ x: torch.Tensor,
476
+ encoder_hidden_states: torch.Tensor, # 1, 21, 64, C
477
+ shape=None,
478
+ x_ref_attn_map=None,
479
+ human_num=None) -> torch.Tensor:
480
+
481
+ N_t, N_h, N_w = shape
482
+ sp_size = get_sequence_parallel_world_size()
483
+ sp_rank = get_sequence_parallel_rank()
484
+ audio_tokens_per_frame = 32
485
+ visual_seqlen, frame_ids = split_token_counts_and_frame_ids(N_t, N_h * N_w, sp_size, sp_rank)
486
+ encoder_hidden_states = encoder_hidden_states[:, min(frame_ids):max(frame_ids)+1, ...]
487
+ encoder_hidden_states = rearrange(encoder_hidden_states, "B T N C -> B (T N) C")
488
+ N_a = len(frame_ids)
489
+ kv_seq = [audio_tokens_per_frame * human_num] * N_a
490
+
491
+ if human_num == 1:
492
+ return super(SingleStreamMutiAttention, self).forward(x, encoder_hidden_states, shape, enable_sp=True, kv_seq=kv_seq)
493
+
494
+
495
+ # get q for hidden_state
496
+ B, N, C = x.shape
497
+ q = self.q_linear(x)
498
+ q_shape = (B, N, self.num_heads, self.head_dim)
499
+ q = q.view(q_shape).permute((0, 2, 1, 3))
500
+
501
+ if self.qk_norm:
502
+ q = self.q_norm(q)
503
+
504
+ max_values = x_ref_attn_map.max(1).values[:, None, None]
505
+ min_values = x_ref_attn_map.min(1).values[:, None, None]
506
+ max_min_values = torch.cat([max_values, min_values], dim=2)
507
+ max_min_values = get_sp_group().all_gather(max_min_values, dim=1)
508
+
509
+ human1_max_value, human1_min_value = max_min_values[0, :, 0].max(), max_min_values[0, :, 1].min()
510
+ human2_max_value, human2_min_value = max_min_values[1, :, 0].max(), max_min_values[1, :, 1].min()
511
+
512
+ human1 = normalize_and_scale(x_ref_attn_map[0], (human1_min_value, human1_max_value), (self.rope_h1[0], self.rope_h1[1]))
513
+ human2 = normalize_and_scale(x_ref_attn_map[1], (human2_min_value, human2_max_value), (self.rope_h2[0], self.rope_h2[1]))
514
+ back = torch.full((x_ref_attn_map.size(1),), self.rope_bak, dtype=human1.dtype).to(human1.device)
515
+ max_indices = x_ref_attn_map.argmax(dim=0)
516
+ normalized_map = torch.stack([human1, human2, back], dim=1)
517
+ normalized_pos = normalized_map[range(x_ref_attn_map.size(1)), max_indices] # N
518
+ q = self.rope_1d(q, normalized_pos)
519
+
520
+ encoder_kv = self.kv_linear(encoder_hidden_states)
521
+ encoder_kv_shape = (B, encoder_hidden_states.size(1), 2, self.num_heads, self.head_dim)
522
+ encoder_kv = encoder_kv.view(encoder_kv_shape).permute((2, 0, 3, 1, 4))
523
+ encoder_k, encoder_v = encoder_kv.unbind(0) # B H N C
524
+
525
+ if self.qk_norm:
526
+ encoder_k = self.add_k_norm(encoder_k)
527
+
528
+ # position embedding for condition audio embeddings
529
+ per_frame = torch.zeros(audio_tokens_per_frame * human_num, dtype=encoder_k.dtype).to(encoder_k.device)
530
+ per_frame[:audio_tokens_per_frame] = (self.rope_h1[0] + self.rope_h1[1]) / 2
531
+ per_frame[audio_tokens_per_frame:] = (self.rope_h2[0] + self.rope_h2[1]) / 2
532
+ encoder_pos = torch.concat([per_frame]*N_a, dim=0)
533
+ encoder_k = self.rope_1d(encoder_k, encoder_pos)
534
+
535
+ # get attn
536
+ q = rearrange(q, "B H M K -> B M H K")
537
+ encoder_k = rearrange(encoder_k, "B H M K -> B M H K")
538
+ encoder_v = rearrange(encoder_v, "B H M K -> B M H K")
539
+ attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(visual_seqlen, kv_seq)
540
+ x = xformers.ops.memory_efficient_attention(q, encoder_k, encoder_v, attn_bias=attn_bias, op=None,)
541
+ x = rearrange(x, "B M H K -> B H M K")
542
+
543
+ # linear transform
544
+ x_output_shape = (B, N, C)
545
+ x = x.transpose(1, 2)
546
+ x = x.reshape(x_output_shape)
547
+ x = self.proj(x)
548
+ x = self.proj_drop(x)
549
+
550
+ return x
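None of these functions are called directly; they are bound onto an existing model with types.MethodType once xfuser's sequence-parallel group is initialized, mirroring what the pipelines below do for usp_dit_forward/usp_attn_forward. A hedged sketch for the MultiTalk variants (the audio cross-attention attribute name is an assumption):

import types

from wan.distributed.xdit_context_parallel import (
    usp_attn_forward_multitalk,
    usp_crossattn_multi_forward_multitalk,
    usp_dit_forward_multitalk,
)


def patch_multitalk_for_usp(model):
    # Rebind per-block self-attention and audio cross-attention, then the
    # top-level forward, to the sequence-parallel implementations above.
    for block in model.blocks:
        block.self_attn.forward = types.MethodType(
            usp_attn_forward_multitalk, block.self_attn)
        block.audio_cross_attn.forward = types.MethodType(   # attribute name assumed
            usp_crossattn_multi_forward_multitalk, block.audio_cross_attn)
    model.forward = types.MethodType(usp_dit_forward_multitalk, model)
    return model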
wan/first_last_frame2video.py ADDED
@@ -0,0 +1,377 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import gc
3
+ import logging
4
+ import math
5
+ import os
6
+ import random
7
+ import sys
8
+ import types
9
+ from contextlib import contextmanager
10
+ from functools import partial
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.cuda.amp as amp
15
+ import torch.distributed as dist
16
+ import torchvision.transforms.functional as TF
17
+ from tqdm import tqdm
18
+
19
+ from .distributed.fsdp import shard_model
20
+ from .modules.clip import CLIPModel
21
+ from .modules.model import WanModel
22
+ from .modules.t5 import T5EncoderModel
23
+ from .modules.vae import WanVAE
24
+ from .utils.fm_solvers import (
25
+ FlowDPMSolverMultistepScheduler,
26
+ get_sampling_sigmas,
27
+ retrieve_timesteps,
28
+ )
29
+ from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
30
+
31
+
32
+ class WanFLF2V:
33
+
34
+ def __init__(
35
+ self,
36
+ config,
37
+ checkpoint_dir,
38
+ device_id=0,
39
+ rank=0,
40
+ t5_fsdp=False,
41
+ dit_fsdp=False,
42
+ use_usp=False,
43
+ t5_cpu=False,
44
+ init_on_cpu=True,
45
+ ):
46
+ r"""
47
+ Initializes the first-last-frame-to-video (FLF2V) generation model components.
48
+
49
+ Args:
50
+ config (EasyDict):
51
+ Object containing model parameters initialized from config.py
52
+ checkpoint_dir (`str`):
53
+ Path to directory containing model checkpoints
54
+ device_id (`int`, *optional*, defaults to 0):
55
+ Id of target GPU device
56
+ rank (`int`, *optional*, defaults to 0):
57
+ Process rank for distributed training
58
+ t5_fsdp (`bool`, *optional*, defaults to False):
59
+ Enable FSDP sharding for T5 model
60
+ dit_fsdp (`bool`, *optional*, defaults to False):
61
+ Enable FSDP sharding for DiT model
62
+ use_usp (`bool`, *optional*, defaults to False):
63
+ Enable distribution strategy of USP.
64
+ t5_cpu (`bool`, *optional*, defaults to False):
65
+ Whether to place T5 model on CPU. Only works without t5_fsdp.
66
+ init_on_cpu (`bool`, *optional*, defaults to True):
67
+ Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
68
+ """
69
+ self.device = torch.device(f"cuda:{device_id}")
70
+ self.config = config
71
+ self.rank = rank
72
+ self.use_usp = use_usp
73
+ self.t5_cpu = t5_cpu
74
+
75
+ self.num_train_timesteps = config.num_train_timesteps
76
+ self.param_dtype = config.param_dtype
77
+
78
+ shard_fn = partial(shard_model, device_id=device_id)
79
+ self.text_encoder = T5EncoderModel(
80
+ text_len=config.text_len,
81
+ dtype=config.t5_dtype,
82
+ device=torch.device('cpu'),
83
+ checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
84
+ tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
85
+ shard_fn=shard_fn if t5_fsdp else None,
86
+ )
87
+
88
+ self.vae_stride = config.vae_stride
89
+ self.patch_size = config.patch_size
90
+ self.vae = WanVAE(
91
+ vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
92
+ device=self.device)
93
+
94
+ self.clip = CLIPModel(
95
+ dtype=config.clip_dtype,
96
+ device=self.device,
97
+ checkpoint_path=os.path.join(checkpoint_dir,
98
+ config.clip_checkpoint),
99
+ tokenizer_path=os.path.join(checkpoint_dir, config.clip_tokenizer))
100
+
101
+ logging.info(f"Creating WanModel from {checkpoint_dir}")
102
+ self.model = WanModel.from_pretrained(checkpoint_dir)
103
+ self.model.eval().requires_grad_(False)
104
+
105
+ if t5_fsdp or dit_fsdp or use_usp:
106
+ init_on_cpu = False
107
+
108
+ if use_usp:
109
+ from xfuser.core.distributed import get_sequence_parallel_world_size
110
+
111
+ from .distributed.xdit_context_parallel import (
112
+ usp_attn_forward,
113
+ usp_dit_forward,
114
+ )
115
+ for block in self.model.blocks:
116
+ block.self_attn.forward = types.MethodType(
117
+ usp_attn_forward, block.self_attn)
118
+ self.model.forward = types.MethodType(usp_dit_forward, self.model)
119
+ self.sp_size = get_sequence_parallel_world_size()
120
+ else:
121
+ self.sp_size = 1
122
+
123
+ if dist.is_initialized():
124
+ dist.barrier()
125
+ if dit_fsdp:
126
+ self.model = shard_fn(self.model)
127
+ else:
128
+ if not init_on_cpu:
129
+ self.model.to(self.device)
130
+
131
+ self.sample_neg_prompt = config.sample_neg_prompt
132
+
133
+ def generate(self,
134
+ input_prompt,
135
+ first_frame,
136
+ last_frame,
137
+ max_area=720 * 1280,
138
+ frame_num=81,
139
+ shift=16,
140
+ sample_solver='unipc',
141
+ sampling_steps=50,
142
+ guide_scale=5.5,
143
+ n_prompt="",
144
+ seed=-1,
145
+ offload_model=True):
146
+ r"""
147
+ Generates video frames from input first-last frame and text prompt using diffusion process.
148
+
149
+ Args:
150
+ input_prompt (`str`):
151
+ Text prompt for content generation.
152
+ first_frame (PIL.Image.Image):
153
+ First frame as a PIL image; converted internally to a [3, H, W] tensor.
154
+ last_frame (PIL.Image.Image):
155
+ Last frame as a PIL image; converted internally to a [3, H, W] tensor.
156
+ [NOTE] If the sizes of first_frame and last_frame are mismatched, last_frame will be cropped & resized
157
+ to match first_frame.
158
+ max_area (`int`, *optional*, defaults to 720*1280):
159
+ Maximum pixel area for latent space calculation. Controls video resolution scaling
160
+ frame_num (`int`, *optional*, defaults to 81):
161
+ How many frames to sample from a video. The number should be 4n+1
162
+ shift (`float`, *optional*, defaults to 16):
163
+ Noise schedule shift parameter. Affects temporal dynamics
164
+ [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0.
165
+ sample_solver (`str`, *optional*, defaults to 'unipc'):
166
+ Solver used to sample the video.
167
+ sampling_steps (`int`, *optional*, defaults to 50):
168
+ Number of diffusion sampling steps. Higher values improve quality but slow generation
169
+ guide_scale (`float`, *optional*, defaults to 5.5):
170
+ Classifier-free guidance scale. Controls prompt adherence vs. creativity
171
+ n_prompt (`str`, *optional*, defaults to ""):
172
+ Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
173
+ seed (`int`, *optional*, defaults to -1):
174
+ Random seed for noise generation. If -1, use random seed
175
+ offload_model (`bool`, *optional*, defaults to True):
176
+ If True, offloads models to CPU during generation to save VRAM
177
+
178
+ Returns:
179
+ torch.Tensor:
180
+ Generated video frames tensor. Dimensions: (C, N, H, W) where:
181
+ - C: Color channels (3 for RGB)
182
+ - N: Number of frames (81)
183
+ - H: Frame height (from max_area)
184
+ - W: Frame width (from max_area)
185
+ """
186
+ first_frame_size = first_frame.size
187
+ last_frame_size = last_frame.size
188
+ first_frame = TF.to_tensor(first_frame).sub_(0.5).div_(0.5).to(
189
+ self.device)
190
+ last_frame = TF.to_tensor(last_frame).sub_(0.5).div_(0.5).to(
191
+ self.device)
192
+
193
+ F = frame_num
194
+ first_frame_h, first_frame_w = first_frame.shape[1:]
195
+ aspect_ratio = first_frame_h / first_frame_w
196
+ lat_h = round(
197
+ np.sqrt(max_area * aspect_ratio) // self.vae_stride[1] //
198
+ self.patch_size[1] * self.patch_size[1])
199
+ lat_w = round(
200
+ np.sqrt(max_area / aspect_ratio) // self.vae_stride[2] //
201
+ self.patch_size[2] * self.patch_size[2])
202
+ first_frame_h = lat_h * self.vae_stride[1]
203
+ first_frame_w = lat_w * self.vae_stride[2]
204
+ if first_frame_size != last_frame_size:
205
+ # 1. resize
206
+ last_frame_resize_ratio = max(
207
+ first_frame_size[0] / last_frame_size[0],
208
+ first_frame_size[1] / last_frame_size[1])
209
+ last_frame_size = [
210
+ round(last_frame_size[0] * last_frame_resize_ratio),
211
+ round(last_frame_size[1] * last_frame_resize_ratio),
212
+ ]
213
+ # 2. center crop
214
+ last_frame = TF.center_crop(last_frame, last_frame_size)
215
+
216
+ max_seq_len = ((F - 1) // self.vae_stride[0] + 1) * lat_h * lat_w // (
217
+ self.patch_size[1] * self.patch_size[2])
218
+ max_seq_len = int(math.ceil(max_seq_len / self.sp_size)) * self.sp_size
219
+
220
+ seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
221
+ seed_g = torch.Generator(device=self.device)
222
+ seed_g.manual_seed(seed)
223
+ noise = torch.randn(
224
+ 16, (F - 1) // 4 + 1,
225
+ lat_h,
226
+ lat_w,
227
+ dtype=torch.float32,
228
+ generator=seed_g,
229
+ device=self.device)
230
+
231
+ msk = torch.ones(1, 81, lat_h, lat_w, device=self.device)
232
+ msk[:, 1:-1] = 0
233
+ msk = torch.concat([
234
+ torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]
235
+ ],
236
+ dim=1)
237
+ msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
238
+ msk = msk.transpose(1, 2)[0]
239
+
240
+ if n_prompt == "":
241
+ n_prompt = self.sample_neg_prompt
242
+
243
+ # preprocess
244
+ if not self.t5_cpu:
245
+ self.text_encoder.model.to(self.device)
246
+ context = self.text_encoder([input_prompt], self.device)
247
+ context_null = self.text_encoder([n_prompt], self.device)
248
+ if offload_model:
249
+ self.text_encoder.model.cpu()
250
+ else:
251
+ context = self.text_encoder([input_prompt], torch.device('cpu'))
252
+ context_null = self.text_encoder([n_prompt], torch.device('cpu'))
253
+ context = [t.to(self.device) for t in context]
254
+ context_null = [t.to(self.device) for t in context_null]
255
+
256
+ self.clip.model.to(self.device)
257
+ clip_context = self.clip.visual(
258
+ [first_frame[:, None, :, :], last_frame[:, None, :, :]])
259
+ if offload_model:
260
+ self.clip.model.cpu()
261
+
262
+ y = self.vae.encode([
263
+ torch.concat([
264
+ torch.nn.functional.interpolate(
265
+ first_frame[None].cpu(),
266
+ size=(first_frame_h, first_frame_w),
267
+ mode='bicubic').transpose(0, 1),
268
+ torch.zeros(3, F - 2, first_frame_h, first_frame_w),
269
+ torch.nn.functional.interpolate(
270
+ last_frame[None].cpu(),
271
+ size=(first_frame_h, first_frame_w),
272
+ mode='bicubic').transpose(0, 1),
273
+ ],
274
+ dim=1).to(self.device)
275
+ ])[0]
276
+ y = torch.concat([msk, y])
277
+
278
+ @contextmanager
279
+ def noop_no_sync():
280
+ yield
281
+
282
+ no_sync = getattr(self.model, 'no_sync', noop_no_sync)
283
+
284
+ # evaluation mode
285
+ with amp.autocast(dtype=self.param_dtype), torch.no_grad(), no_sync():
286
+
287
+ if sample_solver == 'unipc':
288
+ sample_scheduler = FlowUniPCMultistepScheduler(
289
+ num_train_timesteps=self.num_train_timesteps,
290
+ shift=1,
291
+ use_dynamic_shifting=False)
292
+ sample_scheduler.set_timesteps(
293
+ sampling_steps, device=self.device, shift=shift)
294
+ timesteps = sample_scheduler.timesteps
295
+ elif sample_solver == 'dpm++':
296
+ sample_scheduler = FlowDPMSolverMultistepScheduler(
297
+ num_train_timesteps=self.num_train_timesteps,
298
+ shift=1,
299
+ use_dynamic_shifting=False)
300
+ sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
301
+ timesteps, _ = retrieve_timesteps(
302
+ sample_scheduler,
303
+ device=self.device,
304
+ sigmas=sampling_sigmas)
305
+ else:
306
+ raise NotImplementedError("Unsupported solver.")
307
+
308
+ # sample videos
309
+ latent = noise
310
+
311
+ arg_c = {
312
+ 'context': [context[0]],
313
+ 'clip_fea': clip_context,
314
+ 'seq_len': max_seq_len,
315
+ 'y': [y],
316
+ }
317
+
318
+ arg_null = {
319
+ 'context': context_null,
320
+ 'clip_fea': clip_context,
321
+ 'seq_len': max_seq_len,
322
+ 'y': [y],
323
+ }
324
+
325
+ if offload_model:
326
+ torch.cuda.empty_cache()
327
+
328
+ self.model.to(self.device)
329
+ for _, t in enumerate(tqdm(timesteps)):
330
+ latent_model_input = [latent.to(self.device)]
331
+ timestep = [t]
332
+
333
+ timestep = torch.stack(timestep).to(self.device)
334
+
335
+ noise_pred_cond = self.model(
336
+ latent_model_input, t=timestep, **arg_c)[0].to(
337
+ torch.device('cpu') if offload_model else self.device)
338
+ if offload_model:
339
+ torch.cuda.empty_cache()
340
+ noise_pred_uncond = self.model(
341
+ latent_model_input, t=timestep, **arg_null)[0].to(
342
+ torch.device('cpu') if offload_model else self.device)
343
+ if offload_model:
344
+ torch.cuda.empty_cache()
345
+ noise_pred = noise_pred_uncond + guide_scale * (
346
+ noise_pred_cond - noise_pred_uncond)
347
+
348
+ latent = latent.to(
349
+ torch.device('cpu') if offload_model else self.device)
350
+
351
+ temp_x0 = sample_scheduler.step(
352
+ noise_pred.unsqueeze(0),
353
+ t,
354
+ latent.unsqueeze(0),
355
+ return_dict=False,
356
+ generator=seed_g)[0]
357
+ latent = temp_x0.squeeze(0)
358
+
359
+ x0 = [latent.to(self.device)]
360
+ del latent_model_input, timestep
361
+
362
+ if offload_model:
363
+ self.model.cpu()
364
+ torch.cuda.empty_cache()
365
+
366
+ if self.rank == 0:
367
+ videos = self.vae.decode(x0)
368
+
369
+ del noise, latent
370
+ del sample_scheduler
371
+ if offload_model:
372
+ gc.collect()
373
+ torch.cuda.synchronize()
374
+ if dist.is_initialized():
375
+ dist.barrier()
376
+
377
+ return videos[0] if self.rank == 0 else None
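For completeness, a hedged end-to-end sketch of driving WanFLF2V (the checkpoint directory and image paths are placeholders):

from PIL import Image

import wan
from wan.configs import WAN_CONFIGS

pipe = wan.WanFLF2V(
    config=WAN_CONFIGS['flf2v-14B'],
    checkpoint_dir='path/to/flf2v_checkpoints',   # placeholder checkpoint dir
    device_id=0,
)
video = pipe.generate(
    "a timelapse of clouds rolling over a city skyline",
    first_frame=Image.open('first.png'),          # placeholder images
    last_frame=Image.open('last.png'),
    frame_num=81,
    sampling_steps=50,
    guide_scale=5.5,
)
# On rank 0, `video` is a (C, N, H, W) tensor; other ranks receive None.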
wan/image2video.py ADDED
@@ -0,0 +1,350 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import gc
3
+ import logging
4
+ import math
5
+ import os
6
+ import random
7
+ import sys
8
+ import types
9
+ from contextlib import contextmanager
10
+ from functools import partial
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.cuda.amp as amp
15
+ import torch.distributed as dist
16
+ import torchvision.transforms.functional as TF
17
+ from tqdm import tqdm
18
+
19
+ from .distributed.fsdp import shard_model
20
+ from .modules.clip import CLIPModel
21
+ from .modules.model import WanModel
22
+ from .modules.t5 import T5EncoderModel
23
+ from .modules.vae import WanVAE
24
+ from .utils.fm_solvers import (
25
+ FlowDPMSolverMultistepScheduler,
26
+ get_sampling_sigmas,
27
+ retrieve_timesteps,
28
+ )
29
+ from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
30
+
31
+
32
+ class WanI2V:
33
+
34
+ def __init__(
35
+ self,
36
+ config,
37
+ checkpoint_dir,
38
+ device_id=0,
39
+ rank=0,
40
+ t5_fsdp=False,
41
+ dit_fsdp=False,
42
+ use_usp=False,
43
+ t5_cpu=False,
44
+ init_on_cpu=True,
45
+ ):
46
+ r"""
47
+ Initializes the image-to-video generation model components.
48
+
49
+ Args:
50
+ config (EasyDict):
51
+ Object containing model parameters initialized from config.py
52
+ checkpoint_dir (`str`):
53
+ Path to directory containing model checkpoints
54
+ device_id (`int`, *optional*, defaults to 0):
55
+ Id of target GPU device
56
+ rank (`int`, *optional*, defaults to 0):
57
+ Process rank for distributed training
58
+ t5_fsdp (`bool`, *optional*, defaults to False):
59
+ Enable FSDP sharding for T5 model
60
+ dit_fsdp (`bool`, *optional*, defaults to False):
61
+ Enable FSDP sharding for DiT model
62
+ use_usp (`bool`, *optional*, defaults to False):
63
+ Enable distribution strategy of USP.
64
+ t5_cpu (`bool`, *optional*, defaults to False):
65
+ Whether to place T5 model on CPU. Only works without t5_fsdp.
66
+ init_on_cpu (`bool`, *optional*, defaults to True):
67
+ Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
68
+ """
69
+ self.device = torch.device(f"cuda:{device_id}")
70
+ self.config = config
71
+ self.rank = rank
72
+ self.use_usp = use_usp
73
+ self.t5_cpu = t5_cpu
74
+
75
+ self.num_train_timesteps = config.num_train_timesteps
76
+ self.param_dtype = config.param_dtype
77
+
78
+ shard_fn = partial(shard_model, device_id=device_id)
79
+ self.text_encoder = T5EncoderModel(
80
+ text_len=config.text_len,
81
+ dtype=config.t5_dtype,
82
+ device=torch.device('cpu'),
83
+ checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
84
+ tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
85
+ shard_fn=shard_fn if t5_fsdp else None,
86
+ )
87
+
88
+ self.vae_stride = config.vae_stride
89
+ self.patch_size = config.patch_size
90
+ self.vae = WanVAE(
91
+ vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
92
+ device=self.device)
93
+
94
+ self.clip = CLIPModel(
95
+ dtype=config.clip_dtype,
96
+ device=self.device,
97
+ checkpoint_path=os.path.join(checkpoint_dir,
98
+ config.clip_checkpoint),
99
+ tokenizer_path=os.path.join(checkpoint_dir, config.clip_tokenizer))
100
+
101
+ logging.info(f"Creating WanModel from {checkpoint_dir}")
102
+ self.model = WanModel.from_pretrained(checkpoint_dir)
103
+ self.model.eval().requires_grad_(False)
104
+
105
+ if t5_fsdp or dit_fsdp or use_usp:
106
+ init_on_cpu = False
107
+
108
+ if use_usp:
109
+ from xfuser.core.distributed import get_sequence_parallel_world_size
110
+
111
+ from .distributed.xdit_context_parallel import (
112
+ usp_attn_forward,
113
+ usp_dit_forward,
114
+ )
115
+ for block in self.model.blocks:
116
+ block.self_attn.forward = types.MethodType(
117
+ usp_attn_forward, block.self_attn)
118
+ self.model.forward = types.MethodType(usp_dit_forward, self.model)
119
+ self.sp_size = get_sequence_parallel_world_size()
120
+ else:
121
+ self.sp_size = 1
122
+
123
+ if dist.is_initialized():
124
+ dist.barrier()
125
+ if dit_fsdp:
126
+ self.model = shard_fn(self.model)
127
+ else:
128
+ if not init_on_cpu:
129
+ self.model.to(self.device)
130
+
131
+ self.sample_neg_prompt = config.sample_neg_prompt
132
+
133
+ def generate(self,
134
+ input_prompt,
135
+ img,
136
+ max_area=720 * 1280,
137
+ frame_num=81,
138
+ shift=5.0,
139
+ sample_solver='unipc',
140
+ sampling_steps=40,
141
+ guide_scale=5.0,
142
+ n_prompt="",
143
+ seed=-1,
144
+ offload_model=True):
145
+ r"""
146
+        Generates video frames from an input image and a text prompt using a diffusion process.
147
+
148
+ Args:
149
+ input_prompt (`str`):
150
+ Text prompt for content generation.
151
+ img (PIL.Image.Image):
152
+                Input image; converted internally to a tensor of shape [3, H, W] in [-1, 1]
153
+ max_area (`int`, *optional*, defaults to 720*1280):
154
+ Maximum pixel area for latent space calculation. Controls video resolution scaling
155
+ frame_num (`int`, *optional*, defaults to 81):
156
+ How many frames to sample from a video. The number should be 4n+1
157
+ shift (`float`, *optional*, defaults to 5.0):
158
+ Noise schedule shift parameter. Affects temporal dynamics
159
+ [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0.
160
+ sample_solver (`str`, *optional*, defaults to 'unipc'):
161
+ Solver used to sample the video.
162
+ sampling_steps (`int`, *optional*, defaults to 40):
163
+ Number of diffusion sampling steps. Higher values improve quality but slow generation
164
+            guide_scale (`float`, *optional*, defaults to 5.0):
165
+ Classifier-free guidance scale. Controls prompt adherence vs. creativity
166
+ n_prompt (`str`, *optional*, defaults to ""):
167
+ Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
168
+ seed (`int`, *optional*, defaults to -1):
169
+ Random seed for noise generation. If -1, use random seed
170
+ offload_model (`bool`, *optional*, defaults to True):
171
+ If True, offloads models to CPU during generation to save VRAM
172
+
173
+ Returns:
174
+ torch.Tensor:
175
+                Generated video frames tensor. Dimensions: (C, N, H, W) where:
176
+ - C: Color channels (3 for RGB)
177
+ - N: Number of frames (81)
178
+ - H: Frame height (from max_area)
179
+                - W: Frame width (from max_area)
180
+ """
181
+ img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(self.device)
182
+
183
+ F = frame_num
184
+ h, w = img.shape[1:]
185
+ aspect_ratio = h / w
186
+ lat_h = round(
187
+ np.sqrt(max_area * aspect_ratio) // self.vae_stride[1] //
188
+ self.patch_size[1] * self.patch_size[1])
189
+ lat_w = round(
190
+ np.sqrt(max_area / aspect_ratio) // self.vae_stride[2] //
191
+ self.patch_size[2] * self.patch_size[2])
192
+ h = lat_h * self.vae_stride[1]
193
+ w = lat_w * self.vae_stride[2]
194
+
195
+ max_seq_len = ((F - 1) // self.vae_stride[0] + 1) * lat_h * lat_w // (
196
+ self.patch_size[1] * self.patch_size[2])
197
+ max_seq_len = int(math.ceil(max_seq_len / self.sp_size)) * self.sp_size
198
+
199
+ seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
200
+ seed_g = torch.Generator(device=self.device)
201
+ seed_g.manual_seed(seed)
202
+ noise = torch.randn(
203
+ 16, (F - 1) // 4 + 1,
204
+ lat_h,
205
+ lat_w,
206
+ dtype=torch.float32,
207
+ generator=seed_g,
208
+ device=self.device)
209
+
210
+ msk = torch.ones(1, 81, lat_h, lat_w, device=self.device)
211
+ msk[:, 1:] = 0
212
+ msk = torch.concat([
213
+ torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]
214
+ ],
215
+ dim=1)
216
+ msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
217
+ msk = msk.transpose(1, 2)[0]
218
+
219
+ if n_prompt == "":
220
+ n_prompt = self.sample_neg_prompt
221
+
222
+ # preprocess
223
+ if not self.t5_cpu:
224
+ self.text_encoder.model.to(self.device)
225
+ context = self.text_encoder([input_prompt], self.device)
226
+ context_null = self.text_encoder([n_prompt], self.device)
227
+ if offload_model:
228
+ self.text_encoder.model.cpu()
229
+ else:
230
+ context = self.text_encoder([input_prompt], torch.device('cpu'))
231
+ context_null = self.text_encoder([n_prompt], torch.device('cpu'))
232
+ context = [t.to(self.device) for t in context]
233
+ context_null = [t.to(self.device) for t in context_null]
234
+
235
+ self.clip.model.to(self.device)
236
+ clip_context = self.clip.visual([img[:, None, :, :]])
237
+ if offload_model:
238
+ self.clip.model.cpu()
239
+
240
+ y = self.vae.encode([
241
+ torch.concat([
242
+ torch.nn.functional.interpolate(
243
+ img[None].cpu(), size=(h, w), mode='bicubic').transpose(
244
+ 0, 1),
245
+ torch.zeros(3, F - 1, h, w)
246
+ ],
247
+ dim=1).to(self.device)
248
+ ])[0]
249
+ y = torch.concat([msk, y])
250
+
251
+ @contextmanager
252
+ def noop_no_sync():
253
+ yield
254
+
255
+ no_sync = getattr(self.model, 'no_sync', noop_no_sync)
256
+
257
+ # evaluation mode
258
+ with amp.autocast(dtype=self.param_dtype), torch.no_grad(), no_sync():
259
+
260
+ if sample_solver == 'unipc':
261
+ sample_scheduler = FlowUniPCMultistepScheduler(
262
+ num_train_timesteps=self.num_train_timesteps,
263
+ shift=1,
264
+ use_dynamic_shifting=False)
265
+ sample_scheduler.set_timesteps(
266
+ sampling_steps, device=self.device, shift=shift)
267
+ timesteps = sample_scheduler.timesteps
268
+ elif sample_solver == 'dpm++':
269
+ sample_scheduler = FlowDPMSolverMultistepScheduler(
270
+ num_train_timesteps=self.num_train_timesteps,
271
+ shift=1,
272
+ use_dynamic_shifting=False)
273
+ sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
274
+ timesteps, _ = retrieve_timesteps(
275
+ sample_scheduler,
276
+ device=self.device,
277
+ sigmas=sampling_sigmas)
278
+ else:
279
+ raise NotImplementedError("Unsupported solver.")
280
+
281
+ # sample videos
282
+ latent = noise
283
+
284
+ arg_c = {
285
+ 'context': [context[0]],
286
+ 'clip_fea': clip_context,
287
+ 'seq_len': max_seq_len,
288
+ 'y': [y],
289
+ }
290
+
291
+ arg_null = {
292
+ 'context': context_null,
293
+ 'clip_fea': clip_context,
294
+ 'seq_len': max_seq_len,
295
+ 'y': [y],
296
+ }
297
+
298
+ if offload_model:
299
+ torch.cuda.empty_cache()
300
+
301
+ self.model.to(self.device)
302
+ for _, t in enumerate(tqdm(timesteps)):
303
+ latent_model_input = [latent.to(self.device)]
304
+ timestep = [t]
305
+
306
+ timestep = torch.stack(timestep).to(self.device)
307
+
308
+ noise_pred_cond = self.model(
309
+ latent_model_input, t=timestep, **arg_c)[0].to(
310
+ torch.device('cpu') if offload_model else self.device)
311
+ if offload_model:
312
+ torch.cuda.empty_cache()
313
+ noise_pred_uncond = self.model(
314
+ latent_model_input, t=timestep, **arg_null)[0].to(
315
+ torch.device('cpu') if offload_model else self.device)
316
+ if offload_model:
317
+ torch.cuda.empty_cache()
318
+ noise_pred = noise_pred_uncond + guide_scale * (
319
+ noise_pred_cond - noise_pred_uncond)
320
+
321
+ latent = latent.to(
322
+ torch.device('cpu') if offload_model else self.device)
323
+
324
+ temp_x0 = sample_scheduler.step(
325
+ noise_pred.unsqueeze(0),
326
+ t,
327
+ latent.unsqueeze(0),
328
+ return_dict=False,
329
+ generator=seed_g)[0]
330
+ latent = temp_x0.squeeze(0)
331
+
332
+ x0 = [latent.to(self.device)]
333
+ del latent_model_input, timestep
334
+
335
+ if offload_model:
336
+ self.model.cpu()
337
+ torch.cuda.empty_cache()
338
+
339
+ if self.rank == 0:
340
+ videos = self.vae.decode(x0)
341
+
342
+ del noise, latent
343
+ del sample_scheduler
344
+ if offload_model:
345
+ gc.collect()
346
+ torch.cuda.synchronize()
347
+ if dist.is_initialized():
348
+ dist.barrier()
349
+
350
+ return videos[0] if self.rank == 0 else None
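
For reference, a hedged usage sketch of WanI2V.generate() following its docstring; the config key and checkpoint directory below are placeholders based on wan/configs, not paths shipped in this commit:

from PIL import Image

from wan.configs import WAN_CONFIGS                  # assumed config registry in wan/configs
from wan.image2video import WanI2V

pipe = WanI2V(
    config=WAN_CONFIGS['i2v-14B'],                   # assumed key name
    checkpoint_dir='weights/Wan2.1-I2V-14B-480P')    # placeholder path

video = pipe.generate(
    'a person speaking to the camera',
    Image.open('examples/single/single1.png'),
    max_area=480 * 832,    # 480p target; the docstring suggests shift=3.0 at this size
    frame_num=81,          # must be 4n + 1
    shift=3.0,
    sampling_steps=40,
    guide_scale=5.0,
    seed=42)
if video is not None:      # None on non-zero ranks
    print(video.shape)     # (C, N, H, W)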
wan/modules/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ from .attention import flash_attention
2
+ from .model import WanModel
3
+ from .t5 import T5Decoder, T5Encoder, T5EncoderModel, T5Model
4
+ from .tokenizers import HuggingfaceTokenizer
5
+ from .vace_model import VaceWanModel
6
+ from .vae import WanVAE
7
+
8
+ __all__ = [
9
+ 'WanVAE',
10
+ 'WanModel',
11
+ 'VaceWanModel',
12
+ 'T5Model',
13
+ 'T5Encoder',
14
+ 'T5Decoder',
15
+ 'T5EncoderModel',
16
+ 'HuggingfaceTokenizer',
17
+ 'flash_attention',
18
+ ]
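
With these re-exports, downstream code can import the main building blocks directly from the package, for example:

from wan.modules import T5EncoderModel, WanModel, WanVAE, flash_attention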
wan/modules/attention.py ADDED
@@ -0,0 +1,393 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange, repeat
5
+ from ..utils.multitalk_utils import RotaryPositionalEmbedding1D, normalize_and_scale, split_token_counts_and_frame_ids
6
+ from xfuser.core.distributed import (
7
+ get_sequence_parallel_rank,
8
+ get_sequence_parallel_world_size,
9
+ get_sp_group,
10
+ )
11
+ import xformers.ops
12
+
13
+ try:
14
+ import flash_attn_interface
15
+ FLASH_ATTN_3_AVAILABLE = True
16
+ except ModuleNotFoundError:
17
+ FLASH_ATTN_3_AVAILABLE = False
18
+
19
+ try:
20
+ import flash_attn
21
+ FLASH_ATTN_2_AVAILABLE = True
22
+ except ModuleNotFoundError:
23
+ FLASH_ATTN_2_AVAILABLE = False
24
+
25
+ import warnings
26
+
27
+ __all__ = [
28
+ 'flash_attention',
29
+ 'attention',
30
+ ]
31
+
32
+
33
+ def flash_attention(
34
+ q,
35
+ k,
36
+ v,
37
+ q_lens=None,
38
+ k_lens=None,
39
+ dropout_p=0.,
40
+ softmax_scale=None,
41
+ q_scale=None,
42
+ causal=False,
43
+ window_size=(-1, -1),
44
+ deterministic=False,
45
+ dtype=torch.bfloat16,
46
+ version=None,
47
+ ):
48
+ """
49
+ q: [B, Lq, Nq, C1].
50
+ k: [B, Lk, Nk, C1].
51
+ v: [B, Lk, Nk, C2]. Nq must be divisible by Nk.
52
+ q_lens: [B].
53
+ k_lens: [B].
54
+ dropout_p: float. Dropout probability.
55
+ softmax_scale: float. The scaling of QK^T before applying softmax.
56
+ causal: bool. Whether to apply causal attention mask.
57
+ window_size: (left right). If not (-1, -1), apply sliding window local attention.
58
+ deterministic: bool. If True, slightly slower and uses more memory.
59
+ dtype: torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16.
60
+ """
61
+ half_dtypes = (torch.float16, torch.bfloat16)
62
+ assert dtype in half_dtypes
63
+ assert q.device.type == 'cuda' and q.size(-1) <= 256
64
+
65
+ # params
66
+ b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
67
+
68
+ def half(x):
69
+ return x if x.dtype in half_dtypes else x.to(dtype)
70
+
71
+ # preprocess query
72
+ if q_lens is None:
73
+ q = half(q.flatten(0, 1))
74
+ q_lens = torch.tensor(
75
+ [lq] * b, dtype=torch.int32).to(
76
+ device=q.device, non_blocking=True)
77
+ else:
78
+ q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)]))
79
+
80
+ # preprocess key, value
81
+ if k_lens is None:
82
+ k = half(k.flatten(0, 1))
83
+ v = half(v.flatten(0, 1))
84
+ k_lens = torch.tensor(
85
+ [lk] * b, dtype=torch.int32).to(
86
+ device=k.device, non_blocking=True)
87
+ else:
88
+ k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)]))
89
+ v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)]))
90
+
91
+ q = q.to(v.dtype)
92
+ k = k.to(v.dtype)
93
+
94
+ if q_scale is not None:
95
+ q = q * q_scale
96
+
97
+ if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
98
+ warnings.warn(
99
+ 'Flash attention 3 is not available, use flash attention 2 instead.'
100
+ )
101
+
102
+ # apply attention
103
+ if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
104
+ # Note: dropout_p, window_size are not supported in FA3 now.
105
+ x = flash_attn_interface.flash_attn_varlen_func(
106
+ q=q,
107
+ k=k,
108
+ v=v,
109
+ cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
110
+ 0, dtype=torch.int32).to(q.device, non_blocking=True),
111
+ cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
112
+ 0, dtype=torch.int32).to(q.device, non_blocking=True),
113
+ seqused_q=None,
114
+ seqused_k=None,
115
+ max_seqlen_q=lq,
116
+ max_seqlen_k=lk,
117
+ softmax_scale=softmax_scale,
118
+ causal=causal,
119
+ deterministic=deterministic)[0].unflatten(0, (b, lq))
120
+ else:
121
+ assert FLASH_ATTN_2_AVAILABLE
122
+ x = flash_attn.flash_attn_varlen_func(
123
+ q=q,
124
+ k=k,
125
+ v=v,
126
+ cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
127
+ 0, dtype=torch.int32).to(q.device, non_blocking=True),
128
+ cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
129
+ 0, dtype=torch.int32).to(q.device, non_blocking=True),
130
+ max_seqlen_q=lq,
131
+ max_seqlen_k=lk,
132
+ dropout_p=dropout_p,
133
+ softmax_scale=softmax_scale,
134
+ causal=causal,
135
+ window_size=window_size,
136
+ deterministic=deterministic).unflatten(0, (b, lq))
137
+
138
+ # output
139
+ return x.type(out_dtype)
140
+
141
+
142
+ def attention(
143
+ q,
144
+ k,
145
+ v,
146
+ q_lens=None,
147
+ k_lens=None,
148
+ dropout_p=0.,
149
+ softmax_scale=None,
150
+ q_scale=None,
151
+ causal=False,
152
+ window_size=(-1, -1),
153
+ deterministic=False,
154
+ dtype=torch.bfloat16,
155
+ fa_version=None,
156
+ ):
157
+ if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
158
+ return flash_attention(
159
+ q=q,
160
+ k=k,
161
+ v=v,
162
+ q_lens=q_lens,
163
+ k_lens=k_lens,
164
+ dropout_p=dropout_p,
165
+ softmax_scale=softmax_scale,
166
+ q_scale=q_scale,
167
+ causal=causal,
168
+ window_size=window_size,
169
+ deterministic=deterministic,
170
+ dtype=dtype,
171
+ version=fa_version,
172
+ )
173
+ else:
174
+ if q_lens is not None or k_lens is not None:
175
+ warnings.warn(
176
+ 'Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance.'
177
+ )
178
+ attn_mask = None
179
+
180
+ q = q.transpose(1, 2).to(dtype)
181
+ k = k.transpose(1, 2).to(dtype)
182
+ v = v.transpose(1, 2).to(dtype)
183
+
184
+ out = torch.nn.functional.scaled_dot_product_attention(
185
+ q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p)
186
+
187
+ out = out.transpose(1, 2).contiguous()
188
+ return out
189
+
190
+
191
+ class SingleStreamAttention(nn.Module):
192
+ def __init__(
193
+ self,
194
+ dim: int,
195
+ encoder_hidden_states_dim: int,
196
+ num_heads: int,
197
+ qkv_bias: bool,
198
+ qk_norm: bool,
199
+ norm_layer: nn.Module,
200
+ attn_drop: float = 0.0,
201
+ proj_drop: float = 0.0,
202
+ eps: float = 1e-6,
203
+ ) -> None:
204
+ super().__init__()
205
+ assert dim % num_heads == 0, "dim should be divisible by num_heads"
206
+ self.dim = dim
207
+ self.encoder_hidden_states_dim = encoder_hidden_states_dim
208
+ self.num_heads = num_heads
209
+ self.head_dim = dim // num_heads
210
+ self.scale = self.head_dim**-0.5
211
+ self.qk_norm = qk_norm
212
+
213
+ self.q_linear = nn.Linear(dim, dim, bias=qkv_bias)
214
+
215
+ self.q_norm = norm_layer(self.head_dim, eps=eps) if qk_norm else nn.Identity()
216
+ self.k_norm = norm_layer(self.head_dim,eps=eps) if qk_norm else nn.Identity()
217
+
218
+ self.attn_drop = nn.Dropout(attn_drop)
219
+ self.proj = nn.Linear(dim, dim)
220
+ self.proj_drop = nn.Dropout(proj_drop)
221
+
222
+ self.kv_linear = nn.Linear(encoder_hidden_states_dim, dim * 2, bias=qkv_bias)
223
+
224
+ self.add_q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
225
+ self.add_k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
226
+
227
+ def forward(self, x: torch.Tensor, encoder_hidden_states: torch.Tensor, shape=None, enable_sp=False, kv_seq=None) -> torch.Tensor:
228
+
229
+ N_t, N_h, N_w = shape
230
+ if not enable_sp:
231
+ x = rearrange(x, "B (N_t S) C -> (B N_t) S C", N_t=N_t)
232
+
233
+ # get q for hidden_state
234
+ B, N, C = x.shape
235
+ q = self.q_linear(x)
236
+ q_shape = (B, N, self.num_heads, self.head_dim)
237
+ q = q.view(q_shape).permute((0, 2, 1, 3))
238
+
239
+ if self.qk_norm:
240
+ q = self.q_norm(q)
241
+
242
+ # get kv from encoder_hidden_states
243
+ _, N_a, _ = encoder_hidden_states.shape
244
+ encoder_kv = self.kv_linear(encoder_hidden_states)
245
+ encoder_kv_shape = (B, N_a, 2, self.num_heads, self.head_dim)
246
+ encoder_kv = encoder_kv.view(encoder_kv_shape).permute((2, 0, 3, 1, 4))
247
+ encoder_k, encoder_v = encoder_kv.unbind(0)
248
+
249
+ if self.qk_norm:
250
+ encoder_k = self.add_k_norm(encoder_k)
251
+
252
+
253
+ q = rearrange(q, "B H M K -> B M H K")
254
+ encoder_k = rearrange(encoder_k, "B H M K -> B M H K")
255
+ encoder_v = rearrange(encoder_v, "B H M K -> B M H K")
256
+
257
+ if enable_sp:
258
+ # context parallel
259
+ sp_size = get_sequence_parallel_world_size()
260
+ sp_rank = get_sequence_parallel_rank()
261
+ visual_seqlen, _ = split_token_counts_and_frame_ids(N_t, N_h * N_w, sp_size, sp_rank)
262
+            assert kv_seq is not None, "kv_seq should not be None."
263
+ attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(visual_seqlen, kv_seq)
264
+ else:
265
+ attn_bias = None
266
+ x = xformers.ops.memory_efficient_attention(q, encoder_k, encoder_v, attn_bias=attn_bias, op=None,)
267
+ x = rearrange(x, "B M H K -> B H M K")
268
+
269
+ # linear transform
270
+ x_output_shape = (B, N, C)
271
+ x = x.transpose(1, 2)
272
+ x = x.reshape(x_output_shape)
273
+ x = self.proj(x)
274
+ x = self.proj_drop(x)
275
+
276
+ if not enable_sp:
277
+ # reshape x to origin shape
278
+ x = rearrange(x, "(B N_t) S C -> B (N_t S) C", N_t=N_t)
279
+
280
+ return x
281
+
282
+ class SingleStreamMutiAttention(SingleStreamAttention):
283
+ def __init__(
284
+ self,
285
+ dim: int,
286
+ encoder_hidden_states_dim: int,
287
+ num_heads: int,
288
+ qkv_bias: bool,
289
+ qk_norm: bool,
290
+ norm_layer: nn.Module,
291
+ attn_drop: float = 0.0,
292
+ proj_drop: float = 0.0,
293
+ eps: float = 1e-6,
294
+ class_range: int = 24,
295
+ class_interval: int = 4,
296
+ ) -> None:
297
+ super().__init__(
298
+ dim=dim,
299
+ encoder_hidden_states_dim=encoder_hidden_states_dim,
300
+ num_heads=num_heads,
301
+ qkv_bias=qkv_bias,
302
+ qk_norm=qk_norm,
303
+ norm_layer=norm_layer,
304
+ attn_drop=attn_drop,
305
+ proj_drop=proj_drop,
306
+ eps=eps,
307
+ )
308
+ self.class_interval = class_interval
309
+ self.class_range = class_range
310
+ self.rope_h1 = (0, self.class_interval)
311
+ self.rope_h2 = (self.class_range - self.class_interval, self.class_range)
312
+ self.rope_bak = int(self.class_range // 2)
313
+
314
+ self.rope_1d = RotaryPositionalEmbedding1D(self.head_dim)
315
+
316
+ def forward(self,
317
+ x: torch.Tensor,
318
+ encoder_hidden_states: torch.Tensor,
319
+ shape=None,
320
+ x_ref_attn_map=None,
321
+ human_num=None) -> torch.Tensor:
322
+
323
+ encoder_hidden_states = encoder_hidden_states.squeeze(0)
324
+ if human_num == 1:
325
+ return super().forward(x, encoder_hidden_states, shape)
326
+
327
+ N_t, _, _ = shape
328
+ x = rearrange(x, "B (N_t S) C -> (B N_t) S C", N_t=N_t)
329
+
330
+ # get q for hidden_state
331
+ B, N, C = x.shape
332
+ q = self.q_linear(x)
333
+ q_shape = (B, N, self.num_heads, self.head_dim)
334
+ q = q.view(q_shape).permute((0, 2, 1, 3))
335
+
336
+ if self.qk_norm:
337
+ q = self.q_norm(q)
338
+
339
+
340
+ max_values = x_ref_attn_map.max(1).values[:, None, None]
341
+ min_values = x_ref_attn_map.min(1).values[:, None, None]
342
+ max_min_values = torch.cat([max_values, min_values], dim=2)
343
+
344
+ human1_max_value, human1_min_value = max_min_values[0, :, 0].max(), max_min_values[0, :, 1].min()
345
+ human2_max_value, human2_min_value = max_min_values[1, :, 0].max(), max_min_values[1, :, 1].min()
346
+
347
+ human1 = normalize_and_scale(x_ref_attn_map[0], (human1_min_value, human1_max_value), (self.rope_h1[0], self.rope_h1[1]))
348
+ human2 = normalize_and_scale(x_ref_attn_map[1], (human2_min_value, human2_max_value), (self.rope_h2[0], self.rope_h2[1]))
349
+ back = torch.full((x_ref_attn_map.size(1),), self.rope_bak, dtype=human1.dtype).to(human1.device)
350
+ max_indices = x_ref_attn_map.argmax(dim=0)
351
+ normalized_map = torch.stack([human1, human2, back], dim=1)
352
+ normalized_pos = normalized_map[range(x_ref_attn_map.size(1)), max_indices] # N
353
+
354
+ q = rearrange(q, "(B N_t) H S C -> B H (N_t S) C", N_t=N_t)
355
+ q = self.rope_1d(q, normalized_pos)
356
+ q = rearrange(q, "B H (N_t S) C -> (B N_t) H S C", N_t=N_t)
357
+
358
+ _, N_a, _ = encoder_hidden_states.shape
359
+ encoder_kv = self.kv_linear(encoder_hidden_states)
360
+ encoder_kv_shape = (B, N_a, 2, self.num_heads, self.head_dim)
361
+ encoder_kv = encoder_kv.view(encoder_kv_shape).permute((2, 0, 3, 1, 4))
362
+ encoder_k, encoder_v = encoder_kv.unbind(0)
363
+
364
+ if self.qk_norm:
365
+ encoder_k = self.add_k_norm(encoder_k)
366
+
367
+
368
+ per_frame = torch.zeros(N_a, dtype=encoder_k.dtype).to(encoder_k.device)
369
+ per_frame[:per_frame.size(0)//2] = (self.rope_h1[0] + self.rope_h1[1]) / 2
370
+ per_frame[per_frame.size(0)//2:] = (self.rope_h2[0] + self.rope_h2[1]) / 2
371
+ encoder_pos = torch.concat([per_frame]*N_t, dim=0)
372
+ encoder_k = rearrange(encoder_k, "(B N_t) H S C -> B H (N_t S) C", N_t=N_t)
373
+ encoder_k = self.rope_1d(encoder_k, encoder_pos)
374
+ encoder_k = rearrange(encoder_k, "B H (N_t S) C -> (B N_t) H S C", N_t=N_t)
375
+
376
+
377
+ q = rearrange(q, "B H M K -> B M H K")
378
+ encoder_k = rearrange(encoder_k, "B H M K -> B M H K")
379
+ encoder_v = rearrange(encoder_v, "B H M K -> B M H K")
380
+ x = xformers.ops.memory_efficient_attention(q, encoder_k, encoder_v, attn_bias=None, op=None,)
381
+ x = rearrange(x, "B M H K -> B H M K")
382
+
383
+ # linear transform
384
+ x_output_shape = (B, N, C)
385
+ x = x.transpose(1, 2)
386
+ x = x.reshape(x_output_shape)
387
+ x = self.proj(x)
388
+ x = self.proj_drop(x)
389
+
390
+ # reshape x to origin shape
391
+ x = rearrange(x, "(B N_t) S C -> B (N_t S) C", N_t=N_t)
392
+
393
+ return x
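
A small shape-check sketch for the attention() wrapper above. Note that this module needs xformers and xfuser installed even for the fallback path, and flash_attention() itself requires a CUDA device with half-precision inputs:

import torch
from wan.modules.attention import attention

b, lq, lk, heads, head_dim = 1, 16, 32, 8, 64
q = torch.randn(b, lq, heads, head_dim, device='cuda', dtype=torch.bfloat16)
k = torch.randn(b, lk, heads, head_dim, device='cuda', dtype=torch.bfloat16)
v = torch.randn(b, lk, heads, head_dim, device='cuda', dtype=torch.bfloat16)

out = attention(q, k, v)   # uses flash-attn if available, else scaled_dot_product_attention
print(out.shape)           # torch.Size([1, 16, 8, 64])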
wan/modules/clip.py ADDED
@@ -0,0 +1,542 @@
1
+ # Modified from ``https://github.com/openai/CLIP'' and ``https://github.com/mlfoundations/open_clip''
2
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
+ import logging
4
+ import math
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import torchvision.transforms as T
10
+
11
+ from .attention import flash_attention
12
+ from .tokenizers import HuggingfaceTokenizer
13
+ from .xlm_roberta import XLMRoberta
14
+
15
+ __all__ = [
16
+ 'XLMRobertaCLIP',
17
+ 'clip_xlm_roberta_vit_h_14',
18
+ 'CLIPModel',
19
+ ]
20
+
21
+
22
+ def pos_interpolate(pos, seq_len):
23
+ if pos.size(1) == seq_len:
24
+ return pos
25
+ else:
26
+ src_grid = int(math.sqrt(pos.size(1)))
27
+ tar_grid = int(math.sqrt(seq_len))
28
+ n = pos.size(1) - src_grid * src_grid
29
+ return torch.cat([
30
+ pos[:, :n],
31
+ F.interpolate(
32
+ pos[:, n:].float().reshape(1, src_grid, src_grid, -1).permute(
33
+ 0, 3, 1, 2),
34
+ size=(tar_grid, tar_grid),
35
+ mode='bicubic',
36
+ align_corners=False).flatten(2).transpose(1, 2)
37
+ ],
38
+ dim=1)
39
+
40
+
41
+ class QuickGELU(nn.Module):
42
+
43
+ def forward(self, x):
44
+ return x * torch.sigmoid(1.702 * x)
45
+
46
+
47
+ class LayerNorm(nn.LayerNorm):
48
+
49
+ def forward(self, x):
50
+ return super().forward(x.float()).type_as(x)
51
+
52
+
53
+ class SelfAttention(nn.Module):
54
+
55
+ def __init__(self,
56
+ dim,
57
+ num_heads,
58
+ causal=False,
59
+ attn_dropout=0.0,
60
+ proj_dropout=0.0):
61
+ assert dim % num_heads == 0
62
+ super().__init__()
63
+ self.dim = dim
64
+ self.num_heads = num_heads
65
+ self.head_dim = dim // num_heads
66
+ self.causal = causal
67
+ self.attn_dropout = attn_dropout
68
+ self.proj_dropout = proj_dropout
69
+
70
+ # layers
71
+ self.to_qkv = nn.Linear(dim, dim * 3)
72
+ self.proj = nn.Linear(dim, dim)
73
+
74
+ def forward(self, x):
75
+ """
76
+ x: [B, L, C].
77
+ """
78
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
79
+
80
+ # compute query, key, value
81
+ q, k, v = self.to_qkv(x).view(b, s, 3, n, d).unbind(2)
82
+
83
+ # compute attention
84
+ p = self.attn_dropout if self.training else 0.0
85
+ x = flash_attention(q, k, v, dropout_p=p, causal=self.causal, version=2)
86
+ x = x.reshape(b, s, c)
87
+
88
+ # output
89
+ x = self.proj(x)
90
+ x = F.dropout(x, self.proj_dropout, self.training)
91
+ return x
92
+
93
+
94
+ class SwiGLU(nn.Module):
95
+
96
+ def __init__(self, dim, mid_dim):
97
+ super().__init__()
98
+ self.dim = dim
99
+ self.mid_dim = mid_dim
100
+
101
+ # layers
102
+ self.fc1 = nn.Linear(dim, mid_dim)
103
+ self.fc2 = nn.Linear(dim, mid_dim)
104
+ self.fc3 = nn.Linear(mid_dim, dim)
105
+
106
+ def forward(self, x):
107
+ x = F.silu(self.fc1(x)) * self.fc2(x)
108
+ x = self.fc3(x)
109
+ return x
110
+
111
+
112
+ class AttentionBlock(nn.Module):
113
+
114
+ def __init__(self,
115
+ dim,
116
+ mlp_ratio,
117
+ num_heads,
118
+ post_norm=False,
119
+ causal=False,
120
+ activation='quick_gelu',
121
+ attn_dropout=0.0,
122
+ proj_dropout=0.0,
123
+ norm_eps=1e-5):
124
+ assert activation in ['quick_gelu', 'gelu', 'swi_glu']
125
+ super().__init__()
126
+ self.dim = dim
127
+ self.mlp_ratio = mlp_ratio
128
+ self.num_heads = num_heads
129
+ self.post_norm = post_norm
130
+ self.causal = causal
131
+ self.norm_eps = norm_eps
132
+
133
+ # layers
134
+ self.norm1 = LayerNorm(dim, eps=norm_eps)
135
+ self.attn = SelfAttention(dim, num_heads, causal, attn_dropout,
136
+ proj_dropout)
137
+ self.norm2 = LayerNorm(dim, eps=norm_eps)
138
+ if activation == 'swi_glu':
139
+ self.mlp = SwiGLU(dim, int(dim * mlp_ratio))
140
+ else:
141
+ self.mlp = nn.Sequential(
142
+ nn.Linear(dim, int(dim * mlp_ratio)),
143
+ QuickGELU() if activation == 'quick_gelu' else nn.GELU(),
144
+ nn.Linear(int(dim * mlp_ratio), dim), nn.Dropout(proj_dropout))
145
+
146
+ def forward(self, x):
147
+ if self.post_norm:
148
+ x = x + self.norm1(self.attn(x))
149
+ x = x + self.norm2(self.mlp(x))
150
+ else:
151
+ x = x + self.attn(self.norm1(x))
152
+ x = x + self.mlp(self.norm2(x))
153
+ return x
154
+
155
+
156
+ class AttentionPool(nn.Module):
157
+
158
+ def __init__(self,
159
+ dim,
160
+ mlp_ratio,
161
+ num_heads,
162
+ activation='gelu',
163
+ proj_dropout=0.0,
164
+ norm_eps=1e-5):
165
+ assert dim % num_heads == 0
166
+ super().__init__()
167
+ self.dim = dim
168
+ self.mlp_ratio = mlp_ratio
169
+ self.num_heads = num_heads
170
+ self.head_dim = dim // num_heads
171
+ self.proj_dropout = proj_dropout
172
+ self.norm_eps = norm_eps
173
+
174
+ # layers
175
+ gain = 1.0 / math.sqrt(dim)
176
+ self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
177
+ self.to_q = nn.Linear(dim, dim)
178
+ self.to_kv = nn.Linear(dim, dim * 2)
179
+ self.proj = nn.Linear(dim, dim)
180
+ self.norm = LayerNorm(dim, eps=norm_eps)
181
+ self.mlp = nn.Sequential(
182
+ nn.Linear(dim, int(dim * mlp_ratio)),
183
+ QuickGELU() if activation == 'quick_gelu' else nn.GELU(),
184
+ nn.Linear(int(dim * mlp_ratio), dim), nn.Dropout(proj_dropout))
185
+
186
+ def forward(self, x):
187
+ """
188
+ x: [B, L, C].
189
+ """
190
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
191
+
192
+ # compute query, key, value
193
+ q = self.to_q(self.cls_embedding).view(1, 1, n, d).expand(b, -1, -1, -1)
194
+ k, v = self.to_kv(x).view(b, s, 2, n, d).unbind(2)
195
+
196
+ # compute attention
197
+ x = flash_attention(q, k, v, version=2)
198
+ x = x.reshape(b, 1, c)
199
+
200
+ # output
201
+ x = self.proj(x)
202
+ x = F.dropout(x, self.proj_dropout, self.training)
203
+
204
+ # mlp
205
+ x = x + self.mlp(self.norm(x))
206
+ return x[:, 0]
207
+
208
+
209
+ class VisionTransformer(nn.Module):
210
+
211
+ def __init__(self,
212
+ image_size=224,
213
+ patch_size=16,
214
+ dim=768,
215
+ mlp_ratio=4,
216
+ out_dim=512,
217
+ num_heads=12,
218
+ num_layers=12,
219
+ pool_type='token',
220
+ pre_norm=True,
221
+ post_norm=False,
222
+ activation='quick_gelu',
223
+ attn_dropout=0.0,
224
+ proj_dropout=0.0,
225
+ embedding_dropout=0.0,
226
+ norm_eps=1e-5):
227
+ if image_size % patch_size != 0:
228
+ print(
229
+ '[WARNING] image_size is not divisible by patch_size',
230
+ flush=True)
231
+ assert pool_type in ('token', 'token_fc', 'attn_pool')
232
+ out_dim = out_dim or dim
233
+ super().__init__()
234
+ self.image_size = image_size
235
+ self.patch_size = patch_size
236
+ self.num_patches = (image_size // patch_size)**2
237
+ self.dim = dim
238
+ self.mlp_ratio = mlp_ratio
239
+ self.out_dim = out_dim
240
+ self.num_heads = num_heads
241
+ self.num_layers = num_layers
242
+ self.pool_type = pool_type
243
+ self.post_norm = post_norm
244
+ self.norm_eps = norm_eps
245
+
246
+ # embeddings
247
+ gain = 1.0 / math.sqrt(dim)
248
+ self.patch_embedding = nn.Conv2d(
249
+ 3,
250
+ dim,
251
+ kernel_size=patch_size,
252
+ stride=patch_size,
253
+ bias=not pre_norm)
254
+ if pool_type in ('token', 'token_fc'):
255
+ self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
256
+ self.pos_embedding = nn.Parameter(gain * torch.randn(
257
+ 1, self.num_patches +
258
+ (1 if pool_type in ('token', 'token_fc') else 0), dim))
259
+ self.dropout = nn.Dropout(embedding_dropout)
260
+
261
+ # transformer
262
+ self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
263
+ self.transformer = nn.Sequential(*[
264
+ AttentionBlock(dim, mlp_ratio, num_heads, post_norm, False,
265
+ activation, attn_dropout, proj_dropout, norm_eps)
266
+ for _ in range(num_layers)
267
+ ])
268
+ self.post_norm = LayerNorm(dim, eps=norm_eps)
269
+
270
+ # head
271
+ if pool_type == 'token':
272
+ self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
273
+ elif pool_type == 'token_fc':
274
+ self.head = nn.Linear(dim, out_dim)
275
+ elif pool_type == 'attn_pool':
276
+ self.head = AttentionPool(dim, mlp_ratio, num_heads, activation,
277
+ proj_dropout, norm_eps)
278
+
279
+ def forward(self, x, interpolation=False, use_31_block=False):
280
+ b = x.size(0)
281
+
282
+ # embeddings
283
+ x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
284
+ if self.pool_type in ('token', 'token_fc'):
285
+ x = torch.cat([self.cls_embedding.expand(b, -1, -1), x], dim=1)
286
+ if interpolation:
287
+ e = pos_interpolate(self.pos_embedding, x.size(1))
288
+ else:
289
+ e = self.pos_embedding
290
+ x = self.dropout(x + e)
291
+ if self.pre_norm is not None:
292
+ x = self.pre_norm(x)
293
+
294
+ # transformer
295
+ if use_31_block:
296
+ x = self.transformer[:-1](x)
297
+ return x
298
+ else:
299
+ x = self.transformer(x)
300
+ return x
301
+
302
+
303
+ class XLMRobertaWithHead(XLMRoberta):
304
+
305
+ def __init__(self, **kwargs):
306
+ self.out_dim = kwargs.pop('out_dim')
307
+ super().__init__(**kwargs)
308
+
309
+ # head
310
+ mid_dim = (self.dim + self.out_dim) // 2
311
+ self.head = nn.Sequential(
312
+ nn.Linear(self.dim, mid_dim, bias=False), nn.GELU(),
313
+ nn.Linear(mid_dim, self.out_dim, bias=False))
314
+
315
+ def forward(self, ids):
316
+ # xlm-roberta
317
+ x = super().forward(ids)
318
+
319
+ # average pooling
320
+ mask = ids.ne(self.pad_id).unsqueeze(-1).to(x)
321
+ x = (x * mask).sum(dim=1) / mask.sum(dim=1)
322
+
323
+ # head
324
+ x = self.head(x)
325
+ return x
326
+
327
+
328
+ class XLMRobertaCLIP(nn.Module):
329
+
330
+ def __init__(self,
331
+ embed_dim=1024,
332
+ image_size=224,
333
+ patch_size=14,
334
+ vision_dim=1280,
335
+ vision_mlp_ratio=4,
336
+ vision_heads=16,
337
+ vision_layers=32,
338
+ vision_pool='token',
339
+ vision_pre_norm=True,
340
+ vision_post_norm=False,
341
+ activation='gelu',
342
+ vocab_size=250002,
343
+ max_text_len=514,
344
+ type_size=1,
345
+ pad_id=1,
346
+ text_dim=1024,
347
+ text_heads=16,
348
+ text_layers=24,
349
+ text_post_norm=True,
350
+ text_dropout=0.1,
351
+ attn_dropout=0.0,
352
+ proj_dropout=0.0,
353
+ embedding_dropout=0.0,
354
+ norm_eps=1e-5):
355
+ super().__init__()
356
+ self.embed_dim = embed_dim
357
+ self.image_size = image_size
358
+ self.patch_size = patch_size
359
+ self.vision_dim = vision_dim
360
+ self.vision_mlp_ratio = vision_mlp_ratio
361
+ self.vision_heads = vision_heads
362
+ self.vision_layers = vision_layers
363
+ self.vision_pre_norm = vision_pre_norm
364
+ self.vision_post_norm = vision_post_norm
365
+ self.activation = activation
366
+ self.vocab_size = vocab_size
367
+ self.max_text_len = max_text_len
368
+ self.type_size = type_size
369
+ self.pad_id = pad_id
370
+ self.text_dim = text_dim
371
+ self.text_heads = text_heads
372
+ self.text_layers = text_layers
373
+ self.text_post_norm = text_post_norm
374
+ self.norm_eps = norm_eps
375
+
376
+ # models
377
+ self.visual = VisionTransformer(
378
+ image_size=image_size,
379
+ patch_size=patch_size,
380
+ dim=vision_dim,
381
+ mlp_ratio=vision_mlp_ratio,
382
+ out_dim=embed_dim,
383
+ num_heads=vision_heads,
384
+ num_layers=vision_layers,
385
+ pool_type=vision_pool,
386
+ pre_norm=vision_pre_norm,
387
+ post_norm=vision_post_norm,
388
+ activation=activation,
389
+ attn_dropout=attn_dropout,
390
+ proj_dropout=proj_dropout,
391
+ embedding_dropout=embedding_dropout,
392
+ norm_eps=norm_eps)
393
+ self.textual = XLMRobertaWithHead(
394
+ vocab_size=vocab_size,
395
+ max_seq_len=max_text_len,
396
+ type_size=type_size,
397
+ pad_id=pad_id,
398
+ dim=text_dim,
399
+ out_dim=embed_dim,
400
+ num_heads=text_heads,
401
+ num_layers=text_layers,
402
+ post_norm=text_post_norm,
403
+ dropout=text_dropout)
404
+ self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
405
+
406
+ def forward(self, imgs, txt_ids):
407
+ """
408
+ imgs: [B, 3, H, W] of torch.float32.
409
+ - mean: [0.48145466, 0.4578275, 0.40821073]
410
+ - std: [0.26862954, 0.26130258, 0.27577711]
411
+ txt_ids: [B, L] of torch.long.
412
+ Encoded by data.CLIPTokenizer.
413
+ """
414
+ xi = self.visual(imgs)
415
+ xt = self.textual(txt_ids)
416
+ return xi, xt
417
+
418
+ def param_groups(self):
419
+ groups = [{
420
+ 'params': [
421
+ p for n, p in self.named_parameters()
422
+ if 'norm' in n or n.endswith('bias')
423
+ ],
424
+ 'weight_decay': 0.0
425
+ }, {
426
+ 'params': [
427
+ p for n, p in self.named_parameters()
428
+ if not ('norm' in n or n.endswith('bias'))
429
+ ]
430
+ }]
431
+ return groups
432
+
433
+
434
+ def _clip(pretrained=False,
435
+ pretrained_name=None,
436
+ model_cls=XLMRobertaCLIP,
437
+ return_transforms=False,
438
+ return_tokenizer=False,
439
+ tokenizer_padding='eos',
440
+ dtype=torch.float32,
441
+ device='cpu',
442
+ **kwargs):
443
+ # init a model on device
444
+ with torch.device(device):
445
+ model = model_cls(**kwargs)
446
+
447
+ # set device
448
+ model = model.to(dtype=dtype, device=device)
449
+ output = (model,)
450
+
451
+ # init transforms
452
+ if return_transforms:
453
+ # mean and std
454
+ if 'siglip' in pretrained_name.lower():
455
+ mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
456
+ else:
457
+ mean = [0.48145466, 0.4578275, 0.40821073]
458
+ std = [0.26862954, 0.26130258, 0.27577711]
459
+
460
+ # transforms
461
+ transforms = T.Compose([
462
+ T.Resize((model.image_size, model.image_size),
463
+ interpolation=T.InterpolationMode.BICUBIC),
464
+ T.ToTensor(),
465
+ T.Normalize(mean=mean, std=std)
466
+ ])
467
+ output += (transforms,)
468
+ return output[0] if len(output) == 1 else output
469
+
470
+
471
+ def clip_xlm_roberta_vit_h_14(
472
+ pretrained=False,
473
+ pretrained_name='open-clip-xlm-roberta-large-vit-huge-14',
474
+ **kwargs):
475
+ cfg = dict(
476
+ embed_dim=1024,
477
+ image_size=224,
478
+ patch_size=14,
479
+ vision_dim=1280,
480
+ vision_mlp_ratio=4,
481
+ vision_heads=16,
482
+ vision_layers=32,
483
+ vision_pool='token',
484
+ activation='gelu',
485
+ vocab_size=250002,
486
+ max_text_len=514,
487
+ type_size=1,
488
+ pad_id=1,
489
+ text_dim=1024,
490
+ text_heads=16,
491
+ text_layers=24,
492
+ text_post_norm=True,
493
+ text_dropout=0.1,
494
+ attn_dropout=0.0,
495
+ proj_dropout=0.0,
496
+ embedding_dropout=0.0)
497
+ cfg.update(**kwargs)
498
+ return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **cfg)
499
+
500
+
501
+ class CLIPModel:
502
+
503
+ def __init__(self, dtype, device, checkpoint_path, tokenizer_path):
504
+ self.dtype = dtype
505
+ self.device = device
506
+ self.checkpoint_path = checkpoint_path
507
+ self.tokenizer_path = tokenizer_path
508
+
509
+ # init model
510
+ self.model, self.transforms = clip_xlm_roberta_vit_h_14(
511
+ pretrained=False,
512
+ return_transforms=True,
513
+ return_tokenizer=False,
514
+ dtype=dtype,
515
+ device=device)
516
+ self.model = self.model.eval().requires_grad_(False)
517
+ logging.info(f'loading {checkpoint_path}')
518
+ self.model.load_state_dict(
519
+ torch.load(checkpoint_path, map_location='cpu'))
520
+
521
+ # init tokenizer
522
+ self.tokenizer = HuggingfaceTokenizer(
523
+ name=tokenizer_path,
524
+ seq_len=self.model.max_text_len - 2,
525
+ clean='whitespace')
526
+
527
+ def visual(self, videos):
528
+ # preprocess
529
+ size = (self.model.image_size,) * 2
530
+ videos = torch.cat([
531
+ F.interpolate(
532
+ u.transpose(0, 1),
533
+ size=size,
534
+ mode='bicubic',
535
+ align_corners=False) for u in videos
536
+ ])
537
+ videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))
538
+
539
+ # forward
540
+ with torch.cuda.amp.autocast(dtype=self.dtype):
541
+ out = self.model.visual(videos, use_31_block=True)
542
+ return out
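
A hedged sketch of how CLIPModel.visual() is used elsewhere in this repo (see image2video.py above): it takes a list of [C, T, H, W] clips with pixel values in [-1, 1] and returns the tokens from the penultimate ViT block. The checkpoint and tokenizer paths are placeholders, not files shipped in this commit:

import torch
from wan.modules.clip import CLIPModel

clip = CLIPModel(
    dtype=torch.float16,
    device=torch.device('cuda'),
    checkpoint_path='weights/clip_vit_h_14.pth',   # placeholder path
    tokenizer_path='xlm-roberta-large')            # placeholder tokenizer name

frame = torch.rand(3, 1, 480, 832, device='cuda') * 2 - 1   # one frame in [-1, 1]
feats = clip.visual([frame])
print(feats.shape)   # [1, 257, 1280]: 256 patch tokens + CLS for ViT-H/14 at 224 px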
wan/modules/model.py ADDED
@@ -0,0 +1,631 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import math
3
+
4
+ import torch
5
+ import torch.cuda.amp as amp
6
+ import torch.nn as nn
7
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
8
+ from diffusers.models.modeling_utils import ModelMixin
9
+
10
+ from .attention import flash_attention
11
+
12
+ __all__ = ['WanModel']
13
+
14
+ T5_CONTEXT_TOKEN_NUMBER = 512
15
+ FIRST_LAST_FRAME_CONTEXT_TOKEN_NUMBER = 257 * 2
16
+
17
+
18
+ def sinusoidal_embedding_1d(dim, position):
19
+ # preprocess
20
+ assert dim % 2 == 0
21
+ half = dim // 2
22
+ position = position.type(torch.float64)
23
+
24
+ # calculation
25
+ sinusoid = torch.outer(
26
+ position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
27
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
28
+ return x
29
+
30
+
31
+ @amp.autocast(enabled=False)
32
+ def rope_params(max_seq_len, dim, theta=10000):
33
+ assert dim % 2 == 0
34
+ freqs = torch.outer(
35
+ torch.arange(max_seq_len),
36
+ 1.0 / torch.pow(theta,
37
+ torch.arange(0, dim, 2).to(torch.float64).div(dim)))
38
+ freqs = torch.polar(torch.ones_like(freqs), freqs)
39
+ return freqs
40
+
41
+
42
+ @amp.autocast(enabled=False)
43
+ def rope_apply(x, grid_sizes, freqs):
44
+ n, c = x.size(2), x.size(3) // 2
45
+
46
+ # split freqs
47
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
48
+
49
+ # loop over samples
50
+ output = []
51
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
52
+ seq_len = f * h * w
53
+
54
+ # precompute multipliers
55
+ x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float64).reshape(
56
+ seq_len, n, -1, 2))
57
+ freqs_i = torch.cat([
58
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
59
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
60
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
61
+ ],
62
+ dim=-1).reshape(seq_len, 1, -1)
63
+
64
+ # apply rotary embedding
65
+ x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
66
+ x_i = torch.cat([x_i, x[i, seq_len:]])
67
+
68
+ # append to collection
69
+ output.append(x_i)
70
+ return torch.stack(output).float()
71
+
72
+
73
+ class WanRMSNorm(nn.Module):
74
+
75
+ def __init__(self, dim, eps=1e-5):
76
+ super().__init__()
77
+ self.dim = dim
78
+ self.eps = eps
79
+ self.weight = nn.Parameter(torch.ones(dim))
80
+
81
+ def forward(self, x):
82
+ r"""
83
+ Args:
84
+ x(Tensor): Shape [B, L, C]
85
+ """
86
+ return self._norm(x.float()).type_as(x) * self.weight
87
+
88
+ def _norm(self, x):
89
+ return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
90
+
91
+
92
+ class WanLayerNorm(nn.LayerNorm):
93
+
94
+ def __init__(self, dim, eps=1e-6, elementwise_affine=False):
95
+ super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)
96
+
97
+ def forward(self, x):
98
+ r"""
99
+ Args:
100
+ x(Tensor): Shape [B, L, C]
101
+ """
102
+ return super().forward(x.float()).type_as(x)
103
+
104
+
105
+ class WanSelfAttention(nn.Module):
106
+
107
+ def __init__(self,
108
+ dim,
109
+ num_heads,
110
+ window_size=(-1, -1),
111
+ qk_norm=True,
112
+ eps=1e-6):
113
+ assert dim % num_heads == 0
114
+ super().__init__()
115
+ self.dim = dim
116
+ self.num_heads = num_heads
117
+ self.head_dim = dim // num_heads
118
+ self.window_size = window_size
119
+ self.qk_norm = qk_norm
120
+ self.eps = eps
121
+
122
+ # layers
123
+ self.q = nn.Linear(dim, dim)
124
+ self.k = nn.Linear(dim, dim)
125
+ self.v = nn.Linear(dim, dim)
126
+ self.o = nn.Linear(dim, dim)
127
+ self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
128
+ self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
129
+
130
+ def forward(self, x, seq_lens, grid_sizes, freqs):
131
+ r"""
132
+ Args:
133
+ x(Tensor): Shape [B, L, num_heads, C / num_heads]
134
+ seq_lens(Tensor): Shape [B]
135
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
136
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
137
+ """
138
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
139
+
140
+ # query, key, value function
141
+ def qkv_fn(x):
142
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
143
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
144
+ v = self.v(x).view(b, s, n, d)
145
+ return q, k, v
146
+
147
+ q, k, v = qkv_fn(x)
148
+
149
+ x = flash_attention(
150
+ q=rope_apply(q, grid_sizes, freqs),
151
+ k=rope_apply(k, grid_sizes, freqs),
152
+ v=v,
153
+ k_lens=seq_lens,
154
+ window_size=self.window_size)
155
+
156
+ # output
157
+ x = x.flatten(2)
158
+ x = self.o(x)
159
+ return x
160
+
161
+
162
+ class WanT2VCrossAttention(WanSelfAttention):
163
+
164
+ def forward(self, x, context, context_lens):
165
+ r"""
166
+ Args:
167
+ x(Tensor): Shape [B, L1, C]
168
+ context(Tensor): Shape [B, L2, C]
169
+ context_lens(Tensor): Shape [B]
170
+ """
171
+ b, n, d = x.size(0), self.num_heads, self.head_dim
172
+
173
+ # compute query, key, value
174
+ q = self.norm_q(self.q(x)).view(b, -1, n, d)
175
+ k = self.norm_k(self.k(context)).view(b, -1, n, d)
176
+ v = self.v(context).view(b, -1, n, d)
177
+
178
+ # compute attention
179
+ x = flash_attention(q, k, v, k_lens=context_lens)
180
+
181
+ # output
182
+ x = x.flatten(2)
183
+ x = self.o(x)
184
+ return x
185
+
186
+
187
+ class WanI2VCrossAttention(WanSelfAttention):
188
+
189
+ def __init__(self,
190
+ dim,
191
+ num_heads,
192
+ window_size=(-1, -1),
193
+ qk_norm=True,
194
+ eps=1e-6):
195
+ super().__init__(dim, num_heads, window_size, qk_norm, eps)
196
+
197
+ self.k_img = nn.Linear(dim, dim)
198
+ self.v_img = nn.Linear(dim, dim)
199
+ # self.alpha = nn.Parameter(torch.zeros((1, )))
200
+ self.norm_k_img = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
201
+
202
+ def forward(self, x, context, context_lens):
203
+ r"""
204
+ Args:
205
+ x(Tensor): Shape [B, L1, C]
206
+ context(Tensor): Shape [B, L2, C]
207
+ context_lens(Tensor): Shape [B]
208
+ """
209
+ image_context_length = context.shape[1] - T5_CONTEXT_TOKEN_NUMBER
210
+ context_img = context[:, :image_context_length]
211
+ context = context[:, image_context_length:]
212
+ b, n, d = x.size(0), self.num_heads, self.head_dim
213
+
214
+ # compute query, key, value
215
+ q = self.norm_q(self.q(x)).view(b, -1, n, d)
216
+ k = self.norm_k(self.k(context)).view(b, -1, n, d)
217
+ v = self.v(context).view(b, -1, n, d)
218
+ k_img = self.norm_k_img(self.k_img(context_img)).view(b, -1, n, d)
219
+ v_img = self.v_img(context_img).view(b, -1, n, d)
220
+ img_x = flash_attention(q, k_img, v_img, k_lens=None)
221
+ # compute attention
222
+ x = flash_attention(q, k, v, k_lens=context_lens)
223
+
224
+ # output
225
+ x = x.flatten(2)
226
+ img_x = img_x.flatten(2)
227
+ x = x + img_x
228
+ x = self.o(x)
229
+ return x
230
+
231
+
232
+ WAN_CROSSATTENTION_CLASSES = {
233
+ 't2v_cross_attn': WanT2VCrossAttention,
234
+ 'i2v_cross_attn': WanI2VCrossAttention,
235
+ }
236
+
237
+
238
+ class WanAttentionBlock(nn.Module):
239
+
240
+ def __init__(self,
241
+ cross_attn_type,
242
+ dim,
243
+ ffn_dim,
244
+ num_heads,
245
+ window_size=(-1, -1),
246
+ qk_norm=True,
247
+ cross_attn_norm=False,
248
+ eps=1e-6):
249
+ super().__init__()
250
+ self.dim = dim
251
+ self.ffn_dim = ffn_dim
252
+ self.num_heads = num_heads
253
+ self.window_size = window_size
254
+ self.qk_norm = qk_norm
255
+ self.cross_attn_norm = cross_attn_norm
256
+ self.eps = eps
257
+
258
+ # layers
259
+ self.norm1 = WanLayerNorm(dim, eps)
260
+ self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm,
261
+ eps)
262
+ self.norm3 = WanLayerNorm(
263
+ dim, eps,
264
+ elementwise_affine=True) if cross_attn_norm else nn.Identity()
265
+ self.cross_attn = WAN_CROSSATTENTION_CLASSES[cross_attn_type](dim,
266
+ num_heads,
267
+ (-1, -1),
268
+ qk_norm,
269
+ eps)
270
+ self.norm2 = WanLayerNorm(dim, eps)
271
+ self.ffn = nn.Sequential(
272
+ nn.Linear(dim, ffn_dim), nn.GELU(approximate='tanh'),
273
+ nn.Linear(ffn_dim, dim))
274
+
275
+ # modulation
276
+ self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
277
+
278
+ def forward(
279
+ self,
280
+ x,
281
+ e,
282
+ seq_lens,
283
+ grid_sizes,
284
+ freqs,
285
+ context,
286
+ context_lens,
287
+ ):
288
+ r"""
289
+ Args:
290
+ x(Tensor): Shape [B, L, C]
291
+ e(Tensor): Shape [B, 6, C]
292
+ seq_lens(Tensor): Shape [B], length of each sequence in batch
293
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
294
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
295
+ """
296
+ assert e.dtype == torch.float32
297
+ with amp.autocast(dtype=torch.float32):
298
+ e = (self.modulation.to(e.device) + e).chunk(6, dim=1)
299
+ assert e[0].dtype == torch.float32
300
+
301
+ # self-attention
302
+ y = self.self_attn(
303
+ self.norm1(x).float() * (1 + e[1]) + e[0], seq_lens, grid_sizes,
304
+ freqs)
305
+ with amp.autocast(dtype=torch.float32):
306
+ x = x + y * e[2]
307
+
308
+ # cross-attention & ffn function
309
+ def cross_attn_ffn(x, context, context_lens, e):
310
+ x = x + self.cross_attn(self.norm3(x), context, context_lens)
311
+ y = self.ffn(self.norm2(x).float() * (1 + e[4]) + e[3])
312
+ with amp.autocast(dtype=torch.float32):
313
+ x = x + y * e[5]
314
+ return x
315
+
316
+ x = cross_attn_ffn(x, context, context_lens, e)
317
+ return x
318
+
319
+
320
+ class Head(nn.Module):
321
+
322
+ def __init__(self, dim, out_dim, patch_size, eps=1e-6):
323
+ super().__init__()
324
+ self.dim = dim
325
+ self.out_dim = out_dim
326
+ self.patch_size = patch_size
327
+ self.eps = eps
328
+
329
+ # layers
330
+ out_dim = math.prod(patch_size) * out_dim
331
+ self.norm = WanLayerNorm(dim, eps)
332
+ self.head = nn.Linear(dim, out_dim)
333
+
334
+ # modulation
335
+ self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
336
+
337
+ def forward(self, x, e):
338
+ r"""
339
+ Args:
340
+ x(Tensor): Shape [B, L1, C]
341
+ e(Tensor): Shape [B, C]
342
+ """
343
+ assert e.dtype == torch.float32
344
+ with amp.autocast(dtype=torch.float32):
345
+ e = (self.modulation.to(e.device) + e.unsqueeze(1)).chunk(2, dim=1)
346
+ x = (self.head(self.norm(x) * (1 + e[1]) + e[0]))
347
+ return x
348
+
349
+
350
+ class MLPProj(torch.nn.Module):
351
+
352
+ def __init__(self, in_dim, out_dim, flf_pos_emb=False):
353
+ super().__init__()
354
+
355
+ self.proj = torch.nn.Sequential(
356
+ torch.nn.LayerNorm(in_dim), torch.nn.Linear(in_dim, in_dim),
357
+ torch.nn.GELU(), torch.nn.Linear(in_dim, out_dim),
358
+ torch.nn.LayerNorm(out_dim))
359
+ if flf_pos_emb: # NOTE: we only use this for `flf2v`
360
+ self.emb_pos = nn.Parameter(
361
+ torch.zeros(1, FIRST_LAST_FRAME_CONTEXT_TOKEN_NUMBER, 1280))
362
+
363
+ def forward(self, image_embeds):
364
+ if hasattr(self, 'emb_pos'):
365
+ bs, n, d = image_embeds.shape
366
+ image_embeds = image_embeds.view(-1, 2 * n, d)
367
+ image_embeds = image_embeds + self.emb_pos
368
+ clip_extra_context_tokens = self.proj(image_embeds)
369
+ return clip_extra_context_tokens
370
+
371
+
372
+ class WanModel(ModelMixin, ConfigMixin):
373
+ r"""
374
+ Wan diffusion backbone supporting both text-to-video and image-to-video.
375
+ """
376
+
377
+ ignore_for_config = [
378
+ 'patch_size', 'cross_attn_norm', 'qk_norm', 'text_dim', 'window_size'
379
+ ]
380
+ _no_split_modules = ['WanAttentionBlock']
381
+
382
+ @register_to_config
383
+ def __init__(self,
384
+ model_type='t2v',
385
+ patch_size=(1, 2, 2),
386
+ text_len=512,
387
+ in_dim=16,
388
+ dim=2048,
389
+ ffn_dim=8192,
390
+ freq_dim=256,
391
+ text_dim=4096,
392
+ out_dim=16,
393
+ num_heads=16,
394
+ num_layers=32,
395
+ window_size=(-1, -1),
396
+ qk_norm=True,
397
+ cross_attn_norm=True,
398
+ eps=1e-6):
399
+ r"""
400
+ Initialize the diffusion model backbone.
401
+
402
+ Args:
403
+ model_type (`str`, *optional*, defaults to 't2v'):
404
+ Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video) or 'flf2v' (first-last-frame-to-video) or 'vace'
405
+ patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
406
+ 3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
407
+ text_len (`int`, *optional*, defaults to 512):
408
+ Fixed length for text embeddings
409
+ in_dim (`int`, *optional*, defaults to 16):
410
+ Input video channels (C_in)
411
+ dim (`int`, *optional*, defaults to 2048):
412
+ Hidden dimension of the transformer
413
+ ffn_dim (`int`, *optional*, defaults to 8192):
414
+ Intermediate dimension in feed-forward network
415
+ freq_dim (`int`, *optional*, defaults to 256):
416
+ Dimension for sinusoidal time embeddings
417
+ text_dim (`int`, *optional*, defaults to 4096):
418
+ Input dimension for text embeddings
419
+ out_dim (`int`, *optional*, defaults to 16):
420
+ Output video channels (C_out)
421
+ num_heads (`int`, *optional*, defaults to 16):
422
+ Number of attention heads
423
+ num_layers (`int`, *optional*, defaults to 32):
424
+ Number of transformer blocks
425
+ window_size (`tuple`, *optional*, defaults to (-1, -1)):
426
+ Window size for local attention (-1 indicates global attention)
427
+ qk_norm (`bool`, *optional*, defaults to True):
428
+ Enable query/key normalization
429
+ cross_attn_norm (`bool`, *optional*, defaults to True):
430
+ Enable cross-attention normalization
431
+ eps (`float`, *optional*, defaults to 1e-6):
432
+ Epsilon value for normalization layers
433
+ """
434
+
435
+ super().__init__()
436
+
437
+ assert model_type in ['t2v', 'i2v', 'flf2v', 'vace']
438
+ self.model_type = model_type
439
+
440
+ self.patch_size = patch_size
441
+ self.text_len = text_len
442
+ self.in_dim = in_dim
443
+ self.dim = dim
444
+ self.ffn_dim = ffn_dim
445
+ self.freq_dim = freq_dim
446
+ self.text_dim = text_dim
447
+ self.out_dim = out_dim
448
+ self.num_heads = num_heads
449
+ self.num_layers = num_layers
450
+ self.window_size = window_size
451
+ self.qk_norm = qk_norm
452
+ self.cross_attn_norm = cross_attn_norm
453
+ self.eps = eps
454
+
455
+ # embeddings
456
+ self.patch_embedding = nn.Conv3d(
457
+ in_dim, dim, kernel_size=patch_size, stride=patch_size)
458
+ self.text_embedding = nn.Sequential(
459
+ nn.Linear(text_dim, dim), nn.GELU(approximate='tanh'),
460
+ nn.Linear(dim, dim))
461
+
462
+ self.time_embedding = nn.Sequential(
463
+ nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
464
+ self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
465
+
466
+ # blocks
467
+ cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn'
468
+ self.blocks = nn.ModuleList([
469
+ WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
470
+ window_size, qk_norm, cross_attn_norm, eps)
471
+ for _ in range(num_layers)
472
+ ])
473
+
474
+ # head
475
+ self.head = Head(dim, out_dim, patch_size, eps)
476
+
477
+ # buffers (don't use register_buffer otherwise dtype will be changed in to())
478
+ assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
479
+ d = dim // num_heads
480
+ self.freqs = torch.cat([
481
+ rope_params(1024, d - 4 * (d // 6)),
482
+ rope_params(1024, 2 * (d // 6)),
483
+ rope_params(1024, 2 * (d // 6))
484
+ ],
485
+ dim=1)
486
+
487
+ if model_type == 'i2v' or model_type == 'flf2v':
488
+ self.img_emb = MLPProj(1280, dim, flf_pos_emb=model_type == 'flf2v')
489
+
490
+ # initialize weights
491
+ self.init_weights()
492
+
493
+ def forward(
494
+ self,
495
+ x,
496
+ t,
497
+ context,
498
+ seq_len,
499
+ clip_fea=None,
500
+ y=None,
501
+ ):
502
+ r"""
503
+ Forward pass through the diffusion model
504
+
505
+ Args:
506
+ x (List[Tensor]):
507
+ List of input video tensors, each with shape [C_in, F, H, W]
508
+ t (Tensor):
509
+ Diffusion timesteps tensor of shape [B]
510
+ context (List[Tensor]):
511
+ List of text embeddings each with shape [L, C]
512
+ seq_len (`int`):
513
+ Maximum sequence length for positional encoding
514
+ clip_fea (Tensor, *optional*):
515
+ CLIP image features for image-to-video mode or first-last-frame-to-video mode
516
+ y (List[Tensor], *optional*):
517
+ Conditional video inputs for image-to-video mode, same shape as x
518
+
519
+ Returns:
520
+ List[Tensor]:
521
+ List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
522
+ """
523
+ if self.model_type == 'i2v' or self.model_type == 'flf2v':
524
+ assert clip_fea is not None and y is not None
525
+ # params
526
+ device = self.patch_embedding.weight.device
527
+ if self.freqs.device != device:
528
+ self.freqs = self.freqs.to(device)
529
+
530
+ if y is not None:
531
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
532
+
533
+ # embeddings
534
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
535
+ grid_sizes = torch.stack(
536
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
537
+ x = [u.flatten(2).transpose(1, 2) for u in x]
538
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
539
+ assert seq_lens.max() <= seq_len
540
+ x = torch.cat([
541
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
542
+ dim=1) for u in x
543
+ ])
544
+
545
+ # time embeddings
546
+ with amp.autocast(dtype=torch.float32):
547
+ e = self.time_embedding(
548
+ sinusoidal_embedding_1d(self.freq_dim, t).float())
549
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
550
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
551
+
552
+ # context
553
+ context_lens = None
554
+ context = self.text_embedding(
555
+ torch.stack([
556
+ torch.cat(
557
+ [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
558
+ for u in context
559
+ ]))
560
+
561
+ if clip_fea is not None:
562
+ context_clip = self.img_emb(clip_fea) # bs x 257 (x2) x dim
563
+ context = torch.concat([context_clip, context], dim=1)
564
+
565
+ # arguments
566
+ kwargs = dict(
567
+ e=e0,
568
+ seq_lens=seq_lens,
569
+ grid_sizes=grid_sizes,
570
+ freqs=self.freqs,
571
+ context=context,
572
+ context_lens=context_lens)
573
+
574
+ for block in self.blocks:
575
+ x = block(x, **kwargs)
576
+
577
+ # head
578
+ x = self.head(x, e)
579
+
580
+ # unpatchify
581
+ x = self.unpatchify(x, grid_sizes)
582
+ return [u.float() for u in x]
583
+
584
+ def unpatchify(self, x, grid_sizes):
585
+ r"""
586
+ Reconstruct video tensors from patch embeddings.
587
+
588
+ Args:
589
+ x (List[Tensor]):
590
+ List of patchified features, each with shape [L, C_out * prod(patch_size)]
591
+ grid_sizes (Tensor):
592
+ Original spatial-temporal grid dimensions before patching,
593
+ shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
594
+
595
+ Returns:
596
+ List[Tensor]:
597
+ Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
598
+ """
599
+
600
+ c = self.out_dim
601
+ out = []
602
+ for u, v in zip(x, grid_sizes.tolist()):
603
+ u = u[:math.prod(v)].view(*v, *self.patch_size, c)
604
+ u = torch.einsum('fhwpqrc->cfphqwr', u)
605
+ u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
606
+ out.append(u)
607
+ return out
608
+
609
+ def init_weights(self):
610
+ r"""
611
+ Initialize model parameters using Xavier initialization.
612
+ """
613
+
614
+ # basic init
615
+ for m in self.modules():
616
+ if isinstance(m, nn.Linear):
617
+ nn.init.xavier_uniform_(m.weight)
618
+ if m.bias is not None:
619
+ nn.init.zeros_(m.bias)
620
+
621
+ # init embeddings
622
+ nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
623
+ for m in self.text_embedding.modules():
624
+ if isinstance(m, nn.Linear):
625
+ nn.init.normal_(m.weight, std=.02)
626
+ for m in self.time_embedding.modules():
627
+ if isinstance(m, nn.Linear):
628
+ nn.init.normal_(m.weight, std=.02)
629
+
630
+ # init output layer
631
+ nn.init.zeros_(self.head.head.weight)
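The head output above is turned back into a video grid by `unpatchify`, which inverts the Conv3d patch embedding via a single einsum. A minimal standalone sketch of that rearrangement on dummy data (not part of this commit; the shapes and `out_dim` are illustrative):

import math
import torch

patch_size = (1, 2, 2)          # (t_patch, h_patch, w_patch), as in WanModel
c = 4                           # out_dim for the sketch
f, h, w = 3, 4, 5               # grid sizes: F_patches, H_patches, W_patches

# fake head output for one sample: [L, C_out * prod(patch_size)] with L = f*h*w
x = torch.randn(f * h * w, c * math.prod(patch_size))

u = x.view(f, h, w, *patch_size, c)        # [f, h, w, p, q, r, c]
u = torch.einsum('fhwpqrc->cfphqwr', u)    # interleave patch dims with grid dims
video = u.reshape(c, f * patch_size[0], h * patch_size[1], w * patch_size[2])
print(video.shape)                          # torch.Size([4, 3, 8, 10])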
wan/modules/multitalk_model.py ADDED
@@ -0,0 +1,799 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import math
3
+ import numpy as np
4
+ import os
5
+ import torch
6
+ import torch.cuda.amp as amp
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from einops import rearrange
11
+ from diffusers import ModelMixin
12
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
13
+
14
+ from .attention import flash_attention, SingleStreamMutiAttention
15
+ from ..utils.multitalk_utils import get_attn_map_with_target
16
+
17
+ __all__ = ['WanModel']
18
+
19
+
20
+
21
+ def sinusoidal_embedding_1d(dim, position):
22
+ # preprocess
23
+ assert dim % 2 == 0
24
+ half = dim // 2
25
+ position = position.type(torch.float64)
26
+
27
+ # calculation
28
+ sinusoid = torch.outer(
29
+ position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
30
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
31
+ return x
32
+
33
+
34
+ @amp.autocast(enabled=False)
35
+ def rope_params(max_seq_len, dim, theta=10000):
36
+
37
+ assert dim % 2 == 0
38
+ freqs = torch.outer(
39
+ torch.arange(max_seq_len),
40
+ 1.0 / torch.pow(theta,
41
+ torch.arange(0, dim, 2).to(torch.float64).div(dim)))
42
+ freqs = torch.polar(torch.ones_like(freqs), freqs)
43
+ return freqs
44
+
45
+
46
+ @amp.autocast(enabled=False)
47
+ def rope_apply(x, grid_sizes, freqs):
48
+ s, n, c = x.size(1), x.size(2), x.size(3) // 2
49
+
50
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
51
+
52
+ output = []
53
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
54
+ seq_len = f * h * w
55
+
56
+ x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(
57
+ s, n, -1, 2))
58
+ freqs_i = torch.cat([
59
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
60
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
61
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
62
+ ],
63
+ dim=-1).reshape(seq_len, 1, -1)
64
+ freqs_i = freqs_i.to(device=x_i.device)
65
+ x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
66
+ x_i = torch.cat([x_i, x[i, seq_len:]])
67
+
68
+ output.append(x_i)
69
+ return torch.stack(output).float()
70
+
71
+
72
+ class WanRMSNorm(nn.Module):
73
+
74
+ def __init__(self, dim, eps=1e-5):
75
+ super().__init__()
76
+ self.dim = dim
77
+ self.eps = eps
78
+ self.weight = nn.Parameter(torch.ones(dim))
79
+
80
+ def forward(self, x):
81
+ r"""
82
+ Args:
83
+ x(Tensor): Shape [B, L, C]
84
+ """
85
+ return self._norm(x.float()).type_as(x) * self.weight
86
+
87
+ def _norm(self, x):
88
+ return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
89
+
90
+
91
+ class WanLayerNorm(nn.LayerNorm):
92
+
93
+ def __init__(self, dim, eps=1e-6, elementwise_affine=False):
94
+ super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)
95
+
96
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
97
+ origin_dtype = inputs.dtype
98
+ out = F.layer_norm(
99
+ inputs.float(),
100
+ self.normalized_shape,
101
+ None if self.weight is None else self.weight.float(),
102
+ None if self.bias is None else self.bias.float(),
103
+ self.eps
104
+ ).to(origin_dtype)
105
+ return out
106
+
107
+
108
+ class WanSelfAttention(nn.Module):
109
+
110
+ def __init__(self,
111
+ dim,
112
+ num_heads,
113
+ window_size=(-1, -1),
114
+ qk_norm=True,
115
+ eps=1e-6):
116
+ assert dim % num_heads == 0
117
+ super().__init__()
118
+ self.dim = dim
119
+ self.num_heads = num_heads
120
+ self.head_dim = dim // num_heads
121
+ self.window_size = window_size
122
+ self.qk_norm = qk_norm
123
+ self.eps = eps
124
+
125
+ # layers
126
+ self.q = nn.Linear(dim, dim)
127
+ self.k = nn.Linear(dim, dim)
128
+ self.v = nn.Linear(dim, dim)
129
+ self.o = nn.Linear(dim, dim)
130
+ self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
131
+ self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
132
+
133
+ def forward(self, x, seq_lens, grid_sizes, freqs, ref_target_masks=None):
134
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
135
+
136
+ # query, key, value function
137
+ def qkv_fn(x):
138
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
139
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
140
+ v = self.v(x).view(b, s, n, d)
141
+ return q, k, v
142
+ q, k, v = qkv_fn(x)
143
+
144
+ q = rope_apply(q, grid_sizes, freqs)
145
+ k = rope_apply(k, grid_sizes, freqs)
146
+
147
+
148
+ x = flash_attention(
149
+ q=q,
150
+ k=k,
151
+ v=v,
152
+ k_lens=seq_lens,
153
+ window_size=self.window_size
154
+ ).type_as(x)
155
+
156
+ # output
157
+ x = x.flatten(2)
158
+ x = self.o(x)
159
+ with torch.no_grad():
160
+ x_ref_attn_map = get_attn_map_with_target(q.type_as(x), k.type_as(x), grid_sizes[0],
161
+ ref_target_masks=ref_target_masks)
162
+
163
+ return x, x_ref_attn_map
164
+
165
+
166
+ class WanI2VCrossAttention(WanSelfAttention):
167
+
168
+ def __init__(self,
169
+ dim,
170
+ num_heads,
171
+ window_size=(-1, -1),
172
+ qk_norm=True,
173
+ eps=1e-6):
174
+ super().__init__(dim, num_heads, window_size, qk_norm, eps)
175
+
176
+ self.k_img = nn.Linear(dim, dim)
177
+ self.v_img = nn.Linear(dim, dim)
178
+ self.norm_k_img = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
179
+
180
+ def forward(self, x, context, context_lens):
181
+ context_img = context[:, :257]
182
+ context = context[:, 257:]
183
+ b, n, d = x.size(0), self.num_heads, self.head_dim
184
+
185
+ # compute query, key, value
186
+ q = self.norm_q(self.q(x)).view(b, -1, n, d)
187
+ k = self.norm_k(self.k(context)).view(b, -1, n, d)
188
+ v = self.v(context).view(b, -1, n, d)
189
+ k_img = self.norm_k_img(self.k_img(context_img)).view(b, -1, n, d)
190
+ v_img = self.v_img(context_img).view(b, -1, n, d)
191
+ img_x = flash_attention(q, k_img, v_img, k_lens=None)
192
+ # compute attention
193
+ x = flash_attention(q, k, v, k_lens=context_lens)
194
+
195
+ # output
196
+ x = x.flatten(2)
197
+ img_x = img_x.flatten(2)
198
+ x = x + img_x
199
+ x = self.o(x)
200
+ return x
201
+
202
+
203
+ class WanAttentionBlock(nn.Module):
204
+
205
+ def __init__(self,
206
+ cross_attn_type,
207
+ dim,
208
+ ffn_dim,
209
+ num_heads,
210
+ window_size=(-1, -1),
211
+ qk_norm=True,
212
+ cross_attn_norm=False,
213
+ eps=1e-6,
214
+ output_dim=768,
215
+ norm_input_visual=True,
216
+ class_range=24,
217
+ class_interval=4):
218
+ super().__init__()
219
+ self.dim = dim
220
+ self.ffn_dim = ffn_dim
221
+ self.num_heads = num_heads
222
+ self.window_size = window_size
223
+ self.qk_norm = qk_norm
224
+ self.cross_attn_norm = cross_attn_norm
225
+ self.eps = eps
226
+
227
+ # layers
228
+ self.norm1 = WanLayerNorm(dim, eps)
229
+ self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm, eps)
230
+ self.norm3 = WanLayerNorm(
231
+ dim, eps,
232
+ elementwise_affine=True) if cross_attn_norm else nn.Identity()
233
+ self.cross_attn = WanI2VCrossAttention(dim,
234
+ num_heads,
235
+ (-1, -1),
236
+ qk_norm,
237
+ eps)
238
+ self.norm2 = WanLayerNorm(dim, eps)
239
+ self.ffn = nn.Sequential(
240
+ nn.Linear(dim, ffn_dim), nn.GELU(approximate='tanh'),
241
+ nn.Linear(ffn_dim, dim))
242
+
243
+ # modulation
244
+ self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
245
+
246
+ # init audio module
247
+ self.audio_cross_attn = SingleStreamMutiAttention(
248
+ dim=dim,
249
+ encoder_hidden_states_dim=output_dim,
250
+ num_heads=num_heads,
251
+ qk_norm=False,
252
+ qkv_bias=True,
253
+ eps=eps,
254
+ norm_layer=WanRMSNorm,
255
+ class_range=class_range,
256
+ class_interval=class_interval
257
+ )
258
+ self.norm_x = WanLayerNorm(dim, eps, elementwise_affine=True) if norm_input_visual else nn.Identity()
259
+
260
+
261
+ def forward(
262
+ self,
263
+ x,
264
+ e,
265
+ seq_lens,
266
+ grid_sizes,
267
+ freqs,
268
+ context,
269
+ context_lens,
270
+ audio_embedding=None,
271
+ ref_target_masks=None,
272
+ human_num=None,
273
+ ):
274
+
275
+ dtype = x.dtype
276
+ assert e.dtype == torch.float32
277
+ with amp.autocast(dtype=torch.float32):
278
+ e = (self.modulation.to(e.device) + e).chunk(6, dim=1)
279
+ assert e[0].dtype == torch.float32
280
+
281
+ # self-attention
282
+ y, x_ref_attn_map = self.self_attn(
283
+ (self.norm1(x).float() * (1 + e[1]) + e[0]).type_as(x), seq_lens, grid_sizes,
284
+ freqs, ref_target_masks=ref_target_masks)
285
+ with amp.autocast(dtype=torch.float32):
286
+ x = x + y * e[2]
287
+
288
+ x = x.to(dtype)
289
+
290
+ # cross-attention of text
291
+ x = x + self.cross_attn(self.norm3(x), context, context_lens)
292
+
293
+ # cross attn of audio
294
+ x_a = self.audio_cross_attn(self.norm_x(x), encoder_hidden_states=audio_embedding,
295
+ shape=grid_sizes[0], x_ref_attn_map=x_ref_attn_map, human_num=human_num)
296
+ x = x + x_a
297
+
298
+ y = self.ffn((self.norm2(x).float() * (1 + e[4]) + e[3]).to(dtype))
299
+ with amp.autocast(dtype=torch.float32):
300
+ x = x + y * e[5]
301
+
302
+
303
+ x = x.to(dtype)
304
+
305
+ return x
306
+
307
+
308
+ class Head(nn.Module):
309
+
310
+ def __init__(self, dim, out_dim, patch_size, eps=1e-6):
311
+ super().__init__()
312
+ self.dim = dim
313
+ self.out_dim = out_dim
314
+ self.patch_size = patch_size
315
+ self.eps = eps
316
+
317
+ # layers
318
+ out_dim = math.prod(patch_size) * out_dim
319
+ self.norm = WanLayerNorm(dim, eps)
320
+ self.head = nn.Linear(dim, out_dim)
321
+
322
+ # modulation
323
+ self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
324
+
325
+ def forward(self, x, e):
326
+ r"""
327
+ Args:
328
+ x(Tensor): Shape [B, L1, C]
329
+ e(Tensor): Shape [B, C]
330
+ """
331
+ assert e.dtype == torch.float32
332
+ with amp.autocast(dtype=torch.float32):
333
+ e = (self.modulation.to(e.device) + e.unsqueeze(1)).chunk(2, dim=1)
334
+ x = (self.head(self.norm(x) * (1 + e[1]) + e[0]))
335
+ return x
336
+
337
+
338
+ class MLPProj(torch.nn.Module):
339
+
340
+ def __init__(self, in_dim, out_dim):
341
+ super().__init__()
342
+
343
+ self.proj = torch.nn.Sequential(
344
+ torch.nn.LayerNorm(in_dim), torch.nn.Linear(in_dim, in_dim),
345
+ torch.nn.GELU(), torch.nn.Linear(in_dim, out_dim),
346
+ torch.nn.LayerNorm(out_dim))
347
+
348
+ def forward(self, image_embeds):
349
+ clip_extra_context_tokens = self.proj(image_embeds)
350
+ return clip_extra_context_tokens
351
+
352
+
353
+ class AudioProjModel(ModelMixin, ConfigMixin):
354
+ def __init__(
355
+ self,
356
+ seq_len=5,
357
+ seq_len_vf=12,
358
+ blocks=12,
359
+ channels=768,
360
+ intermediate_dim=512,
361
+ output_dim=768,
362
+ context_tokens=32,
363
+ norm_output_audio=False,
364
+ ):
365
+ super().__init__()
366
+
367
+ self.seq_len = seq_len
368
+ self.blocks = blocks
369
+ self.channels = channels
370
+ self.input_dim = seq_len * blocks * channels
371
+ self.input_dim_vf = seq_len_vf * blocks * channels
372
+ self.intermediate_dim = intermediate_dim
373
+ self.context_tokens = context_tokens
374
+ self.output_dim = output_dim
375
+
376
+ # define multiple linear layers
377
+ self.proj1 = nn.Linear(self.input_dim, intermediate_dim)
378
+ self.proj1_vf = nn.Linear(self.input_dim_vf, intermediate_dim)
379
+ self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
380
+ self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)
381
+ self.norm = nn.LayerNorm(output_dim) if norm_output_audio else nn.Identity()
382
+
383
+ def forward(self, audio_embeds, audio_embeds_vf):
384
+ video_length = audio_embeds.shape[1] + audio_embeds_vf.shape[1]
385
+ B, _, _, S, C = audio_embeds.shape
386
+
387
+ # process audio of first frame
388
+ audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
389
+ batch_size, window_size, blocks, channels = audio_embeds.shape
390
+ audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels)
391
+
392
+ # process audio of latter frame
393
+ audio_embeds_vf = rearrange(audio_embeds_vf, "bz f w b c -> (bz f) w b c")
394
+ batch_size_vf, window_size_vf, blocks_vf, channels_vf = audio_embeds_vf.shape
395
+ audio_embeds_vf = audio_embeds_vf.view(batch_size_vf, window_size_vf * blocks_vf * channels_vf)
396
+
397
+ # first projection
398
+ audio_embeds = torch.relu(self.proj1(audio_embeds))
399
+ audio_embeds_vf = torch.relu(self.proj1_vf(audio_embeds_vf))
400
+ audio_embeds = rearrange(audio_embeds, "(bz f) c -> bz f c", bz=B)
401
+ audio_embeds_vf = rearrange(audio_embeds_vf, "(bz f) c -> bz f c", bz=B)
402
+ audio_embeds_c = torch.concat([audio_embeds, audio_embeds_vf], dim=1)
403
+ batch_size_c, N_t, C_a = audio_embeds_c.shape
404
+ audio_embeds_c = audio_embeds_c.view(batch_size_c*N_t, C_a)
405
+
406
+ # second projection
407
+ audio_embeds_c = torch.relu(self.proj2(audio_embeds_c))
408
+
409
+ context_tokens = self.proj3(audio_embeds_c).reshape(batch_size_c*N_t, self.context_tokens, self.output_dim)
410
+
411
+ # normalization and reshape
412
+ context_tokens = self.norm(context_tokens)
413
+ context_tokens = rearrange(context_tokens, "(bz f) m c -> bz f m c", f=video_length)
414
+
415
+ return context_tokens
416
+
417
+
418
+ class WanModel(ModelMixin, ConfigMixin):
419
+ r"""
420
+ Wan diffusion backbone supporting both text-to-video and image-to-video.
421
+ """
422
+
423
+ ignore_for_config = [
424
+ 'patch_size', 'cross_attn_norm', 'qk_norm', 'text_dim', 'window_size'
425
+ ]
426
+ _no_split_modules = ['WanAttentionBlock']
427
+
428
+ @register_to_config
429
+ def __init__(self,
430
+ model_type='i2v',
431
+ patch_size=(1, 2, 2),
432
+ text_len=512,
433
+ in_dim=16,
434
+ dim=2048,
435
+ ffn_dim=8192,
436
+ freq_dim=256,
437
+ text_dim=4096,
438
+ out_dim=16,
439
+ num_heads=16,
440
+ num_layers=32,
441
+ window_size=(-1, -1),
442
+ qk_norm=True,
443
+ cross_attn_norm=True,
444
+ eps=1e-6,
445
+ # audio params
446
+ audio_window=5,
447
+ intermediate_dim=512,
448
+ output_dim=768,
449
+ context_tokens=32,
450
+ vae_scale=4, # vae timedownsample scale
451
+
452
+ norm_input_visual=True,
453
+ norm_output_audio=True):
454
+ super().__init__()
455
+
456
+ assert model_type == 'i2v', 'The MultiTalk model requires model_type to be i2v.'
457
+ self.model_type = model_type
458
+
459
+ self.patch_size = patch_size
460
+ self.text_len = text_len
461
+ self.in_dim = in_dim
462
+ self.dim = dim
463
+ self.ffn_dim = ffn_dim
464
+ self.freq_dim = freq_dim
465
+ self.text_dim = text_dim
466
+ self.out_dim = out_dim
467
+ self.num_heads = num_heads
468
+ self.num_layers = num_layers
469
+ self.window_size = window_size
470
+ self.qk_norm = qk_norm
471
+ self.cross_attn_norm = cross_attn_norm
472
+ self.eps = eps
473
+
474
+
475
+ self.norm_output_audio = norm_output_audio
476
+ self.audio_window = audio_window
477
+ self.intermediate_dim = intermediate_dim
478
+ self.vae_scale = vae_scale
479
+
480
+
481
+ # embeddings
482
+ self.patch_embedding = nn.Conv3d(
483
+ in_dim, dim, kernel_size=patch_size, stride=patch_size)
484
+ self.text_embedding = nn.Sequential(
485
+ nn.Linear(text_dim, dim), nn.GELU(approximate='tanh'),
486
+ nn.Linear(dim, dim))
487
+
488
+ self.time_embedding = nn.Sequential(
489
+ nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
490
+ self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
491
+
492
+ # blocks
493
+ cross_attn_type = 'i2v_cross_attn'
494
+ self.blocks = nn.ModuleList([
495
+ WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
496
+ window_size, qk_norm, cross_attn_norm, eps,
497
+ output_dim=output_dim, norm_input_visual=norm_input_visual)
498
+ for _ in range(num_layers)
499
+ ])
500
+
501
+ # head
502
+ self.head = Head(dim, out_dim, patch_size, eps)
503
+
504
+ assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
505
+ d = dim // num_heads
506
+ self.freqs = torch.cat([
507
+ rope_params(1024, d - 4 * (d // 6)),
508
+ rope_params(1024, 2 * (d // 6)),
509
+ rope_params(1024, 2 * (d // 6))
510
+ ],
511
+ dim=1)
512
+
513
+ if model_type == 'i2v':
514
+ self.img_emb = MLPProj(1280, dim)
515
+ else:
516
+ raise NotImplementedError('Unsupported model type.')
517
+
518
+ # init audio adapter
519
+ self.audio_proj = AudioProjModel(
520
+ seq_len=audio_window,
521
+ seq_len_vf=audio_window+vae_scale-1,
522
+ intermediate_dim=intermediate_dim,
523
+ output_dim=output_dim,
524
+ context_tokens=context_tokens,
525
+ norm_output_audio=norm_output_audio,
526
+ )
527
+
528
+
529
+ # initialize weights
530
+ self.init_weights()
531
+
532
+ def teacache_init(
533
+ self,
534
+ use_ret_steps=True,
535
+ teacache_thresh=0.2,
536
+ sample_steps=40,
537
+ model_scale='multitalk-480',
538
+ ):
539
+ print("teacache_init")
540
+ self.enable_teacache = True
541
+
542
+ self.__class__.cnt = 0
543
+ self.__class__.num_steps = sample_steps*3
544
+ self.__class__.teacache_thresh = teacache_thresh
545
+ self.__class__.accumulated_rel_l1_distance_even = 0
546
+ self.__class__.accumulated_rel_l1_distance_odd = 0
547
+ self.__class__.previous_e0_even = None
548
+ self.__class__.previous_e0_odd = None
549
+ self.__class__.previous_residual_even = None
550
+ self.__class__.previous_residual_odd = None
551
+ self.__class__.use_ret_steps = use_ret_steps
552
+
553
+ if use_ret_steps:
554
+ if model_scale == 'multitalk-480':
555
+ self.__class__.coefficients = [ 2.57151496e+05, -3.54229917e+04, 1.40286849e+03, -1.35890334e+01, 1.32517977e-01]
556
+ if model_scale == 'multitalk-720':
557
+ self.__class__.coefficients = [ 8.10705460e+03, 2.13393892e+03, -3.72934672e+02, 1.66203073e+01, -4.17769401e-02]
558
+ self.__class__.ret_steps = 5*3
559
+ self.__class__.cutoff_steps = sample_steps*3
560
+ else:
561
+ if model_scale == 'multitalk-480':
562
+ self.__class__.coefficients = [-3.02331670e+02, 2.23948934e+02, -5.25463970e+01, 5.87348440e+00, -2.01973289e-01]
563
+
564
+ if model_scale == 'multitalk-720':
565
+ self.__class__.coefficients = [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
566
+ self.__class__.ret_steps = 1*3
567
+ self.__class__.cutoff_steps = sample_steps*3 - 3
568
+ print("teacache_init done")
569
+
570
+ def disable_teacache(self):
571
+ self.enable_teacache = False
572
+
573
+ def forward(
574
+ self,
575
+ x,
576
+ t,
577
+ context,
578
+ seq_len,
579
+ clip_fea=None,
580
+ y=None,
581
+ audio=None,
582
+ ref_target_masks=None,
583
+ ):
584
+ assert clip_fea is not None and y is not None
585
+
586
+ _, T, H, W = x[0].shape
587
+ N_t = T // self.patch_size[0]
588
+ N_h = H // self.patch_size[1]
589
+ N_w = W // self.patch_size[2]
590
+
591
+ if y is not None:
592
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
593
+ x[0] = x[0].to(context[0].dtype)
594
+
595
+ # embeddings
596
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
597
+ grid_sizes = torch.stack(
598
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
599
+ x = [u.flatten(2).transpose(1, 2) for u in x]
600
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
601
+ assert seq_lens.max() <= seq_len
602
+ x = torch.cat([
603
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
604
+ dim=1) for u in x
605
+ ])
606
+
607
+ # time embeddings
608
+ with amp.autocast(dtype=torch.float32):
609
+ e = self.time_embedding(
610
+ sinusoidal_embedding_1d(self.freq_dim, t).float())
611
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
612
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
613
+
614
+ # text embedding
615
+ context_lens = None
616
+ context = self.text_embedding(
617
+ torch.stack([
618
+ torch.cat(
619
+ [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
620
+ for u in context
621
+ ]))
622
+
623
+ # clip embedding
624
+ if clip_fea is not None:
625
+ context_clip = self.img_emb(clip_fea)
626
+ context = torch.concat([context_clip, context], dim=1).to(x.dtype)
627
+
628
+
629
+ audio_cond = audio.to(device=x.device, dtype=x.dtype)
630
+ first_frame_audio_emb_s = audio_cond[:, :1, ...]
631
+ latter_frame_audio_emb = audio_cond[:, 1:, ...]
632
+ latter_frame_audio_emb = rearrange(latter_frame_audio_emb, "b (n_t n) w s c -> b n_t n w s c", n=self.vae_scale)
633
+ middle_index = self.audio_window // 2
634
+ latter_first_frame_audio_emb = latter_frame_audio_emb[:, :, :1, :middle_index+1, ...]
635
+ latter_first_frame_audio_emb = rearrange(latter_first_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
636
+ latter_last_frame_audio_emb = latter_frame_audio_emb[:, :, -1:, middle_index:, ...]
637
+ latter_last_frame_audio_emb = rearrange(latter_last_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
638
+ latter_middle_frame_audio_emb = latter_frame_audio_emb[:, :, 1:-1, middle_index:middle_index+1, ...]
639
+ latter_middle_frame_audio_emb = rearrange(latter_middle_frame_audio_emb, "b n_t n w s c -> b n_t (n w) s c")
640
+ latter_frame_audio_emb_s = torch.concat([latter_first_frame_audio_emb, latter_middle_frame_audio_emb, latter_last_frame_audio_emb], dim=2)
641
+ audio_embedding = self.audio_proj(first_frame_audio_emb_s, latter_frame_audio_emb_s)
642
+ human_num = len(audio_embedding)
643
+ audio_embedding = torch.concat(audio_embedding.split(1), dim=2).to(x.dtype)
644
+
645
+
646
+ # convert ref_target_masks to token_ref_target_masks
647
+ if ref_target_masks is not None:
648
+ ref_target_masks = ref_target_masks.unsqueeze(0).to(torch.float32)
649
+ token_ref_target_masks = nn.functional.interpolate(ref_target_masks, size=(N_h, N_w), mode='nearest')
650
+ token_ref_target_masks = token_ref_target_masks.squeeze(0)
651
+ token_ref_target_masks = (token_ref_target_masks > 0)
652
+ token_ref_target_masks = token_ref_target_masks.view(token_ref_target_masks.shape[0], -1)
653
+ token_ref_target_masks = token_ref_target_masks.to(x.dtype)
654
+
655
+ # teacache
656
+ if self.enable_teacache:
657
+ modulated_inp = e0 if self.use_ret_steps else e
658
+ if self.cnt%3==0: # cond
659
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
660
+ should_calc_cond = True
661
+ self.accumulated_rel_l1_distance_cond = 0
662
+ else:
663
+ rescale_func = np.poly1d(self.coefficients)
664
+ self.accumulated_rel_l1_distance_cond += rescale_func(((modulated_inp-self.previous_e0_cond).abs().mean() / self.previous_e0_cond.abs().mean()).cpu().item())
665
+ if self.accumulated_rel_l1_distance_cond < self.teacache_thresh:
666
+ should_calc_cond = False
667
+ else:
668
+ should_calc_cond = True
669
+ self.accumulated_rel_l1_distance_cond = 0
670
+ self.previous_e0_cond = modulated_inp.clone()
671
+ elif self.cnt%3==1: # drop_text
672
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
673
+ should_calc_drop_text = True
674
+ self.accumulated_rel_l1_distance_drop_text = 0
675
+ else:
676
+ rescale_func = np.poly1d(self.coefficients)
677
+ self.accumulated_rel_l1_distance_drop_text += rescale_func(((modulated_inp-self.previous_e0_drop_text).abs().mean() / self.previous_e0_drop_text.abs().mean()).cpu().item())
678
+ if self.accumulated_rel_l1_distance_drop_text < self.teacache_thresh:
679
+ should_calc_drop_text = False
680
+ else:
681
+ should_calc_drop_text = True
682
+ self.accumulated_rel_l1_distance_drop_text = 0
683
+ self.previous_e0_drop_text = modulated_inp.clone()
684
+ else: # uncond
685
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
686
+ should_calc_uncond = True
687
+ self.accumulated_rel_l1_distance_uncond = 0
688
+ else:
689
+ rescale_func = np.poly1d(self.coefficients)
690
+ self.accumulated_rel_l1_distance_uncond += rescale_func(((modulated_inp-self.previous_e0_uncond).abs().mean() / self.previous_e0_uncond.abs().mean()).cpu().item())
691
+ if self.accumulated_rel_l1_distance_uncond < self.teacache_thresh:
692
+ should_calc_uncond = False
693
+ else:
694
+ should_calc_uncond = True
695
+ self.accumulated_rel_l1_distance_uncond = 0
696
+ self.previous_e0_uncond = modulated_inp.clone()
697
+
698
+ # arguments
699
+ kwargs = dict(
700
+ e=e0,
701
+ seq_lens=seq_lens,
702
+ grid_sizes=grid_sizes,
703
+ freqs=self.freqs,
704
+ context=context,
705
+ context_lens=context_lens,
706
+ audio_embedding=audio_embedding,
707
+ ref_target_masks=token_ref_target_masks,
708
+ human_num=human_num,
709
+ )
710
+ if self.enable_teacache:
711
+ if self.cnt%3==0:
712
+ if not should_calc_cond:
713
+ x += self.previous_residual_cond
714
+ else:
715
+ ori_x = x.clone()
716
+ for block in self.blocks:
717
+ x = block(x, **kwargs)
718
+ self.previous_residual_cond = x - ori_x
719
+ elif self.cnt%3==1:
720
+ if not should_calc_drop_text:
721
+ x += self.previous_residual_drop_text
722
+ else:
723
+ ori_x = x.clone()
724
+ for block in self.blocks:
725
+ x = block(x, **kwargs)
726
+ self.previous_residual_drop_text = x - ori_x
727
+ else:
728
+ if not should_calc_uncond:
729
+ x += self.previous_residual_uncond
730
+ else:
731
+ ori_x = x.clone()
732
+ for block in self.blocks:
733
+ x = block(x, **kwargs)
734
+ self.previous_residual_uncond = x - ori_x
735
+ else:
736
+ for block in self.blocks:
737
+ x = block(x, **kwargs)
738
+
739
+ # head
740
+ x = self.head(x, e)
741
+
742
+ # unpatchify
743
+ x = self.unpatchify(x, grid_sizes)
744
+ if self.enable_teacache:
745
+ self.cnt += 1
746
+ if self.cnt >= self.num_steps:
747
+ self.cnt = 0
748
+
749
+ return torch.stack(x).float()
750
+
751
+
752
+ def unpatchify(self, x, grid_sizes):
753
+ r"""
754
+ Reconstruct video tensors from patch embeddings.
755
+
756
+ Args:
757
+ x (List[Tensor]):
758
+ List of patchified features, each with shape [L, C_out * prod(patch_size)]
759
+ grid_sizes (Tensor):
760
+ Original spatial-temporal grid dimensions before patching,
761
+ shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
762
+
763
+ Returns:
764
+ List[Tensor]:
765
+ Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
766
+ """
767
+
768
+ c = self.out_dim
769
+ out = []
770
+ for u, v in zip(x, grid_sizes.tolist()):
771
+ u = u[:math.prod(v)].view(*v, *self.patch_size, c)
772
+ u = torch.einsum('fhwpqrc->cfphqwr', u)
773
+ u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
774
+ out.append(u)
775
+ return out
776
+
777
+ def init_weights(self):
778
+ r"""
779
+ Initialize model parameters using Xavier initialization.
780
+ """
781
+
782
+ # basic init
783
+ for m in self.modules():
784
+ if isinstance(m, nn.Linear):
785
+ nn.init.xavier_uniform_(m.weight)
786
+ if m.bias is not None:
787
+ nn.init.zeros_(m.bias)
788
+
789
+ # init embeddings
790
+ nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
791
+ for m in self.text_embedding.modules():
792
+ if isinstance(m, nn.Linear):
793
+ nn.init.normal_(m.weight, std=.02)
794
+ for m in self.time_embedding.modules():
795
+ if isinstance(m, nn.Linear):
796
+ nn.init.normal_(m.weight, std=.02)
797
+
798
+ # init output layer
799
+ nn.init.zeros_(self.head.head.weight)
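The TeaCache branch in forward() above skips the transformer blocks whenever an accumulated, polynomially rescaled relative L1 distance between successive modulation inputs stays below teacache_thresh, and reuses the cached residual instead. A standalone sketch of that decision rule (not part of this commit; the tensors, step count, and threshold are illustrative stand-ins):

import numpy as np
import torch

# coefficients from the 'multitalk-480' use_ret_steps branch above
rescale = np.poly1d([2.57151496e+05, -3.54229917e+04, 1.40286849e+03,
                     -1.35890334e+01, 1.32517977e-01])
teacache_thresh = 0.2

previous_e0, accumulated = None, 0.0
for step in range(6):
    e0 = torch.randn(1, 6, 2048)            # stand-in for the modulated input
    if previous_e0 is None:
        should_calc = True                   # warm-up steps always run the blocks
    else:
        rel_l1 = ((e0 - previous_e0).abs().mean() / previous_e0.abs().mean()).item()
        accumulated += rescale(rel_l1)
        should_calc = accumulated >= teacache_thresh
        if should_calc:
            accumulated = 0.0                # reset once the blocks are recomputed
    previous_e0 = e0.clone()
    print(step, 'run blocks' if should_calc else 'reuse cached residual')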
wan/modules/t5.py ADDED
@@ -0,0 +1,513 @@
1
+ # Modified from transformers.models.t5.modeling_t5
2
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
+ import logging
4
+ import math
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from .tokenizers import HuggingfaceTokenizer
11
+
12
+ __all__ = [
13
+ 'T5Model',
14
+ 'T5Encoder',
15
+ 'T5Decoder',
16
+ 'T5EncoderModel',
17
+ ]
18
+
19
+
20
+ def fp16_clamp(x):
21
+ if x.dtype == torch.float16 and torch.isinf(x).any():
22
+ clamp = torch.finfo(x.dtype).max - 1000
23
+ x = torch.clamp(x, min=-clamp, max=clamp)
24
+ return x
25
+
26
+
27
+ def init_weights(m):
28
+ if isinstance(m, T5LayerNorm):
29
+ nn.init.ones_(m.weight)
30
+ elif isinstance(m, T5Model):
31
+ nn.init.normal_(m.token_embedding.weight, std=1.0)
32
+ elif isinstance(m, T5FeedForward):
33
+ nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
34
+ nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
35
+ nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
36
+ elif isinstance(m, T5Attention):
37
+ nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5)
38
+ nn.init.normal_(m.k.weight, std=m.dim**-0.5)
39
+ nn.init.normal_(m.v.weight, std=m.dim**-0.5)
40
+ nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5)
41
+ elif isinstance(m, T5RelativeEmbedding):
42
+ nn.init.normal_(
43
+ m.embedding.weight, std=(2 * m.num_buckets * m.num_heads)**-0.5)
44
+
45
+
46
+ class GELU(nn.Module):
47
+
48
+ def forward(self, x):
49
+ return 0.5 * x * (1.0 + torch.tanh(
50
+ math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
51
+
52
+
53
+ class T5LayerNorm(nn.Module):
54
+
55
+ def __init__(self, dim, eps=1e-6):
56
+ super(T5LayerNorm, self).__init__()
57
+ self.dim = dim
58
+ self.eps = eps
59
+ self.weight = nn.Parameter(torch.ones(dim))
60
+
61
+ def forward(self, x):
62
+ x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) +
63
+ self.eps)
64
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
65
+ x = x.type_as(self.weight)
66
+ return self.weight * x
67
+
68
+
69
+ class T5Attention(nn.Module):
70
+
71
+ def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
72
+ assert dim_attn % num_heads == 0
73
+ super(T5Attention, self).__init__()
74
+ self.dim = dim
75
+ self.dim_attn = dim_attn
76
+ self.num_heads = num_heads
77
+ self.head_dim = dim_attn // num_heads
78
+
79
+ # layers
80
+ self.q = nn.Linear(dim, dim_attn, bias=False)
81
+ self.k = nn.Linear(dim, dim_attn, bias=False)
82
+ self.v = nn.Linear(dim, dim_attn, bias=False)
83
+ self.o = nn.Linear(dim_attn, dim, bias=False)
84
+ self.dropout = nn.Dropout(dropout)
85
+
86
+ def forward(self, x, context=None, mask=None, pos_bias=None):
87
+ """
88
+ x: [B, L1, C].
89
+ context: [B, L2, C] or None.
90
+ mask: [B, L2] or [B, L1, L2] or None.
91
+ """
92
+ # check inputs
93
+ context = x if context is None else context
94
+ b, n, c = x.size(0), self.num_heads, self.head_dim
95
+
96
+ # compute query, key, value
97
+ q = self.q(x).view(b, -1, n, c)
98
+ k = self.k(context).view(b, -1, n, c)
99
+ v = self.v(context).view(b, -1, n, c)
100
+
101
+ # attention bias
102
+ attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
103
+ if pos_bias is not None:
104
+ attn_bias += pos_bias
105
+ if mask is not None:
106
+ assert mask.ndim in [2, 3]
107
+ mask = mask.view(b, 1, 1,
108
+ -1) if mask.ndim == 2 else mask.unsqueeze(1)
109
+ attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)
110
+
111
+ # compute attention (T5 does not use scaling)
112
+ attn = torch.einsum('binc,bjnc->bnij', q, k) + attn_bias
113
+ attn = F.softmax(attn.float(), dim=-1).type_as(attn)
114
+ x = torch.einsum('bnij,bjnc->binc', attn, v)
115
+
116
+ # output
117
+ x = x.reshape(b, -1, n * c)
118
+ x = self.o(x)
119
+ x = self.dropout(x)
120
+ return x
121
+
122
+
123
+ class T5FeedForward(nn.Module):
124
+
125
+ def __init__(self, dim, dim_ffn, dropout=0.1):
126
+ super(T5FeedForward, self).__init__()
127
+ self.dim = dim
128
+ self.dim_ffn = dim_ffn
129
+
130
+ # layers
131
+ self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
132
+ self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
133
+ self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
134
+ self.dropout = nn.Dropout(dropout)
135
+
136
+ def forward(self, x):
137
+ x = self.fc1(x) * self.gate(x)
138
+ x = self.dropout(x)
139
+ x = self.fc2(x)
140
+ x = self.dropout(x)
141
+ return x
142
+
143
+
144
+ class T5SelfAttention(nn.Module):
145
+
146
+ def __init__(self,
147
+ dim,
148
+ dim_attn,
149
+ dim_ffn,
150
+ num_heads,
151
+ num_buckets,
152
+ shared_pos=True,
153
+ dropout=0.1):
154
+ super(T5SelfAttention, self).__init__()
155
+ self.dim = dim
156
+ self.dim_attn = dim_attn
157
+ self.dim_ffn = dim_ffn
158
+ self.num_heads = num_heads
159
+ self.num_buckets = num_buckets
160
+ self.shared_pos = shared_pos
161
+
162
+ # layers
163
+ self.norm1 = T5LayerNorm(dim)
164
+ self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
165
+ self.norm2 = T5LayerNorm(dim)
166
+ self.ffn = T5FeedForward(dim, dim_ffn, dropout)
167
+ self.pos_embedding = None if shared_pos else T5RelativeEmbedding(
168
+ num_buckets, num_heads, bidirectional=True)
169
+
170
+ def forward(self, x, mask=None, pos_bias=None):
171
+ e = pos_bias if self.shared_pos else self.pos_embedding(
172
+ x.size(1), x.size(1))
173
+ x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
174
+ x = fp16_clamp(x + self.ffn(self.norm2(x)))
175
+ return x
176
+
177
+
178
+ class T5CrossAttention(nn.Module):
179
+
180
+ def __init__(self,
181
+ dim,
182
+ dim_attn,
183
+ dim_ffn,
184
+ num_heads,
185
+ num_buckets,
186
+ shared_pos=True,
187
+ dropout=0.1):
188
+ super(T5CrossAttention, self).__init__()
189
+ self.dim = dim
190
+ self.dim_attn = dim_attn
191
+ self.dim_ffn = dim_ffn
192
+ self.num_heads = num_heads
193
+ self.num_buckets = num_buckets
194
+ self.shared_pos = shared_pos
195
+
196
+ # layers
197
+ self.norm1 = T5LayerNorm(dim)
198
+ self.self_attn = T5Attention(dim, dim_attn, num_heads, dropout)
199
+ self.norm2 = T5LayerNorm(dim)
200
+ self.cross_attn = T5Attention(dim, dim_attn, num_heads, dropout)
201
+ self.norm3 = T5LayerNorm(dim)
202
+ self.ffn = T5FeedForward(dim, dim_ffn, dropout)
203
+ self.pos_embedding = None if shared_pos else T5RelativeEmbedding(
204
+ num_buckets, num_heads, bidirectional=False)
205
+
206
+ def forward(self,
207
+ x,
208
+ mask=None,
209
+ encoder_states=None,
210
+ encoder_mask=None,
211
+ pos_bias=None):
212
+ e = pos_bias if self.shared_pos else self.pos_embedding(
213
+ x.size(1), x.size(1))
214
+ x = fp16_clamp(x + self.self_attn(self.norm1(x), mask=mask, pos_bias=e))
215
+ x = fp16_clamp(x + self.cross_attn(
216
+ self.norm2(x), context=encoder_states, mask=encoder_mask))
217
+ x = fp16_clamp(x + self.ffn(self.norm3(x)))
218
+ return x
219
+
220
+
221
+ class T5RelativeEmbedding(nn.Module):
222
+
223
+ def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
224
+ super(T5RelativeEmbedding, self).__init__()
225
+ self.num_buckets = num_buckets
226
+ self.num_heads = num_heads
227
+ self.bidirectional = bidirectional
228
+ self.max_dist = max_dist
229
+
230
+ # layers
231
+ self.embedding = nn.Embedding(num_buckets, num_heads)
232
+
233
+ def forward(self, lq, lk):
234
+ device = self.embedding.weight.device
235
+ # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
236
+ # torch.arange(lq).unsqueeze(1).to(device)
237
+ rel_pos = torch.arange(lk, device=device).unsqueeze(0) - \
238
+ torch.arange(lq, device=device).unsqueeze(1)
239
+ rel_pos = self._relative_position_bucket(rel_pos)
240
+ rel_pos_embeds = self.embedding(rel_pos)
241
+ rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(
242
+ 0) # [1, N, Lq, Lk]
243
+ return rel_pos_embeds.contiguous()
244
+
245
+ def _relative_position_bucket(self, rel_pos):
246
+ # preprocess
247
+ if self.bidirectional:
248
+ num_buckets = self.num_buckets // 2
249
+ rel_buckets = (rel_pos > 0).long() * num_buckets
250
+ rel_pos = torch.abs(rel_pos)
251
+ else:
252
+ num_buckets = self.num_buckets
253
+ rel_buckets = 0
254
+ rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))
255
+
256
+ # embeddings for small and large positions
257
+ max_exact = num_buckets // 2
258
+ rel_pos_large = max_exact + (torch.log(rel_pos.float() / max_exact) /
259
+ math.log(self.max_dist / max_exact) *
260
+ (num_buckets - max_exact)).long()
261
+ rel_pos_large = torch.min(
262
+ rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1))
263
+ rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
264
+ return rel_buckets
265
+
266
+
267
+ class T5Encoder(nn.Module):
268
+
269
+ def __init__(self,
270
+ vocab,
271
+ dim,
272
+ dim_attn,
273
+ dim_ffn,
274
+ num_heads,
275
+ num_layers,
276
+ num_buckets,
277
+ shared_pos=True,
278
+ dropout=0.1):
279
+ super(T5Encoder, self).__init__()
280
+ self.dim = dim
281
+ self.dim_attn = dim_attn
282
+ self.dim_ffn = dim_ffn
283
+ self.num_heads = num_heads
284
+ self.num_layers = num_layers
285
+ self.num_buckets = num_buckets
286
+ self.shared_pos = shared_pos
287
+
288
+ # layers
289
+ self.token_embedding = vocab if isinstance(vocab, nn.Embedding) \
290
+ else nn.Embedding(vocab, dim)
291
+ self.pos_embedding = T5RelativeEmbedding(
292
+ num_buckets, num_heads, bidirectional=True) if shared_pos else None
293
+ self.dropout = nn.Dropout(dropout)
294
+ self.blocks = nn.ModuleList([
295
+ T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
296
+ shared_pos, dropout) for _ in range(num_layers)
297
+ ])
298
+ self.norm = T5LayerNorm(dim)
299
+
300
+ # initialize weights
301
+ self.apply(init_weights)
302
+
303
+ def forward(self, ids, mask=None):
304
+ x = self.token_embedding(ids)
305
+ x = self.dropout(x)
306
+ e = self.pos_embedding(x.size(1),
307
+ x.size(1)) if self.shared_pos else None
308
+ for block in self.blocks:
309
+ x = block(x, mask, pos_bias=e)
310
+ x = self.norm(x)
311
+ x = self.dropout(x)
312
+ return x
313
+
314
+
315
+ class T5Decoder(nn.Module):
316
+
317
+ def __init__(self,
318
+ vocab,
319
+ dim,
320
+ dim_attn,
321
+ dim_ffn,
322
+ num_heads,
323
+ num_layers,
324
+ num_buckets,
325
+ shared_pos=True,
326
+ dropout=0.1):
327
+ super(T5Decoder, self).__init__()
328
+ self.dim = dim
329
+ self.dim_attn = dim_attn
330
+ self.dim_ffn = dim_ffn
331
+ self.num_heads = num_heads
332
+ self.num_layers = num_layers
333
+ self.num_buckets = num_buckets
334
+ self.shared_pos = shared_pos
335
+
336
+ # layers
337
+ self.token_embedding = vocab if isinstance(vocab, nn.Embedding) \
338
+ else nn.Embedding(vocab, dim)
339
+ self.pos_embedding = T5RelativeEmbedding(
340
+ num_buckets, num_heads, bidirectional=False) if shared_pos else None
341
+ self.dropout = nn.Dropout(dropout)
342
+ self.blocks = nn.ModuleList([
343
+ T5CrossAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
344
+ shared_pos, dropout) for _ in range(num_layers)
345
+ ])
346
+ self.norm = T5LayerNorm(dim)
347
+
348
+ # initialize weights
349
+ self.apply(init_weights)
350
+
351
+ def forward(self, ids, mask=None, encoder_states=None, encoder_mask=None):
352
+ b, s = ids.size()
353
+
354
+ # causal mask
355
+ if mask is None:
356
+ mask = torch.tril(torch.ones(1, s, s).to(ids.device))
357
+ elif mask.ndim == 2:
358
+ mask = torch.tril(mask.unsqueeze(1).expand(-1, s, -1))
359
+
360
+ # layers
361
+ x = self.token_embedding(ids)
362
+ x = self.dropout(x)
363
+ e = self.pos_embedding(x.size(1),
364
+ x.size(1)) if self.shared_pos else None
365
+ for block in self.blocks:
366
+ x = block(x, mask, encoder_states, encoder_mask, pos_bias=e)
367
+ x = self.norm(x)
368
+ x = self.dropout(x)
369
+ return x
370
+
371
+
372
+ class T5Model(nn.Module):
373
+
374
+ def __init__(self,
375
+ vocab_size,
376
+ dim,
377
+ dim_attn,
378
+ dim_ffn,
379
+ num_heads,
380
+ encoder_layers,
381
+ decoder_layers,
382
+ num_buckets,
383
+ shared_pos=True,
384
+ dropout=0.1):
385
+ super(T5Model, self).__init__()
386
+ self.vocab_size = vocab_size
387
+ self.dim = dim
388
+ self.dim_attn = dim_attn
389
+ self.dim_ffn = dim_ffn
390
+ self.num_heads = num_heads
391
+ self.encoder_layers = encoder_layers
392
+ self.decoder_layers = decoder_layers
393
+ self.num_buckets = num_buckets
394
+
395
+ # layers
396
+ self.token_embedding = nn.Embedding(vocab_size, dim)
397
+ self.encoder = T5Encoder(self.token_embedding, dim, dim_attn, dim_ffn,
398
+ num_heads, encoder_layers, num_buckets,
399
+ shared_pos, dropout)
400
+ self.decoder = T5Decoder(self.token_embedding, dim, dim_attn, dim_ffn,
401
+ num_heads, decoder_layers, num_buckets,
402
+ shared_pos, dropout)
403
+ self.head = nn.Linear(dim, vocab_size, bias=False)
404
+
405
+ # initialize weights
406
+ self.apply(init_weights)
407
+
408
+ def forward(self, encoder_ids, encoder_mask, decoder_ids, decoder_mask):
409
+ x = self.encoder(encoder_ids, encoder_mask)
410
+ x = self.decoder(decoder_ids, decoder_mask, x, encoder_mask)
411
+ x = self.head(x)
412
+ return x
413
+
414
+
415
+ def _t5(name,
416
+ encoder_only=False,
417
+ decoder_only=False,
418
+ return_tokenizer=False,
419
+ tokenizer_kwargs={},
420
+ dtype=torch.float32,
421
+ device='cpu',
422
+ **kwargs):
423
+ # sanity check
424
+ assert not (encoder_only and decoder_only)
425
+
426
+ # params
427
+ if encoder_only:
428
+ model_cls = T5Encoder
429
+ kwargs['vocab'] = kwargs.pop('vocab_size')
430
+ kwargs['num_layers'] = kwargs.pop('encoder_layers')
431
+ _ = kwargs.pop('decoder_layers')
432
+ elif decoder_only:
433
+ model_cls = T5Decoder
434
+ kwargs['vocab'] = kwargs.pop('vocab_size')
435
+ kwargs['num_layers'] = kwargs.pop('decoder_layers')
436
+ _ = kwargs.pop('encoder_layers')
437
+ else:
438
+ model_cls = T5Model
439
+
440
+ # init model
441
+ with torch.device(device):
442
+ model = model_cls(**kwargs)
443
+
444
+ # set device
445
+ model = model.to(dtype=dtype, device=device)
446
+
447
+ # init tokenizer
448
+ if return_tokenizer:
449
+ from .tokenizers import HuggingfaceTokenizer
450
+ tokenizer = HuggingfaceTokenizer(f'google/{name}', **tokenizer_kwargs)
451
+ return model, tokenizer
452
+ else:
453
+ return model
454
+
455
+
456
+ def umt5_xxl(**kwargs):
457
+ cfg = dict(
458
+ vocab_size=256384,
459
+ dim=4096,
460
+ dim_attn=4096,
461
+ dim_ffn=10240,
462
+ num_heads=64,
463
+ encoder_layers=24,
464
+ decoder_layers=24,
465
+ num_buckets=32,
466
+ shared_pos=False,
467
+ dropout=0.1)
468
+ cfg.update(**kwargs)
469
+ return _t5('umt5-xxl', **cfg)
470
+
471
+
472
+ class T5EncoderModel:
473
+
474
+ def __init__(
475
+ self,
476
+ text_len,
477
+ dtype=torch.bfloat16,
478
+ device=torch.cuda.current_device(),
479
+ checkpoint_path=None,
480
+ tokenizer_path=None,
481
+ shard_fn=None,
482
+ ):
483
+ self.text_len = text_len
484
+ self.dtype = dtype
485
+ self.device = device
486
+ self.checkpoint_path = checkpoint_path
487
+ self.tokenizer_path = tokenizer_path
488
+
489
+ # init model
490
+ model = umt5_xxl(
491
+ encoder_only=True,
492
+ return_tokenizer=False,
493
+ dtype=dtype,
494
+ device=device).eval().requires_grad_(False)
495
+ logging.info(f'loading {checkpoint_path}')
496
+ model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
497
+ self.model = model
498
+ if shard_fn is not None:
499
+ self.model = shard_fn(self.model, sync_module_states=False)
500
+ else:
501
+ self.model.to(self.device)
502
+ # init tokenizer
503
+ self.tokenizer = HuggingfaceTokenizer(
504
+ name=tokenizer_path, seq_len=text_len, clean='whitespace')
505
+
506
+ def __call__(self, texts, device):
507
+ ids, mask = self.tokenizer(
508
+ texts, return_mask=True, add_special_tokens=True)
509
+ ids = ids.to(device)
510
+ mask = mask.to(device)
511
+ seq_lens = mask.gt(0).sum(dim=1).long()
512
+ context = self.model(ids, mask)
513
+ return [u[:v] for u, v in zip(context, seq_lens)]
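T5Attention adds a learned relative-position bias instead of absolute positions; _relative_position_bucket maps signed query-key offsets into a small number of buckets, exact for nearby offsets and logarithmic for distant ones. A standalone sketch of that bucketing on a tiny grid (not part of this commit; the bucket count and max_dist are illustrative):

import math
import torch

def relative_position_bucket(rel_pos, num_buckets=8, bidirectional=True, max_dist=16):
    # mirrors T5RelativeEmbedding._relative_position_bucket above
    if bidirectional:
        num_buckets //= 2
        rel_buckets = (rel_pos > 0).long() * num_buckets
        rel_pos = torch.abs(rel_pos)
    else:
        rel_buckets = 0
        rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))
    max_exact = num_buckets // 2
    rel_pos_large = max_exact + (torch.log(rel_pos.float() / max_exact) /
                                 math.log(max_dist / max_exact) *
                                 (num_buckets - max_exact)).long()
    rel_pos_large = torch.min(rel_pos_large,
                              torch.full_like(rel_pos_large, num_buckets - 1))
    return rel_buckets + torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)

rel_pos = torch.arange(6).unsqueeze(0) - torch.arange(6).unsqueeze(1)  # [Lq, Lk]
print(relative_position_bucket(rel_pos))   # small offsets exact, large ones bucketed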
wan/modules/tokenizers.py ADDED
@@ -0,0 +1,82 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import html
3
+ import string
4
+
5
+ import ftfy
6
+ import regex as re
7
+ from transformers import AutoTokenizer
8
+
9
+ __all__ = ['HuggingfaceTokenizer']
10
+
11
+
12
+ def basic_clean(text):
13
+ text = ftfy.fix_text(text)
14
+ text = html.unescape(html.unescape(text))
15
+ return text.strip()
16
+
17
+
18
+ def whitespace_clean(text):
19
+ text = re.sub(r'\s+', ' ', text)
20
+ text = text.strip()
21
+ return text
22
+
23
+
24
+ def canonicalize(text, keep_punctuation_exact_string=None):
25
+ text = text.replace('_', ' ')
26
+ if keep_punctuation_exact_string:
27
+ text = keep_punctuation_exact_string.join(
28
+ part.translate(str.maketrans('', '', string.punctuation))
29
+ for part in text.split(keep_punctuation_exact_string))
30
+ else:
31
+ text = text.translate(str.maketrans('', '', string.punctuation))
32
+ text = text.lower()
33
+ text = re.sub(r'\s+', ' ', text)
34
+ return text.strip()
35
+
36
+
37
+ class HuggingfaceTokenizer:
38
+
39
+ def __init__(self, name, seq_len=None, clean=None, **kwargs):
40
+ assert clean in (None, 'whitespace', 'lower', 'canonicalize')
41
+ self.name = name
42
+ self.seq_len = seq_len
43
+ self.clean = clean
44
+
45
+ # init tokenizer
46
+ self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
47
+ self.vocab_size = self.tokenizer.vocab_size
48
+
49
+ def __call__(self, sequence, **kwargs):
50
+ return_mask = kwargs.pop('return_mask', False)
51
+
52
+ # arguments
53
+ _kwargs = {'return_tensors': 'pt'}
54
+ if self.seq_len is not None:
55
+ _kwargs.update({
56
+ 'padding': 'max_length',
57
+ 'truncation': True,
58
+ 'max_length': self.seq_len
59
+ })
60
+ _kwargs.update(**kwargs)
61
+
62
+ # tokenization
63
+ if isinstance(sequence, str):
64
+ sequence = [sequence]
65
+ if self.clean:
66
+ sequence = [self._clean(u) for u in sequence]
67
+ ids = self.tokenizer(sequence, **_kwargs)
68
+
69
+ # output
70
+ if return_mask:
71
+ return ids.input_ids, ids.attention_mask
72
+ else:
73
+ return ids.input_ids
74
+
75
+ def _clean(self, text):
76
+ if self.clean == 'whitespace':
77
+ text = whitespace_clean(basic_clean(text))
78
+ elif self.clean == 'lower':
79
+ text = whitespace_clean(basic_clean(text)).lower()
80
+ elif self.clean == 'canonicalize':
81
+ text = canonicalize(basic_clean(text))
82
+ return text
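HuggingfaceTokenizer wraps AutoTokenizer with optional text cleaning and fixed-length padding. A hedged usage sketch (not part of this commit; it assumes the repo root is on PYTHONPATH and that the 'google/umt5-xxl' tokenizer files are available locally or downloadable):

from wan.modules.tokenizers import HuggingfaceTokenizer

tokenizer = HuggingfaceTokenizer(
    name='google/umt5-xxl',      # or a local tokenizer directory
    seq_len=512,                 # pad/truncate to the model's text_len
    clean='whitespace')

ids, mask = tokenizer(['A cat playing piano on stage'], return_mask=True)
print(ids.shape, int(mask.sum()))   # torch.Size([1, 512]), number of real tokens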
wan/modules/vace_model.py ADDED
@@ -0,0 +1,250 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ import torch.cuda.amp as amp
4
+ import torch.nn as nn
5
+ from diffusers.configuration_utils import register_to_config
6
+
7
+ from .model import WanAttentionBlock, WanModel, sinusoidal_embedding_1d
8
+
9
+
10
+ class VaceWanAttentionBlock(WanAttentionBlock):
11
+
12
+ def __init__(self,
13
+ cross_attn_type,
14
+ dim,
15
+ ffn_dim,
16
+ num_heads,
17
+ window_size=(-1, -1),
18
+ qk_norm=True,
19
+ cross_attn_norm=False,
20
+ eps=1e-6,
21
+ block_id=0):
22
+ super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size,
23
+ qk_norm, cross_attn_norm, eps)
24
+ self.block_id = block_id
25
+ if block_id == 0:
26
+ self.before_proj = nn.Linear(self.dim, self.dim)
27
+ nn.init.zeros_(self.before_proj.weight)
28
+ nn.init.zeros_(self.before_proj.bias)
29
+ self.after_proj = nn.Linear(self.dim, self.dim)
30
+ nn.init.zeros_(self.after_proj.weight)
31
+ nn.init.zeros_(self.after_proj.bias)
32
+
33
+ def forward(self, c, x, **kwargs):
34
+ if self.block_id == 0:
35
+ c = self.before_proj(c) + x
36
+
37
+ c = super().forward(c, **kwargs)
38
+ c_skip = self.after_proj(c)
39
+ return c, c_skip
40
+
41
+
42
+ class BaseWanAttentionBlock(WanAttentionBlock):
43
+
44
+ def __init__(self,
45
+ cross_attn_type,
46
+ dim,
47
+ ffn_dim,
48
+ num_heads,
49
+ window_size=(-1, -1),
50
+ qk_norm=True,
51
+ cross_attn_norm=False,
52
+ eps=1e-6,
53
+ block_id=None):
54
+ super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size,
55
+ qk_norm, cross_attn_norm, eps)
56
+ self.block_id = block_id
57
+
58
+ def forward(self, x, hints, context_scale=1.0, **kwargs):
59
+ x = super().forward(x, **kwargs)
60
+ if self.block_id is not None:
61
+ x = x + hints[self.block_id] * context_scale
62
+ return x
63
+
64
+
65
+ class VaceWanModel(WanModel):
66
+
67
+ @register_to_config
68
+ def __init__(self,
69
+ vace_layers=None,
70
+ vace_in_dim=None,
71
+ model_type='vace',
72
+ patch_size=(1, 2, 2),
73
+ text_len=512,
74
+ in_dim=16,
75
+ dim=2048,
76
+ ffn_dim=8192,
77
+ freq_dim=256,
78
+ text_dim=4096,
79
+ out_dim=16,
80
+ num_heads=16,
81
+ num_layers=32,
82
+ window_size=(-1, -1),
83
+ qk_norm=True,
84
+ cross_attn_norm=True,
85
+ eps=1e-6):
86
+ super().__init__(model_type, patch_size, text_len, in_dim, dim, ffn_dim,
87
+ freq_dim, text_dim, out_dim, num_heads, num_layers,
88
+ window_size, qk_norm, cross_attn_norm, eps)
89
+
90
+ self.vace_layers = [i for i in range(0, self.num_layers, 2)
91
+ ] if vace_layers is None else vace_layers
92
+ self.vace_in_dim = self.in_dim if vace_in_dim is None else vace_in_dim
93
+
94
+ assert 0 in self.vace_layers
95
+ self.vace_layers_mapping = {
96
+ i: n for n, i in enumerate(self.vace_layers)
97
+ }
98
+
99
+ # blocks
100
+ self.blocks = nn.ModuleList([
101
+ BaseWanAttentionBlock(
102
+ 't2v_cross_attn',
103
+ self.dim,
104
+ self.ffn_dim,
105
+ self.num_heads,
106
+ self.window_size,
107
+ self.qk_norm,
108
+ self.cross_attn_norm,
109
+ self.eps,
110
+ block_id=self.vace_layers_mapping[i]
111
+ if i in self.vace_layers else None)
112
+ for i in range(self.num_layers)
113
+ ])
114
+
115
+ # vace blocks
116
+ self.vace_blocks = nn.ModuleList([
117
+ VaceWanAttentionBlock(
118
+ 't2v_cross_attn',
119
+ self.dim,
120
+ self.ffn_dim,
121
+ self.num_heads,
122
+ self.window_size,
123
+ self.qk_norm,
124
+ self.cross_attn_norm,
125
+ self.eps,
126
+ block_id=i) for i in self.vace_layers
127
+ ])
128
+
129
+ # vace patch embeddings
130
+ self.vace_patch_embedding = nn.Conv3d(
131
+ self.vace_in_dim,
132
+ self.dim,
133
+ kernel_size=self.patch_size,
134
+ stride=self.patch_size)
135
+
136
+ def forward_vace(self, x, vace_context, seq_len, kwargs):
137
+ # embeddings
138
+ c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
139
+ c = [u.flatten(2).transpose(1, 2) for u in c]
140
+ c = torch.cat([
141
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
142
+ dim=1) for u in c
143
+ ])
144
+
145
+ # arguments
146
+ new_kwargs = dict(x=x)
147
+ new_kwargs.update(kwargs)
148
+
149
+ hints = []
150
+ for block in self.vace_blocks:
151
+ c, c_skip = block(c, **new_kwargs)
152
+ hints.append(c_skip)
153
+ return hints
154
+
155
+ def forward(
156
+ self,
157
+ x,
158
+ t,
159
+ vace_context,
160
+ context,
161
+ seq_len,
162
+ vace_context_scale=1.0,
163
+ clip_fea=None,
164
+ y=None,
165
+ ):
166
+ r"""
167
+ Forward pass through the diffusion model
168
+
169
+ Args:
170
+ x (List[Tensor]):
171
+ List of input video tensors, each with shape [C_in, F, H, W]
172
+ t (Tensor):
173
+ Diffusion timesteps tensor of shape [B]
174
+ context (List[Tensor]):
175
+ List of text embeddings each with shape [L, C]
176
+ seq_len (`int`):
177
+ Maximum sequence length for positional encoding
178
+ clip_fea (Tensor, *optional*):
179
+ CLIP image features for image-to-video mode
180
+ y (List[Tensor], *optional*):
181
+ Conditional video inputs for image-to-video mode, same shape as x
182
+
183
+ Returns:
184
+ List[Tensor]:
185
+ List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
186
+ """
187
+ # if self.model_type == 'i2v':
188
+ # assert clip_fea is not None and y is not None
189
+ # params
190
+ device = self.patch_embedding.weight.device
191
+ if self.freqs.device != device:
192
+ self.freqs = self.freqs.to(device)
193
+
194
+ # if y is not None:
195
+ # x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
196
+
197
+ # embeddings
198
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
199
+ grid_sizes = torch.stack(
200
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
201
+ x = [u.flatten(2).transpose(1, 2) for u in x]
202
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
203
+ assert seq_lens.max() <= seq_len
204
+ x = torch.cat([
205
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
206
+ dim=1) for u in x
207
+ ])
208
+
209
+ # time embeddings
210
+ with amp.autocast(dtype=torch.float32):
211
+ e = self.time_embedding(
212
+ sinusoidal_embedding_1d(self.freq_dim, t).float())
213
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
214
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
215
+
216
+ # context
217
+ context_lens = None
218
+ context = self.text_embedding(
219
+ torch.stack([
220
+ torch.cat(
221
+ [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
222
+ for u in context
223
+ ]))
224
+
225
+ # if clip_fea is not None:
226
+ # context_clip = self.img_emb(clip_fea) # bs x 257 x dim
227
+ # context = torch.concat([context_clip, context], dim=1)
228
+
229
+ # arguments
230
+ kwargs = dict(
231
+ e=e0,
232
+ seq_lens=seq_lens,
233
+ grid_sizes=grid_sizes,
234
+ freqs=self.freqs,
235
+ context=context,
236
+ context_lens=context_lens)
237
+
238
+ hints = self.forward_vace(x, vace_context, seq_len, kwargs)
239
+ kwargs['hints'] = hints
240
+ kwargs['context_scale'] = vace_context_scale
241
+
242
+ for block in self.blocks:
243
+ x = block(x, **kwargs)
244
+
245
+ # head
246
+ x = self.head(x, e)
247
+
248
+ # unpatchify
249
+ x = self.unpatchify(x, grid_sizes)
250
+ return [u.float() for u in x]
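The VACE wiring above is a two-stream design: `forward_vace` pushes the VACE context through `vace_blocks`, collecting one `c_skip` hint per block, and `BaseWanAttentionBlock` adds `hints[block_id] * context_scale` back into the main video-token stream at the mapped layers. A minimal sketch of that injection pattern, with toy linear layers standing in for the real attention blocks (all dims and tensors below are illustrative, not taken from the model):

```python
# Toy illustration of the hint-injection pattern; not the real blocks.
import torch
import torch.nn as nn

dim, seq = 8, 4
x = torch.randn(1, seq, dim)          # main video-token stream
c = torch.randn(1, seq, dim)          # VACE context stream

before_proj = nn.Linear(dim, dim)     # zero-initialized, as in VaceWanAttentionBlock
after_proj = nn.Linear(dim, dim)
for m in (before_proj, after_proj):
    nn.init.zeros_(m.weight)
    nn.init.zeros_(m.bias)

c = before_proj(c) + x                # block_id == 0: fold x into the hint stream
hint = after_proj(c)                  # the c_skip returned by VaceWanAttentionBlock
x = x + hint * 1.0                    # BaseWanAttentionBlock: add scaled hint

# With zero init, hint == 0 at the start of training, so the VACE branch
# initially leaves the pretrained WanModel output unchanged.
print(torch.allclose(hint, torch.zeros_like(hint)))  # True
```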
wan/modules/vae.py ADDED
@@ -0,0 +1,663 @@
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+ import logging
+
+ import torch
+ import torch.cuda.amp as amp
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange
+
+ __all__ = [
+ 'WanVAE',
+ ]
+
+ CACHE_T = 2
+
+
+ class CausalConv3d(nn.Conv3d):
+ """
+ Causal 3D convolution.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._padding = (self.padding[2], self.padding[2], self.padding[1],
+ self.padding[1], 2 * self.padding[0], 0)
+ self.padding = (0, 0, 0)
+
+ def forward(self, x, cache_x=None):
+ padding = list(self._padding)
+ if cache_x is not None and self._padding[4] > 0:
+ cache_x = cache_x.to(x.device)
+ x = torch.cat([cache_x, x], dim=2)
+ padding[4] -= cache_x.shape[2]
+ x = F.pad(x, padding)
+
+ return super().forward(x)
+
+
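Note on the block above (editorial, not part of the diff): `CausalConv3d` pads the time axis only on the left (`2 * padding[0]` frames in front, none behind), so output frame t never looks at frames after t; during chunked encoding/decoding, `cache_x` simply replaces part of that front padding with the trailing frames of the previous chunk. A small, purely illustrative check of the causality claim using only `torch.nn.functional`:

```python
# Illustrative sketch: asymmetric temporal padding makes the conv causal.
import torch
import torch.nn.functional as F

x = torch.randn(1, 1, 5, 4, 4)                    # [B, C, T, H, W]
w = torch.randn(1, 1, 3, 3, 3)                    # temporal kernel size 3

pad = (1, 1, 1, 1, 2, 0)                          # (W, W, H, H, 2 front, 0 back)
y = F.conv3d(F.pad(x, pad), w)                    # output keeps T = 5

# Changing a future frame must not change earlier outputs.
x2 = x.clone()
x2[:, :, 4] += 1.0                                # perturb only the last frame
y2 = F.conv3d(F.pad(x2, pad), w)
print(torch.allclose(y[:, :, :4], y2[:, :, :4]))  # True: frames 0-3 unchanged
```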
+ class RMS_norm(nn.Module):
+
+ def __init__(self, dim, channel_first=True, images=True, bias=False):
+ super().__init__()
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
+
+ self.channel_first = channel_first
+ self.scale = dim**0.5
+ self.gamma = nn.Parameter(torch.ones(shape))
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.
+
+ def forward(self, x):
+ return F.normalize(
+ x, dim=(1 if self.channel_first else
+ -1)) * self.scale * self.gamma + self.bias
+
+
+ class Upsample(nn.Upsample):
+
+ def forward(self, x):
+ """
+ Fix bfloat16 support for nearest neighbor interpolation.
+ """
+ return super().forward(x.float()).type_as(x)
+
+
+ class Resample(nn.Module):
+
+ def __init__(self, dim, mode):
+ assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
+ 'downsample3d')
+ super().__init__()
+ self.dim = dim
+ self.mode = mode
+
+ # layers
+ if mode == 'upsample2d':
+ self.resample = nn.Sequential(
+ Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
+ nn.Conv2d(dim, dim // 2, 3, padding=1))
+ elif mode == 'upsample3d':
+ self.resample = nn.Sequential(
+ Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
+ nn.Conv2d(dim, dim // 2, 3, padding=1))
+ self.time_conv = CausalConv3d(
+ dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
+
+ elif mode == 'downsample2d':
+ self.resample = nn.Sequential(
+ nn.ZeroPad2d((0, 1, 0, 1)),
+ nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+ elif mode == 'downsample3d':
+ self.resample = nn.Sequential(
+ nn.ZeroPad2d((0, 1, 0, 1)),
+ nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+ self.time_conv = CausalConv3d(
+ dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
+
+ else:
+ self.resample = nn.Identity()
+
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
+ b, c, t, h, w = x.size()
+ if self.mode == 'upsample3d':
+ if feat_cache is not None:
+ idx = feat_idx[0]
+ if feat_cache[idx] is None:
+ feat_cache[idx] = 'Rep'
+ feat_idx[0] += 1
+ else:
+
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
+ if cache_x.shape[2] < 2 and feat_cache[
+ idx] is not None and feat_cache[idx] != 'Rep':
+ # cache last frame of last two chunk
+ cache_x = torch.cat([
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+ cache_x.device), cache_x
+ ],
+ dim=2)
+ if cache_x.shape[2] < 2 and feat_cache[
+ idx] is not None and feat_cache[idx] == 'Rep':
+ cache_x = torch.cat([
+ torch.zeros_like(cache_x).to(cache_x.device),
+ cache_x
+ ],
+ dim=2)
+ if feat_cache[idx] == 'Rep':
+ x = self.time_conv(x)
+ else:
+ x = self.time_conv(x, feat_cache[idx])
+ feat_cache[idx] = cache_x
+ feat_idx[0] += 1
+
+ x = x.reshape(b, 2, c, t, h, w)
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
+ 3)
+ x = x.reshape(b, c, t * 2, h, w)
+ t = x.shape[2]
+ x = rearrange(x, 'b c t h w -> (b t) c h w')
+ x = self.resample(x)
+ x = rearrange(x, '(b t) c h w -> b c t h w', t=t)
+
+ if self.mode == 'downsample3d':
+ if feat_cache is not None:
+ idx = feat_idx[0]
+ if feat_cache[idx] is None:
+ feat_cache[idx] = x.clone()
+ feat_idx[0] += 1
+ else:
+
+ cache_x = x[:, :, -1:, :, :].clone()
+ # if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx]!='Rep':
+ # # cache last frame of last two chunk
+ # cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+
+ x = self.time_conv(
+ torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
+ feat_cache[idx] = cache_x
+ feat_idx[0] += 1
+ return x
+
+ def init_weight(self, conv):
+ conv_weight = conv.weight
+ nn.init.zeros_(conv_weight)
+ c1, c2, t, h, w = conv_weight.size()
+ one_matrix = torch.eye(c1, c2)
+ init_matrix = one_matrix
+ nn.init.zeros_(conv_weight)
+ #conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
+ conv_weight.data[:, :, 1, 0, 0] = init_matrix #* 0.5
+ conv.weight.data.copy_(conv_weight)
+ nn.init.zeros_(conv.bias.data)
+
+ def init_weight2(self, conv):
+ conv_weight = conv.weight.data
+ nn.init.zeros_(conv_weight)
+ c1, c2, t, h, w = conv_weight.size()
+ init_matrix = torch.eye(c1 // 2, c2)
+ #init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
+ conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
+ conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
+ conv.weight.data.copy_(conv_weight)
+ nn.init.zeros_(conv.bias.data)
+
+
+ class ResidualBlock(nn.Module):
+
+ def __init__(self, in_dim, out_dim, dropout=0.0):
+ super().__init__()
+ self.in_dim = in_dim
+ self.out_dim = out_dim
+
+ # layers
+ self.residual = nn.Sequential(
+ RMS_norm(in_dim, images=False), nn.SiLU(),
+ CausalConv3d(in_dim, out_dim, 3, padding=1),
+ RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
+ CausalConv3d(out_dim, out_dim, 3, padding=1))
+ self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
+ if in_dim != out_dim else nn.Identity()
+
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
+ h = self.shortcut(x)
+ for layer in self.residual:
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
+ idx = feat_idx[0]
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+ # cache last frame of last two chunk
+ cache_x = torch.cat([
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+ cache_x.device), cache_x
+ ],
+ dim=2)
+ x = layer(x, feat_cache[idx])
+ feat_cache[idx] = cache_x
+ feat_idx[0] += 1
+ else:
+ x = layer(x)
+ return x + h
+
+
+ class AttentionBlock(nn.Module):
+ """
+ Causal self-attention with a single head.
+ """
+
+ def __init__(self, dim):
+ super().__init__()
+ self.dim = dim
+
+ # layers
+ self.norm = RMS_norm(dim)
+ self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
+ self.proj = nn.Conv2d(dim, dim, 1)
+
+ # zero out the last layer params
+ nn.init.zeros_(self.proj.weight)
+
+ def forward(self, x):
+ identity = x
+ b, c, t, h, w = x.size()
+ x = rearrange(x, 'b c t h w -> (b t) c h w')
+ x = self.norm(x)
+ # compute query, key, value
+ q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3,
+ -1).permute(0, 1, 3,
+ 2).contiguous().chunk(
+ 3, dim=-1)
+
+ # apply attention
+ x = F.scaled_dot_product_attention(
+ q,
+ k,
+ v,
+ )
+ x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
+
+ # output
+ x = self.proj(x)
+ x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
+ return x + identity
+
+
+ class Encoder3d(nn.Module):
+
+ def __init__(self,
+ dim=128,
+ z_dim=4,
+ dim_mult=[1, 2, 4, 4],
+ num_res_blocks=2,
+ attn_scales=[],
+ temperal_downsample=[True, True, False],
+ dropout=0.0):
+ super().__init__()
+ self.dim = dim
+ self.z_dim = z_dim
+ self.dim_mult = dim_mult
+ self.num_res_blocks = num_res_blocks
+ self.attn_scales = attn_scales
+ self.temperal_downsample = temperal_downsample
+
+ # dimensions
+ dims = [dim * u for u in [1] + dim_mult]
+ scale = 1.0
+
+ # init block
+ self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
+
+ # downsample blocks
+ downsamples = []
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+ # residual (+attention) blocks
+ for _ in range(num_res_blocks):
+ downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
+ if scale in attn_scales:
+ downsamples.append(AttentionBlock(out_dim))
+ in_dim = out_dim
+
+ # downsample block
+ if i != len(dim_mult) - 1:
+ mode = 'downsample3d' if temperal_downsample[
+ i] else 'downsample2d'
+ downsamples.append(Resample(out_dim, mode=mode))
+ scale /= 2.0
+ self.downsamples = nn.Sequential(*downsamples)
+
+ # middle blocks
+ self.middle = nn.Sequential(
+ ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim),
+ ResidualBlock(out_dim, out_dim, dropout))
+
+ # output blocks
+ self.head = nn.Sequential(
+ RMS_norm(out_dim, images=False), nn.SiLU(),
+ CausalConv3d(out_dim, z_dim, 3, padding=1))
+
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
+ if feat_cache is not None:
+ idx = feat_idx[0]
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+ # cache last frame of last two chunk
+ cache_x = torch.cat([
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+ cache_x.device), cache_x
+ ],
+ dim=2)
+ x = self.conv1(x, feat_cache[idx])
+ feat_cache[idx] = cache_x
+ feat_idx[0] += 1
+ else:
+ x = self.conv1(x)
+
+ ## downsamples
+ for layer in self.downsamples:
+ if feat_cache is not None:
+ x = layer(x, feat_cache, feat_idx)
+ else:
+ x = layer(x)
+
+ ## middle
+ for layer in self.middle:
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
+ x = layer(x, feat_cache, feat_idx)
+ else:
+ x = layer(x)
+
+ ## head
+ for layer in self.head:
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
+ idx = feat_idx[0]
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+ # cache last frame of last two chunk
+ cache_x = torch.cat([
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+ cache_x.device), cache_x
+ ],
+ dim=2)
+ x = layer(x, feat_cache[idx])
+ feat_cache[idx] = cache_x
+ feat_idx[0] += 1
+ else:
+ x = layer(x)
+ return x
+
+
+ class Decoder3d(nn.Module):
+
+ def __init__(self,
+ dim=128,
+ z_dim=4,
+ dim_mult=[1, 2, 4, 4],
+ num_res_blocks=2,
+ attn_scales=[],
+ temperal_upsample=[False, True, True],
+ dropout=0.0):
+ super().__init__()
+ self.dim = dim
+ self.z_dim = z_dim
+ self.dim_mult = dim_mult
+ self.num_res_blocks = num_res_blocks
+ self.attn_scales = attn_scales
+ self.temperal_upsample = temperal_upsample
+
+ # dimensions
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+ scale = 1.0 / 2**(len(dim_mult) - 2)
+
+ # init block
+ self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
+
+ # middle blocks
+ self.middle = nn.Sequential(
+ ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]),
+ ResidualBlock(dims[0], dims[0], dropout))
+
+ # upsample blocks
+ upsamples = []
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+ # residual (+attention) blocks
+ if i == 1 or i == 2 or i == 3:
+ in_dim = in_dim // 2
+ for _ in range(num_res_blocks + 1):
+ upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
+ if scale in attn_scales:
+ upsamples.append(AttentionBlock(out_dim))
+ in_dim = out_dim
+
+ # upsample block
+ if i != len(dim_mult) - 1:
+ mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
+ upsamples.append(Resample(out_dim, mode=mode))
+ scale *= 2.0
+ self.upsamples = nn.Sequential(*upsamples)
+
+ # output blocks
+ self.head = nn.Sequential(
+ RMS_norm(out_dim, images=False), nn.SiLU(),
+ CausalConv3d(out_dim, 3, 3, padding=1))
+
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
+ ## conv1
+ if feat_cache is not None:
+ idx = feat_idx[0]
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+ # cache last frame of last two chunk
+ cache_x = torch.cat([
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+ cache_x.device), cache_x
+ ],
+ dim=2)
+ x = self.conv1(x, feat_cache[idx])
+ feat_cache[idx] = cache_x
+ feat_idx[0] += 1
+ else:
+ x = self.conv1(x)
+
+ ## middle
+ for layer in self.middle:
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
+ x = layer(x, feat_cache, feat_idx)
+ else:
+ x = layer(x)
+
+ ## upsamples
+ for layer in self.upsamples:
+ if feat_cache is not None:
+ x = layer(x, feat_cache, feat_idx)
+ else:
+ x = layer(x)
+
+ ## head
+ for layer in self.head:
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
+ idx = feat_idx[0]
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+ # cache last frame of last two chunk
+ cache_x = torch.cat([
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+ cache_x.device), cache_x
+ ],
+ dim=2)
+ x = layer(x, feat_cache[idx])
+ feat_cache[idx] = cache_x
+ feat_idx[0] += 1
+ else:
+ x = layer(x)
+ return x
+
+
+ def count_conv3d(model):
+ count = 0
+ for m in model.modules():
+ if isinstance(m, CausalConv3d):
+ count += 1
+ return count
+
+
+ class WanVAE_(nn.Module):
+
+ def __init__(self,
+ dim=128,
+ z_dim=4,
+ dim_mult=[1, 2, 4, 4],
+ num_res_blocks=2,
+ attn_scales=[],
+ temperal_downsample=[True, True, False],
+ dropout=0.0):
+ super().__init__()
+ self.dim = dim
+ self.z_dim = z_dim
+ self.dim_mult = dim_mult
+ self.num_res_blocks = num_res_blocks
+ self.attn_scales = attn_scales
+ self.temperal_downsample = temperal_downsample
+ self.temperal_upsample = temperal_downsample[::-1]
+
+ # modules
+ self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
+ attn_scales, self.temperal_downsample, dropout)
+ self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
+ self.conv2 = CausalConv3d(z_dim, z_dim, 1)
+ self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
+ attn_scales, self.temperal_upsample, dropout)
+
+ def forward(self, x):
+ mu, log_var = self.encode(x)
+ z = self.reparameterize(mu, log_var)
+ x_recon = self.decode(z)
+ return x_recon, mu, log_var
+
+ def encode(self, x, scale):
+ self.clear_cache()
+ ## cache
+ t = x.shape[2]
+ iter_ = 1 + (t - 1) // 4
+ ## split the encoder input x along time into chunks of 1, 4, 4, 4, ... frames
+ for i in range(iter_):
+ self._enc_conv_idx = [0]
+ if i == 0:
+ out = self.encoder(
+ x[:, :, :1, :, :],
+ feat_cache=self._enc_feat_map,
+ feat_idx=self._enc_conv_idx)
+ else:
+ out_ = self.encoder(
+ x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
+ feat_cache=self._enc_feat_map,
+ feat_idx=self._enc_conv_idx)
+ out = torch.cat([out, out_], 2)
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
+ if isinstance(scale[0], torch.Tensor):
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
+ 1, self.z_dim, 1, 1, 1)
+ else:
+ mu = (mu - scale[0]) * scale[1]
+ self.clear_cache()
+ return mu
+
+ def decode(self, z, scale):
+ self.clear_cache()
+ # z: [b,c,t,h,w]
+ if isinstance(scale[0], torch.Tensor):
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
+ 1, self.z_dim, 1, 1, 1)
+ else:
+ z = z / scale[1] + scale[0]
+ iter_ = z.shape[2]
+ x = self.conv2(z)
+ for i in range(iter_):
+ self._conv_idx = [0]
+ if i == 0:
+ out = self.decoder(
+ x[:, :, i:i + 1, :, :],
+ feat_cache=self._feat_map,
+ feat_idx=self._conv_idx)
+ else:
+ out_ = self.decoder(
+ x[:, :, i:i + 1, :, :],
+ feat_cache=self._feat_map,
+ feat_idx=self._conv_idx)
+ out = torch.cat([out, out_], 2)
+ self.clear_cache()
+ return out
+
+ def reparameterize(self, mu, log_var):
+ std = torch.exp(0.5 * log_var)
+ eps = torch.randn_like(std)
+ return eps * std + mu
+
+ def sample(self, imgs, deterministic=False):
+ mu, log_var = self.encode(imgs)
+ if deterministic:
+ return mu
+ std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
+ return mu + std * torch.randn_like(std)
+
+ def clear_cache(self):
+ self._conv_num = count_conv3d(self.decoder)
+ self._conv_idx = [0]
+ self._feat_map = [None] * self._conv_num
+ #cache encode
+ self._enc_conv_num = count_conv3d(self.encoder)
+ self._enc_conv_idx = [0]
+ self._enc_feat_map = [None] * self._enc_conv_num
+
+
+ def _video_vae(pretrained_path=None, z_dim=None, device='cpu', **kwargs):
+ """
+ Autoencoder3d adapted from Stable Diffusion 1.x, 2.x and XL.
+ """
+ # params
+ cfg = dict(
+ dim=96,
+ z_dim=z_dim,
+ dim_mult=[1, 2, 4, 4],
+ num_res_blocks=2,
+ attn_scales=[],
+ temperal_downsample=[False, True, True],
+ dropout=0.0)
+ cfg.update(**kwargs)
+
+ # init model
+ with torch.device('meta'):
+ model = WanVAE_(**cfg)
+
+ # load checkpoint
+ logging.info(f'loading {pretrained_path}')
+ model.load_state_dict(
+ torch.load(pretrained_path, map_location=device), assign=True)
+
+ return model
+
+
+ class WanVAE:
+
+ def __init__(self,
+ z_dim=16,
+ vae_pth='cache/vae_step_411000.pth',
+ dtype=torch.float,
+ device="cuda"):
+ self.dtype = dtype
+ self.device = device
+
+ mean = [
+ -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
+ 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
+ ]
+ std = [
+ 2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
+ 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
+ ]
+ self.mean = torch.tensor(mean, dtype=dtype, device=device)
+ self.std = torch.tensor(std, dtype=dtype, device=device)
+ self.scale = [self.mean, 1.0 / self.std]
+
+ # init model
+ self.model = _video_vae(
+ pretrained_path=vae_pth,
+ z_dim=z_dim,
+ ).eval().requires_grad_(False).to(device)
+
+ def encode(self, videos):
+ """
+ videos: A list of videos each with shape [C, T, H, W].
+ """
+ with amp.autocast(dtype=self.dtype):
+ return [
+ self.model.encode(u.unsqueeze(0), self.scale).float().squeeze(0)
+ for u in videos
+ ]
+
+ def decode(self, zs):
+ with amp.autocast(dtype=self.dtype):
+ return [
+ self.model.decode(u.unsqueeze(0),
+ self.scale).float().clamp_(-1, 1).squeeze(0)
+ for u in zs
+ ]
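The `WanVAE` wrapper above normalizes latents with the per-channel mean/std list and processes video in chunks of 1 + 4k frames, so T input frames map to 1 + (T - 1) / 4 latent frames at 8x spatial compression. A hedged usage sketch follows; the checkpoint path is just the class's default placeholder (the real path comes from the pipeline configs), the 81-frame 480x832 clip is an arbitrary example, and a CUDA device with the downloaded VAE weights is assumed.

```python
# Illustrative use of the WanVAE wrapper above; path, shape and device are assumptions.
import torch
from wan.modules.vae import WanVAE

vae = WanVAE(z_dim=16, vae_pth='cache/vae_step_411000.pth',
             dtype=torch.float, device='cuda')

# Videos are [C, T, H, W] in [-1, 1]; T = 1 + 4k maps to 1 + k latent frames.
video = torch.rand(3, 81, 480, 832, device='cuda') * 2 - 1
latent = vae.encode([video])[0]   # [16, 21, 60, 104]: 4x temporal, 8x spatial
recon = vae.decode([latent])[0]   # back to [3, 81, 480, 832], clamped to [-1, 1]
```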
wan/modules/xlm_roberta.py ADDED
@@ -0,0 +1,170 @@
+ # Modified from transformers.models.xlm_roberta.modeling_xlm_roberta
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ __all__ = ['XLMRoberta', 'xlm_roberta_large']
+
+
+ class SelfAttention(nn.Module):
+
+ def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
+ assert dim % num_heads == 0
+ super().__init__()
+ self.dim = dim
+ self.num_heads = num_heads
+ self.head_dim = dim // num_heads
+ self.eps = eps
+
+ # layers
+ self.q = nn.Linear(dim, dim)
+ self.k = nn.Linear(dim, dim)
+ self.v = nn.Linear(dim, dim)
+ self.o = nn.Linear(dim, dim)
+ self.dropout = nn.Dropout(dropout)
+
+ def forward(self, x, mask):
+ """
+ x: [B, L, C].
+ """
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
+
+ # compute query, key, value
+ q = self.q(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
+ k = self.k(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
+ v = self.v(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
+
+ # compute attention
+ p = self.dropout.p if self.training else 0.0
+ x = F.scaled_dot_product_attention(q, k, v, mask, p)
+ x = x.permute(0, 2, 1, 3).reshape(b, s, c)
+
+ # output
+ x = self.o(x)
+ x = self.dropout(x)
+ return x
+
+
+ class AttentionBlock(nn.Module):
+
+ def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
+ super().__init__()
+ self.dim = dim
+ self.num_heads = num_heads
+ self.post_norm = post_norm
+ self.eps = eps
+
+ # layers
+ self.attn = SelfAttention(dim, num_heads, dropout, eps)
+ self.norm1 = nn.LayerNorm(dim, eps=eps)
+ self.ffn = nn.Sequential(
+ nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim),
+ nn.Dropout(dropout))
+ self.norm2 = nn.LayerNorm(dim, eps=eps)
+
+ def forward(self, x, mask):
+ if self.post_norm:
+ x = self.norm1(x + self.attn(x, mask))
+ x = self.norm2(x + self.ffn(x))
+ else:
+ x = x + self.attn(self.norm1(x), mask)
+ x = x + self.ffn(self.norm2(x))
+ return x
+
+
+ class XLMRoberta(nn.Module):
+ """
+ XLMRobertaModel with no pooler and no LM head.
+ """
+
+ def __init__(self,
+ vocab_size=250002,
+ max_seq_len=514,
+ type_size=1,
+ pad_id=1,
+ dim=1024,
+ num_heads=16,
+ num_layers=24,
+ post_norm=True,
+ dropout=0.1,
+ eps=1e-5):
+ super().__init__()
+ self.vocab_size = vocab_size
+ self.max_seq_len = max_seq_len
+ self.type_size = type_size
+ self.pad_id = pad_id
+ self.dim = dim
+ self.num_heads = num_heads
+ self.num_layers = num_layers
+ self.post_norm = post_norm
+ self.eps = eps
+
+ # embeddings
+ self.token_embedding = nn.Embedding(vocab_size, dim, padding_idx=pad_id)
+ self.type_embedding = nn.Embedding(type_size, dim)
+ self.pos_embedding = nn.Embedding(max_seq_len, dim, padding_idx=pad_id)
+ self.dropout = nn.Dropout(dropout)
+
+ # blocks
+ self.blocks = nn.ModuleList([
+ AttentionBlock(dim, num_heads, post_norm, dropout, eps)
+ for _ in range(num_layers)
+ ])
+
+ # norm layer
+ self.norm = nn.LayerNorm(dim, eps=eps)
+
+ def forward(self, ids):
+ """
+ ids: [B, L] of torch.LongTensor.
+ """
+ b, s = ids.shape
+ mask = ids.ne(self.pad_id).long()
+
+ # embeddings
+ x = self.token_embedding(ids) + \
+ self.type_embedding(torch.zeros_like(ids)) + \
+ self.pos_embedding(self.pad_id + torch.cumsum(mask, dim=1) * mask)
+ if self.post_norm:
+ x = self.norm(x)
+ x = self.dropout(x)
+
+ # blocks
+ mask = torch.where(
+ mask.view(b, 1, 1, s).gt(0), 0.0,
+ torch.finfo(x.dtype).min)
+ for block in self.blocks:
+ x = block(x, mask)
+
+ # output
+ if not self.post_norm:
+ x = self.norm(x)
+ return x
+
+
+ def xlm_roberta_large(pretrained=False,
+ return_tokenizer=False,
+ device='cpu',
+ **kwargs):
+ """
+ XLMRobertaLarge adapted from Huggingface.
+ """
+ # params
+ cfg = dict(
+ vocab_size=250002,
+ max_seq_len=514,
+ type_size=1,
+ pad_id=1,
+ dim=1024,
+ num_heads=16,
+ num_layers=24,
+ post_norm=True,
+ dropout=0.1,
+ eps=1e-5)
+ cfg.update(**kwargs)
+
+ # init a model on device
+ with torch.device(device):
+ model = XLMRoberta(**cfg)
+ return model
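A minimal sketch of driving the XLM-R trunk defined above. The weights here are randomly initialized; pretrained weights are loaded elsewhere in the repo, and the token ids below are made up purely to show the expected shapes and the pad-masking behavior (non-pad positions get position ids via `pad_id + cumsum(mask)`).

```python
# Illustrative only: instantiate the XLM-R trunk above and run dummy ids.
import torch
from wan.modules.xlm_roberta import xlm_roberta_large

model = xlm_roberta_large(device='cpu').eval()

pad_id = 1
ids = torch.tensor([[0, 21, 87, 450, 2, pad_id, pad_id, pad_id]])  # [B, L]
with torch.no_grad():
    feats = model(ids)     # [1, 8, 1024] per-token features; pads are masked out
print(feats.shape)
```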