diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..95d6918c199c8bfaa43ba6e2e6e76f2274847576
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,8 @@
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
+PXL_20240227_181242253jpg filter=lfs diff=lfs merge=lfs -text
+PXL_20240227_181242253jpg* filter=lfs diff=lfs merge=lfs -text
+*PXL_20240227_181242253jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..6b4cb718bda164359e3bee1544b7473e0269aac7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+__pycache__/
+.venv
+venv/
+logs/
+uv.lock
+main.exp
+main.lib
+main.obj
+dataset/Wan
+Models/
+Output_LoRAs/
\ No newline at end of file
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000000000000000000000000000000000000..c8cfe3959183f8e9a50f83f54cd723f2dc9c252d
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.10
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..5a546a5f2a8ed3003b391887f68500f0ef150e8d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,49 @@
+# Use a standard Python 3.12 base image
+FROM python:3.12-slim
+
+# Set the working directory inside the container
+WORKDIR /code
+
+# Install git and aria2 without recommended extras; drop the apt lists in the same layer to keep the image small
+RUN apt-get update && apt-get install -y --no-install-recommends git aria2 ca-certificates && rm -rf /var/lib/apt/lists/*
+
+# Copy the requirements file first to leverage Docker cache
+COPY requirements.txt .
+
+# Install the correct CUDA-enabled PyTorch version and other requirements
+RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+RUN pip install --no-cache-dir -r requirements.txt
+
+# --- NEW SECTION: DOWNLOAD MODELS ---
+# Download the official Wan2.1 models from their Hugging Face repository
+# This downloads them into a "Models/Wan" folder inside the container
+RUN huggingface-cli download wan-video/wan2.1 \
+    --repo-type model \
+    --include "*.pth" "*.json" "*.safetensors" \
+    --local-dir Models/Wan --local-dir-use-symlinks False
+
+# Copy all your project files (code, dataset configs, etc.) into the container
+COPY . .
+
+# This is the command that will run when the Space starts.
+# It uses the models we just downloaded.
+CMD ["accelerate", "launch", "wan_train_network.py", \
+     "--task", "i2v-14B", \
+     "--dit", "Models/Wan/wan2.1_i2v_720p_14B_fp8_e4m3fn.safetensors", \
+     "--vae", "Models/Wan/Wan2.1_VAE.pth", \
+     "--clip", "Models/Wan/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", \
+     "--t5", "Models/Wan/models_t5_umt5-xxl-enc-bf16.pth", \
+     "--dataset_config", "dataset/testtoml.toml", \
+     "--output_dir", "/data/output", \
+     "--output_name", "My_HF_Lora_v1", \
+     "--save_every_n_epochs", "10", \
+     "--max_train_epochs", "70", \
+     "--network_module", "networks.lora_wan", \
+     "--network_dim", "32", \
+     "--network_alpha", "4", \
+     "--learning_rate", "2e-5", \
+     "--optimizer_type", "adamw", \
+     "--mixed_precision", "bf16", \
+     "--gradient_checkpointing", \
+     "--sdpa" \
+]
\ No newline at end of file
diff --git a/README.ja.md b/README.ja.md
new file mode 100644
index 0000000000000000000000000000000000000000..0bc28a1359d6d4d83e048947ea4aa579606124c4
--- /dev/null
+++ b/README.ja.md
@@ -0,0 +1,426 @@
+# GUIの使い方
+
+- GUIを開くには、次のコマンドを実行します - `Start_Wan_GUI.bat`
+- すべての設定は、 "**Load Settings**" および "**Save Setting**" ボタンを使用して保存および読み込むことができます。
+- 設定の詳細については以下を参照してください。 [Wan2.1 documentation](./docs/wan.md), [Advanced Configuration](./docs/advanced_config.md#fp8-quantization), [Dataset configuration guide](./dataset/dataset_config.md).
+
+
+![Preview](docs/Preview.png)
+
+
+
+# Musubi Tuner
+
+[English](./README.md) | [日本語](./README.ja.md)
+
+## 目次
+
+- [はじめに](#はじめに)
+    - [最近の更新](#最近の更新)
+    - [リリースについて](#リリースについて)
+- [概要](#概要)
+    - [ハードウェア要件](#ハードウェア要件)
+    - [特徴](#特徴)
+- [インストール](#インストール)
+- [モデルのダウンロード](#モデルのダウンロード)
+    - [HunyuanVideoの公式モデルを使う](#HunyuanVideoの公式モデルを使う)
+    - [Text EncoderにComfyUI提供のモデルを使う](#Text-EncoderにComfyUI提供のモデルを使う)
+- [使い方](#使い方)
+    - [データセット設定](#データセット設定)
+    - [latentの事前キャッシュ](#latentの事前キャッシュ)
+    - [Text Encoder出力の事前キャッシュ](#Text-Encoder出力の事前キャッシュ)
+    - [学習](#学習)
+    - [LoRAの重みのマージ](#LoRAの重みのマージ)
+    - [推論](#推論)
+    - [SkyReels V1での推論](#SkyReels-V1での推論)
+    - [LoRAの形式の変換](#LoRAの形式の変換)
+- [その他](#その他)
+    - [SageAttentionのインストール方法](#SageAttentionのインストール方法)
+- [免責事項](#免責事項)
+- [コントリビューションについて](#コントリビューションについて)
+- [ライセンス](#ライセンス)
+
+## はじめに
+
+このリポジトリは、HunyuanVideoおよびWan2.1のLoRA学習用のコマンドラインツールです。このリポジトリは非公式であり、公式のHunyuanVideoやWan2.1のリポジトリとは関係ありません。
+
+Wan2.1については、[Wan2.1のドキュメント](./docs/wan.md)も参照してください。
+
+*リポジトリは開発中です。*
+
+### 最近の更新
+
+- 2025/03/16
+    - Wan2.1の学習で、fp16の重みを使用した場合でも重みがbf16にcastされていた不具合を修正しました。[PR #160](https://github.com/kohya-ss/musubi-tuner/pull/160)
+        - あわせてfp16の重みを使用するとサンプル画像生成で黒画像が生成される不具合を修正しました。
+        - fp16の学習で不具合が起きる場合にはbf16をお使いください。
+    - Wan2.1の推論スクリプトをリファクタリングしました。`--fp8_fast`と`--compile`オプションが追加されました。詳しくは[こちら](./docs/wan.md#inference--推論)を参照してください。PR [#153](https://github.com/kohya-ss/musubi-tuner/pull/153)
+        - 大幅に変更を行ったため、不具合があればお知らせください。
+    - 先日追加された`--fp8_scaled`オプションは、fp8での学習および推論の精度向上に効果があるようです。`--fp8_base`で学習している場合や、`--fp8`で推論している場合は、`--fp8_scaled`の追加をご検討ください。問題があればご連絡ください。
+    
+- 2025/03/13
+    - HunyuanVideoの推論スクリプトで、RTX 40x0向けの高速化オプション`--fp8_fast`と、`torch.compile`を使用するオプション`--compile`が追加されました。[PR #137](https://github.com/kohya-ss/musubi-tuner/pull/137) Sarania 氏に感謝いたします。
+        - 詳細は[推論](#推論)を参照してください。
+    - Wan2.1の学習、推論で、fp8量子化を行うオプション`--fp8_scaled`を追加しました。[PR #141](https://github.com/kohya-ss/musubi-tuner/pull/141)
+        - 単純なFP8へのキャストではなく、スケーリングを行うことで、VRAM使用量の削減と精度の維持を両立します。
+        - 詳細は[高度な設定](./docs/advanced_config.md#fp8-quantization)を参照してください。
+        - また`fp16`のモデルをWan2.1の学習と推論でサポートしました。
+
+- 2025/03/07
+    - Wan 2.1の学習で、サンプル画像生成を行わない場合でも`--t5`オプションが必須になっていたのを修正しました。
+
+- 2025/03/07
+    - Wan 2.1のLoRA学習をサポートしました。`wan_train_network.py`を使用してください。詳細は[こちら](./docs/wan.md)を参照してください。
+
+- 2025/03/04
+    - Wan 2.1の推論をサポートしました。`wan_generate_video.py`を使用してください。詳細は[こちら](./docs/wan.md)を参照してください。
+        - `requirements.txt`が更新されました。`pip install -r requirements.txt`を実行してください。
+
+### リリースについて
+
+Musubi Tunerの解説記事執筆や、関連ツールの開発に取り組んでくださる方々に感謝いたします。このプロジェクトは開発中のため、互換性のない変更や機能追加が起きる可能性があります。想定外の互換性問題を避けるため、参照用として[リリース](https://github.com/kohya-ss/musubi-tuner/releases)をお使いください。
+
+最新のリリースとバージョン履歴は[リリースページ](https://github.com/kohya-ss/musubi-tuner/releases)で確認できます。
+
+## 概要
+
+### ハードウェア要件
+
+- VRAM: 静止画での学習は12GB以上推奨、動画での学習は24GB以上推奨。
+    - *解像度等の学習設定により異なります。*12GBでは解像度 960x544 以下とし、`--blocks_to_swap`、`--fp8_llm`等の省メモリオプションを使用してください。
+- メインメモリ: 64GB以上を推奨、32GB+スワップで動作するかもしれませんが、未検証です。
+
+### 特徴
+
+- 省メモリに特化
+- Windows対応（Linuxでの動作報告もあります）
+- マルチGPUには対応していません
+
+## インストール
+
+### pipによるインストール
+
+Python 3.10以上を使用してください（3.10で動作確認済み）。
+
+適当な仮想環境を作成し、ご利用のCUDAバージョンに合わせたPyTorchとtorchvisionをインストールしてください。
+
+PyTorchはバージョン2.5.1以上を使用してください（[補足](#PyTorchのバージョンについて)）。
+
+```bash
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu124
+```
+
+以下のコマンドを使用して、必要な依存関係をインストールします。
+
+```bash
+pip install -r requirements.txt
+```
+
+オプションとして、FlashAttention、SageAttention（推論にのみ使用、インストール方法は[こちら](#SageAttentionのインストール方法)を参照）を使用できます。
+
+また、`ascii-magic`（データセットの確認に使用）、`matplotlib`（timestepsの可視化に使用）、`tensorboard`（学習ログの記録に使用）を必要に応じてインストールしてください。
+
+```bash
+pip install ascii-magic matplotlib tensorboard
+```
+### uvによるインストール
+
+uvを使用してインストールすることもできますが、uvによるインストールは試験的なものです。フィードバックを歓迎します。
+
+#### Linux/MacOS
+
+```sh
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+
+表示される指示に従い、pathを設定してください。
+
+#### Windows
+
+```powershell
+powershell -c "irm https://astral.sh/uv/install.ps1 | iex"
+```
+
+表示される指示に従い、PATHを設定するか、この時点でシステムを再起動してください。
+
+## モデルのダウンロード
+
+以下のいずれかの方法で、モデルをダウンロードしてください。
+
+### HunyuanVideoの公式モデルを使う 
+
+[公式のREADME](https://github.com/Tencent/HunyuanVideo/blob/main/ckpts/README.md)を参考にダウンロードし、任意のディレクトリに以下のように配置します。
+
+```
+  ckpts
+    ├──hunyuan-video-t2v-720p
+    │  ├──transformers
+    │  ├──vae
+    ├──text_encoder
+    ├──text_encoder_2
+    ├──...
+```
+
+### Text EncoderにComfyUI提供のモデルを使う
+
+こちらの方法の方がより簡単です。DiTとVAEのモデルはHunyuanVideoのものを使用します。
+
+https://huggingface.co/tencent/HunyuanVideo/tree/main/hunyuan-video-t2v-720p/transformers から、[mp_rank_00_model_states.pt](https://huggingface.co/tencent/HunyuanVideo/resolve/main/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt) をダウンロードし、任意のディレクトリに配置します。
+
+（同じページにfp8のモデルもありますが、未検証です。）
+
+`--fp8_base`を指定して学習する場合は、`mp_rank_00_model_states.pt`の代わりに、[こちら](https://huggingface.co/kohya-ss/HunyuanVideo-fp8_e4m3fn-unofficial)の`mp_rank_00_model_states_fp8.safetensors`を使用可能です。（このファイルは非公式のもので、重みを単純にfloat8_e4m3fnに変換したものです。）
+
+また、https://huggingface.co/tencent/HunyuanVideo/tree/main/hunyuan-video-t2v-720p/vae から、[pytorch_model.pt](https://huggingface.co/tencent/HunyuanVideo/resolve/main/hunyuan-video-t2v-720p/vae/pytorch_model.pt) をダウンロードし、任意のディレクトリに配置します。
+
+Text EncoderにはComfyUI提供のモデルを使用させていただきます。[ComfyUIのページ](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)を参考に、https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged/tree/main/split_files/text_encoders から、llava_llama3_fp16.safetensors （Text Encoder 1、LLM）と、clip_l.safetensors （Text Encoder 2、CLIP）をダウンロードし、任意のディレクトリに配置します。
+
+（同じページにfp8のLLMモデルもありますが、動作未検証です。）
+
+## 使い方
+
+### データセット設定
+
+[こちら](./dataset/dataset_config.md)を参照してください。
+
+### latentの事前キャッシュ
+
+latentの事前キャッシュは必須です。以下のコマンドを使用して、事前キャッシュを作成してください。（pipによるインストールの場合）
+
+```bash
+python cache_latents.py --dataset_config path/to/toml --vae path/to/ckpts/hunyuan-video-t2v-720p/vae/pytorch_model.pt --vae_chunk_size 32 --vae_tiling
+```
+
+uvでインストールした場合は、`uv run python cache_latents.py ...`のように、`uv run`を先頭につけてください。以下のコマンドも同様です。
+
+その他のオプションは`python cache_latents.py --help`で確認できます。
+
+VRAMが足りない場合は、`--vae_spatial_tile_sample_min_size`を128程度に減らし、`--batch_size`を小さくしてください。
+
+`--debug_mode image` を指定するとデータセットの画像とキャプションが新規ウィンドウに表示されます。`--debug_mode console`でコンソールに表示されます（`ascii-magic`が必要）。
+
+デフォルトではデータセットに含まれないキャッシュファイルは自動的に削除されます。`--keep_cache`を指定すると、キャッシュファイルを残すことができます。
+
+### Text Encoder出力の事前キャッシュ
+
+Text Encoder出力の事前キャッシュは必須です。以下のコマンドを使用して、事前キャッシュを作成してください。
+
+```bash
+python cache_text_encoder_outputs.py --dataset_config path/to/toml  --text_encoder1 path/to/ckpts/text_encoder --text_encoder2 path/to/ckpts/text_encoder_2 --batch_size 16
+```
+
+その他のオプションは`python cache_text_encoder_outputs.py --help`で確認できます。
+
+`--batch_size`はVRAMに合わせて調整してください。
+
+VRAMが足りない場合（16GB程度未満の場合）は、`--fp8_llm`を指定して、fp8でLLMを実行してください。
+
+デフォルトではデータセットに含まれないキャッシュファイルは自動的に削除されます。`--keep_cache`を指定すると、キャッシュファイルを残すことができます。
+
+### 学習
+
+以下のコマンドを使用して、学習を開始します（実際には一行で入力してください）。
+
+```bash
+accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 hv_train_network.py 
+    --dit path/to/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt 
+    --dataset_config path/to/toml --sdpa --mixed_precision bf16 --fp8_base 
+    --optimizer_type adamw8bit --learning_rate 2e-4 --gradient_checkpointing 
+    --max_data_loader_n_workers 2 --persistent_data_loader_workers 
+    --network_module networks.lora --network_dim 32 
+    --timestep_sampling shift --discrete_flow_shift 7.0 
+    --max_train_epochs 16 --save_every_n_epochs 1 --seed 42
+    --output_dir path/to/output_dir --output_name name-of-lora
+```
+
+__更新__：サンプルの学習率を1e-3から2e-4に、`--timestep_sampling`を`sigmoid`から`shift`に、`--discrete_flow_shift`を1.0から7.0に変更しました。より高速な学習が期待されます。ディテールが甘くなる場合は、discrete flow shiftを3.0程度に下げてみてください。
+
+ただ、適切な学習率、学習ステップ数、timestepsの分布、loss weightingなどのパラメータは、依然として不明な点が数多くあります。情報提供をお待ちしています。
+
+その他のオプションは`python hv_train_network.py --help`で確認できます（ただし多くのオプションは動作未確認です）。
+
+`--fp8_base`を指定すると、DiTがfp8で学習されます。未指定時はmixed precisionのデータ型が使用されます。fp8は大きく消費メモリを削減できますが、品質は低下する可能性があります。`--fp8_base`を指定しない場合はVRAM 24GB以上を推奨します。また必要に応じて`--blocks_to_swap`を使用してください。
+
+VRAMが足りない場合は、`--blocks_to_swap`を指定して、一部のブロックをCPUにオフロードしてください。最大36が指定できます。
+
+（block swapのアイデアは2kpr氏の実装に基づくものです。2kpr氏にあらためて感謝します。）
+
+`--sdpa`でPyTorchのscaled dot product attentionを使用します。`--flash_attn`で[FlashAttention](https://github.com/Dao-AILab/flash-attention)を使用します。`--xformers`でxformersの利用も可能ですが、xformersを使う場合は`--split_attn`を指定してください。`--sage_attn`でSageAttentionを使用しますが、SageAttentionは現時点では学習に未対応のため、正しく動作しません。
+
+`--split_attn`を指定すると、attentionを分割して処理します。速度が多少低下しますが、VRAM使用量はわずかに減ります。
+
+学習されるLoRAの形式は、`sd-scripts`と同じです。
+
+`--show_timesteps`に`image`（`matplotlib`が必要）または`console`を指定すると、学習時のtimestepsの分布とtimestepsごとのloss weightingが確認できます。
+
+学習時のログの記録が可能です。[TensorBoard形式のログの保存と参照](./docs/advanced_config.md#save-and-view-logs-in-tensorboard-format--tensorboard形式のログの保存と参照)を参照してください。
+
+学習中のサンプル画像生成については、[こちらのドキュメント](./docs/sampling_during_training.md)を参照してください。その他の高度な設定については[こちらのドキュメント](./docs/advanced_config.md)を参照してください。
+
+### LoRAの重みのマージ
+
+注：Wan 2.1には対応していません。
+
+```bash
+python merge_lora.py \
+    --dit path/to/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt \
+    --lora_weight path/to/lora.safetensors \
+    --save_merged_model path/to/merged_model.safetensors \
+    --device cpu \
+    --lora_multiplier 1.0
+```
+
+`--device`には計算を行うデバイス（`cpu`または`cuda`等）を指定してください。`cuda`を指定すると計算が高速化されます。
+
+`--lora_weight`にはマージするLoRAの重みを、`--lora_multiplier`にはLoRAの重みの係数を、それぞれ指定してください。複数個が指定可能で、両者の数は一致させてください。
+
+### 推論
+
+以下のコマンドを使用して動画を生成します。
+
+```bash
+python hv_generate_video.py --fp8 --video_size 544 960 --video_length 5 --infer_steps 30 
+    --prompt "A cat walks on the grass, realistic style."  --save_path path/to/save/dir --output_type both 
+    --dit path/to/ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt --attn_mode sdpa --split_attn
+    --vae path/to/ckpts/hunyuan-video-t2v-720p/vae/pytorch_model.pt 
+    --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 
+    --text_encoder1 path/to/ckpts/text_encoder 
+    --text_encoder2 path/to/ckpts/text_encoder_2 
+    --seed 1234 --lora_multiplier 1.0 --lora_weight path/to/lora.safetensors
+```
+
+その他のオプションは`python hv_generate_video.py --help`で確認できます。
+
+`--fp8`を指定すると、DiTがfp8で推論されます。fp8は大きく消費メモリを削減できますが、品質は低下する可能性があります。
+
+RTX 40x0シリーズのGPUを使用している場合は、`--fp8_fast`オプションを指定することで、高速推論が可能です。このオプションを指定する場合は、`--fp8`も指定してください。
+
+VRAMが足りない場合は、`--blocks_to_swap`を指定して、一部のブロックをCPUにオフロードしてください。最大38が指定できます。
+
+`--attn_mode`には`flash`、`torch`、`sageattn`、`xformers`または`sdpa`（`torch`指定時と同じ）のいずれかを指定してください。それぞれFlashAttention、scaled dot product attention、SageAttention、xformersに対応します。デフォルトは`torch`です。SageAttentionはVRAMの削減に有効です。
+
+`--split_attn`を指定すると、attentionを分割して処理します。SageAttention利用時で10%程度の高速化が見込まれます。
+
+`--output_type`には`both`、`latent`、`video`、`images`のいずれかを指定してください。`both`はlatentと動画の両方を出力します。VAEでOut of Memoryエラーが発生する場合に備えて、`both`を指定することをお勧めします。`--latent_path`に保存されたlatentを指定し、`--output_type video` （または`images`）としてスクリプトを実行すると、VAEのdecodeのみを行えます。
+
+`--seed`は省略可能です。指定しない場合はランダムなシードが使用されます。
+
+`--video_length`は「4の倍数+1」を指定してください。
+
+`--flow_shift`にタイムステップのシフト値（discrete flow shift）を指定可能です。省略時のデフォルト値は7.0で、これは推論ステップ数が50の時の推奨値です。HunyuanVideoの論文では、ステップ数50の場合は7.0、ステップ数20未満（10など）で17.0が推奨されています。
+
+`--video_path`に読み込む動画を指定すると、video2videoの推論が可能です。動画ファイルを指定するか、複数の画像ファイルが入ったディレクトリを指定してください（画像ファイルはファイル名でソートされ、各フレームとして用いられます）。`--video_length`よりも短い動画を指定するとエラーになります。`--strength`で強度を指定できます。0~1.0で指定でき、大きいほど元の動画からの変化が大きくなります。
+
+なおvideo2video推論の処理は実験的なものです。
+
+`--compile`オプションでPyTorchのコンパイル機能を有効にします（実験的機能）。tritonのインストールが必要です。また、WindowsではVisual C++ build toolsが必要で、かつPyTorch>=2.6.0でのみ動作します。`--compile_args`でコンパイル時の引数を渡すことができます。
+
+`--compile`は初回実行時にかなりの時間がかかりますが、2回目以降は高速化されます。
+
+`--save_merged_model`オプションで、LoRAマージ後のDiTモデルを保存できます。`--save_merged_model path/to/merged_model.safetensors`のように指定してください。なおこのオプションを指定すると推論は行われません。
+
+### SkyReels V1での推論
+
+SkyReels V1のT2VとI2Vモデルがサポートされています（推論のみ）。
+
+モデルは[こちら](https://huggingface.co/Kijai/SkyReels-V1-Hunyuan_comfy)からダウンロードできます。モデルを提供してくださったKijai氏に感謝します。`skyreels_hunyuan_i2v_bf16.safetensors`がI2Vモデル、`skyreels_hunyuan_t2v_bf16.safetensors`がT2Vモデルです。`bf16`以外の形式は未検証です（`fp8_e4m3fn`は動作するかもしれません）。
+
+T2V推論を行う場合、以下のオプションを推論コマンドに追加してください：
+
+```bash
+--guidance_scale 6.0 --embedded_cfg_scale 1.0 --negative_prompt "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion" --split_uncond
+```
+
+SkyReels V1はclassifier free guidance（ネガティブプロンプト）を必要とするようです。`--guidance_scale`はネガティブプロンプトのガイダンススケールです。公式リポジトリの推奨値は6.0です。デフォルトは1.0で、この場合はclassifier free guidanceは使用されません（ネガティブプロンプトは無視されます）。
+
+`--embedded_cfg_scale`は埋め込みガイダンスのスケールです。公式リポジトリの推奨値は1.0です（埋め込みガイダンスなしを意味すると思われます）。
+
+`--negative_prompt`はいわゆるネガティブプロンプトです。上記のサンプルは公式リポジトリのものです。`--guidance_scale`を指定し、`--negative_prompt`を指定しなかった場合は、空文字列が使用されます。
+
+`--split_uncond`を指定すると、モデル呼び出しをuncondとcond（ネガティブプロンプトとプロンプト）に分割します。VRAM使用量が減りますが、推論速度は低下する可能性があります。`--split_attn`が指定されている場合、`--split_uncond`は自動的に有効になります。
+
+### LoRAの形式の変換
+
+ComfyUIで使用可能な形式（Diffusion-pipeと思われる）への変換は以下のコマンドで行えます。
+
+```bash
+python convert_lora.py --input path/to/musubi_lora.safetensors --output path/to/another_format.safetensors --target other
+```
+
+`--input`と`--output`はそれぞれ入力と出力のファイルパスを指定してください。
+
+`--target`には`other`を指定してください。`default`を指定すると、他の形式から当リポジトリの形式に変換できます。
+
+Wan2.1も対応済みです。
+
+## その他
+
+### SageAttentionのインストール方法
+
+sdbds氏によるWindows対応のSageAttentionのwheelが https://github.com/sdbds/SageAttention-for-windows で公開されています。triton をインストールし、Python、PyTorch、CUDAのバージョンが一致する場合は、[Releases](https://github.com/sdbds/SageAttention-for-windows/releases)からビルド済みwheelをダウンロードしてインストールすることが可能です。sdbds氏に感謝します。
+
+参考までに、以下は、SageAttentionをビルドしインストールするための簡単な手順です。Microsoft Visual C++ 再頒布可能パッケージを最新にする必要があるかもしれません。
+
+1. Pythonのバージョンに応じたtriton 3.1.0のwheelを[こちら](https://github.com/woct0rdho/triton-windows/releases/tag/v3.1.0-windows.post5)からダウンロードしてインストールします。
+
+2. Microsoft Visual Studio 2022かBuild Tools for Visual Studio 2022を、C++のビルドができるよう設定し、インストールします。（上のRedditの投稿を参照してください）。
+
+3. 任意のフォルダにSageAttentionのリポジトリをクローンします。
+    ```shell
+    git clone https://github.com/thu-ml/SageAttention.git
+    ```
+
+    なお `git clone https://github.com/sdbds/SageAttention-for-windows.git` で、前述のsdbds氏のリポジトリを使用することで、手順4.を省略できます。
+
+4. `SageAttention/csrc`フォルダ内の`math.cuh`を開き、71行目と146行目の `ushort` を `unsigned short` に変更して保存します。
+
+5. スタートメニューから Visual Studio 2022 内の `x64 Native Tools Command Prompt for VS 2022` を選択してコマンドプロンプトを開きます。
+
+6. venvを有効にし、SageAttentionのフォルダに移動して以下のコマンドを実行します。DISTUTILSが設定されていない、のようなエラーが出た場合は `set DISTUTILS_USE_SDK=1`としてから再度実行してください。
+    ```shell
+    python setup.py install
+    ```
+
+以上でSageAttentionのインストールが完了です。
+
+### PyTorchのバージョンについて
+
+`--attn_mode`に`torch`を指定する場合、2.5.1以降のPyTorchを使用してください（それより前のバージョンでは生成される動画が真っ黒になるようです）。
+
+古いバージョンを使う場合、xformersやSageAttentionを使用してください。
+
+## 免責事項
+
+このリポジトリは非公式であり、公式のHunyuanVideoリポジトリとは関係ありません。また、このリポジトリは開発中で、実験的なものです。テストおよびフィードバックを歓迎しますが、以下の点にご注意ください：
+
+- 実際の稼働環境での動作を意図したものではありません
+- 機能やAPIは予告なく変更されることがあります
+- いくつもの機能が未検証です
+- 動画学習機能はまだ開発中です
+
+問題やバグについては、以下の情報とともにIssueを作成してください：
+
+- 問題の詳細な説明
+- 再現手順
+- 環境の詳細（OS、GPU、VRAM、Pythonバージョンなど）
+- 関連するエラーメッセージやログ
+
+## コントリビューションについて
+
+コントリビューションを歓迎します。ただし、以下にご注意ください：
+
+- メンテナーのリソースが限られているため、PRのレビューやマージには時間がかかる場合があります
+- 大きな変更に取り組む前には、議論のためのIssueを作成してください
+- PRに関して：
+    - 変更は焦点を絞り、適度なサイズにしてください
+    - 明確な説明をお願いします
+    - 既存のコードスタイルに従ってください
+    - ドキュメントが更新されていることを確認してください
+
+## ライセンス
+
+`hunyuan_model`ディレクトリ以下のコードは、[HunyuanVideo](https://github.com/Tencent/HunyuanVideo)のコードを一部改変して使用しているため、そちらのライセンスに従います。
+
+`wan`ディレクトリ以下のコードは、[Wan2.1](https://github.com/Wan-Video/Wan2.1)のコードを一部改変して使用しています。ライセンスはApache License 2.0です。
+
+他のコードはApache License 2.0に従います。一部Diffusersのコードをコピー、改変して使用しています。
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..de2314ddec2c877b2241a5281dc9e0b09a84c8e8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,64 @@
+# Simple GUI for [Musubi Tuner](https://github.com/kohya-ss/musubi-tuner) (Wan 2.1 models only)
+
+
+# How to use GUI
+
+- Download the repository by running in the command line:
+`git clone https://github.com/Kvento/musubi-tuner-wan-gui`
+
+- To open the GUI just run `Start_Wan_GUI.bat`.
+- All settings can be saved and loaded using the "**Load Settings**" and "**Save Setting**" buttons.
+- For more information about the settings, see the [Wan2.1 documentation](./docs/wan.md), [Advanced Configuration](./docs/advanced_config.md#fp8-quantization), and the [Dataset configuration guide](./dataset/dataset_config.md).
+
+
+![Preview](docs/Preview.png)
+
+
+
+
+
+# Miscellaneous
+
+
+## SageAttention Installation
+
+sdbds has provided a Windows-compatible SageAttention implementation and pre-built wheels here: https://github.com/sdbds/SageAttention-for-windows. After installing triton, if your Python, PyTorch, and CUDA versions match, you can download and install the pre-built wheel from the [Releases](https://github.com/sdbds/SageAttention-for-windows/releases) page. Thanks to sdbds for this contribution.
+
+For reference, the build and installation instructions are as follows. You may need to update Microsoft Visual C++ Redistributable to the latest version.
+
+1. Download and install triton 3.1.0 wheel matching your Python version from [here](https://github.com/woct0rdho/triton-windows/releases/tag/v3.1.0-windows.post5).
+
+2. Install Microsoft Visual Studio 2022 or Build Tools for Visual Studio 2022, configured for C++ builds.
+
+3. Clone the SageAttention repository in your preferred directory:
+    ```shell
+    git clone https://github.com/thu-ml/SageAttention.git
+    ```
+
+    You can skip step 4 by cloning the sdbds repository mentioned above instead: `git clone https://github.com/sdbds/SageAttention-for-windows.git`.
+
+4. Open `math.cuh` in the `SageAttention/csrc` folder and change `ushort` to `unsigned short` on lines 71 and 146, then save.
+
+5. Open `x64 Native Tools Command Prompt for VS 2022` from the Start menu under Visual Studio 2022.
+
+6. Activate your venv, navigate to the SageAttention folder, and run the following command. If you get a "DISTUTILS not configured" error, run `set DISTUTILS_USE_SDK=1` and try again:
+    ```shell
+    python setup.py install
+    ```
+
+This completes the SageAttention installation.
+
+### PyTorch version
+
+If you specify `torch` for `--attn_mode`, use PyTorch 2.5.1 or later (earlier versions may result in black videos).
+
+If you use an earlier version, use xformers or SageAttention.
+
+
+# License
+
+Code under the `hunyuan_model` directory is modified from [HunyuanVideo](https://github.com/Tencent/HunyuanVideo) and follows their license.
+
+Code under the `wan` directory is modified from [Wan2.1](https://github.com/Wan-Video/Wan2.1) and is licensed under the Apache License 2.0.
+
+Other code is under the Apache License 2.0. Some code is copied and modified from Diffusers.
diff --git a/Start_Wan_GUI.bat b/Start_Wan_GUI.bat
new file mode 100644
index 0000000000000000000000000000000000000000..691b977cbcac2fb6752b4c43b14ad3b629c42713
--- /dev/null
+++ b/Start_Wan_GUI.bat
@@ -0,0 +1,54 @@
+@echo off
+setlocal
+
+:: Specify the path to your Python script
+set SCRIPT_PATH=wan_lora_trainer_gui.py
+
+:: Check if Python is installed
+echo Checking for Python...
+python --version >nul 2>&1
+if %errorlevel% neq 0 (
+    echo Python not found. Automatic installation is not possible via bat file.
+    echo Please install Python manually from the official website: https://www.python.org/
+    pause
+    exit /b 1
+)
+
+:: Check for pip. Nested checks use "if errorlevel 1": %errorlevel% inside a (...) block is expanded when the block is parsed, not when it runs
+echo Checking for pip...
+python -m ensurepip >nul 2>&1
+python -m pip --version >nul 2>&1
+if %errorlevel% neq 0 (
+    echo pip not found. Installing pip...
+    python -m ensurepip --upgrade
+    python -m pip install --upgrade pip
+    if errorlevel 1 (
+        echo Failed to install pip. Please check your Python installation.
+        pause
+        exit /b 1
+    )
+)
+
+:: Check for tkinter (the nested check uses "if errorlevel 1" for the same parse-time expansion reason)
+echo Checking for tkinter...
+python -c "import tkinter" >nul 2>&1
+if %errorlevel% neq 0 (
+    echo tkinter module not found. Attempting to install...
+    python -m pip install tk
+    if errorlevel 1 (
+        echo Failed to install tkinter. There might be an issue with permissions.
+        pause
+        exit /b 1
+    )
+)
+
+:: Run the script in a minimized window
+echo All dependencies are installed. Running the script...
+start /min python %SCRIPT_PATH%
+if %errorlevel% neq 0 (
+    echo An error occurred while running the script.
+    pause
+    exit /b 1
+)
+
+echo Script executed successfully.
\ No newline at end of file
diff --git a/cache_latents.py b/cache_latents.py
new file mode 100644
index 0000000000000000000000000000000000000000..b168337e5f8fb1147de548698386ef7833eda1dc
--- /dev/null
+++ b/cache_latents.py
@@ -0,0 +1,281 @@
+import argparse
+import os
+import glob
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from dataset import config_utils
+from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
+from PIL import Image
+
+import logging
+
+from dataset.image_video_dataset import BaseDataset, ItemInfo, save_latent_cache, ARCHITECTURE_HUNYUAN_VIDEO
+from hunyuan_model.vae import load_vae
+from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
+from utils.model_utils import str_to_dtype
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def show_image(image: Union[list[Union[Image.Image, np.ndarray], Union[Image.Image, np.ndarray]]]) -> int:
+    import cv2
+
+    imgs = (
+        [image]
+        if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image)
+        else [image[0], image[-1]]
+    )
+    if len(imgs) > 1:
+        print(f"Number of images: {len(image)}")
+    for i, img in enumerate(imgs):
+        if len(imgs) > 1:
+            print(f"{'First' if i == 0 else 'Last'} image: {img.shape}")
+        else:
+            print(f"Image: {img.shape}")
+        cv2_img = np.array(img) if isinstance(img, Image.Image) else img
+        cv2_img = cv2.cvtColor(cv2_img, cv2.COLOR_RGB2BGR)
+        cv2.imshow("image", cv2_img)
+        k = cv2.waitKey(0)
+        cv2.destroyAllWindows()
+        if k == ord("q") or k == ord("d"):
+            return k
+    return k
+
+
+def show_console(
+    image: Union[list[Union[Image.Image, np.ndarray], Union[Image.Image, np.ndarray]]],
+    width: int,
+    back: str,
+    interactive: bool = False,
+) -> int:
+    from ascii_magic import from_pillow_image, Back
+
+    back = None
+    if back is not None:
+        back = getattr(Back, back.upper())
+
+    k = None
+    imgs = (
+        [image]
+        if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image)
+        else [image[0], image[-1]]
+    )
+    if len(imgs) > 1:
+        print(f"Number of images: {len(image)}")
+    for i, img in enumerate(imgs):
+        if len(imgs) > 1:
+            print(f"{'First' if i == 0 else 'Last'} image: {img.shape}")
+        else:
+            print(f"Image: {img.shape}")
+        pil_img = img if isinstance(img, Image.Image) else Image.fromarray(img)
+        ascii_img = from_pillow_image(pil_img)
+        ascii_img.to_terminal(columns=width, back=back)
+
+        if interactive:
+            k = input("Press q to quit, d to next dataset, other key to next: ")
+            if k == "q" or k == "d":
+                return ord(k)
+
+    if not interactive:
+        return ord(" ")
+    return ord(k) if k else ord(" ")
+
+
+def show_datasets(
+    datasets: list[BaseDataset], debug_mode: str, console_width: int, console_back: str, console_num_images: Optional[int]
+):
+    print(f"d: next dataset, q: quit")
+
+    num_workers = max(1, os.cpu_count() - 1)
+    for i, dataset in enumerate(datasets):
+        print(f"Dataset [{i}]")
+        batch_index = 0
+        num_images_to_show = console_num_images
+        k = None
+        for key, batch in dataset.retrieve_latent_cache_batches(num_workers):
+            print(f"bucket resolution: {key}, count: {len(batch)}")
+            for j, item_info in enumerate(batch):
+                item_info: ItemInfo
+                print(f"{batch_index}-{j}: {item_info}")
+                if debug_mode == "image":
+                    k = show_image(item_info.content)
+                elif debug_mode == "console":
+                    k = show_console(item_info.content, console_width, console_back, console_num_images is None)
+                    if num_images_to_show is not None:
+                        num_images_to_show -= 1
+                        if num_images_to_show == 0:
+                            k = ord("d")  # next dataset
+
+                if k == ord("q"):
+                    return
+                elif k == ord("d"):
+                    break
+            if k == ord("d"):
+                break
+            batch_index += 1
+
+
+def encode_and_save_batch(vae: AutoencoderKLCausal3D, batch: list[ItemInfo]):
+    contents = torch.stack([torch.from_numpy(item.content) for item in batch])
+    if len(contents.shape) == 4:
+        contents = contents.unsqueeze(1)  # B, H, W, C -> B, F, H, W, C
+
+    contents = contents.permute(0, 4, 1, 2, 3).contiguous()  # B, C, F, H, W
+    contents = contents.to(vae.device, dtype=vae.dtype)
+    contents = contents / 127.5 - 1.0  # normalize to [-1, 1]
+
+    h, w = contents.shape[3], contents.shape[4]
+    if h < 8 or w < 8:
+        item = batch[0]  # other items should have the same size
+        raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
+
+    # print(f"encode batch: {contents.shape}")
+    with torch.no_grad():
+        latent = vae.encode(contents).latent_dist.sample()
+        # latent = latent * vae.config.scaling_factor
+
+    # # debug: decode and save
+    # with torch.no_grad():
+    #     latent_to_decode = latent / vae.config.scaling_factor
+    #     images = vae.decode(latent_to_decode, return_dict=False)[0]
+    #     images = (images / 2 + 0.5).clamp(0, 1)
+    #     images = images.cpu().float().numpy()
+    #     images = (images * 255).astype(np.uint8)
+    #     images = images.transpose(0, 2, 3, 4, 1)  # B, C, F, H, W -> B, F, H, W, C
+    #     for b in range(images.shape[0]):
+    #         for f in range(images.shape[1]):
+    #             fln = os.path.splitext(os.path.basename(batch[b].item_key))[0]
+    #             img = Image.fromarray(images[b, f])
+    #             img.save(f"./logs/decode_{fln}_{b}_{f:03d}.jpg")
+
+    for item, l in zip(batch, latent):
+        # print(f"save latent cache: {item.latent_cache_path}, latent shape: {l.shape}")
+        save_latent_cache(item, l)
+
+
+def encode_datasets(datasets: list[BaseDataset], encode: callable, args: argparse.Namespace):
+    num_workers = args.num_workers if args.num_workers is not None else max(1, os.cpu_count() - 1)
+    for i, dataset in enumerate(datasets):
+        logger.info(f"Encoding dataset [{i}]")
+        all_latent_cache_paths = []
+        for _, batch in tqdm(dataset.retrieve_latent_cache_batches(num_workers)):
+            all_latent_cache_paths.extend([item.latent_cache_path for item in batch])
+
+            if args.skip_existing:
+                filtered_batch = [item for item in batch if not os.path.exists(item.latent_cache_path)]
+                if len(filtered_batch) == 0:
+                    continue
+                batch = filtered_batch
+
+            bs = args.batch_size if args.batch_size is not None else len(batch)
+            for i in range(0, len(batch), bs):
+                encode(batch[i : i + bs])
+
+        # normalize paths
+        all_latent_cache_paths = [os.path.normpath(p) for p in all_latent_cache_paths]
+        all_latent_cache_paths = set(all_latent_cache_paths)
+
+        # remove old cache files not in the dataset
+        all_cache_files = dataset.get_all_latent_cache_files()
+        for cache_file in all_cache_files:
+            if os.path.normpath(cache_file) not in all_latent_cache_paths:
+                if args.keep_cache:
+                    logger.info(f"Keep cache file not in the dataset: {cache_file}")
+                else:
+                    os.remove(cache_file)
+                    logger.info(f"Removed old cache file: {cache_file}")
+
+
+def main(args):
+    device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device(device)
+
+    # Load dataset config
+    blueprint_generator = BlueprintGenerator(ConfigSanitizer())
+    logger.info(f"Load dataset config from {args.dataset_config}")
+    user_config = config_utils.load_user_config(args.dataset_config)
+    blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_HUNYUAN_VIDEO)
+    train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
+
+    datasets = train_dataset_group.datasets
+
+    if args.debug_mode is not None:
+        show_datasets(datasets, args.debug_mode, args.console_width, args.console_back, args.console_num_images)
+        return
+
+    assert args.vae is not None, "vae checkpoint is required"
+
+    # Load VAE model: HunyuanVideo VAE model is float16
+    vae_dtype = torch.float16 if args.vae_dtype is None else str_to_dtype(args.vae_dtype)
+    vae, _, s_ratio, t_ratio = load_vae(vae_dtype=vae_dtype, device=device, vae_path=args.vae)
+    vae.eval()
+    logger.info(f"Loaded VAE: {vae.config}, dtype: {vae.dtype}")
+
+    if args.vae_chunk_size is not None:
+        vae.set_chunk_size_for_causal_conv_3d(args.vae_chunk_size)
+        logger.info(f"Set chunk_size to {args.vae_chunk_size} for CausalConv3d in VAE")
+    if args.vae_spatial_tile_sample_min_size is not None:
+        vae.enable_spatial_tiling(True)
+        vae.tile_sample_min_size = args.vae_spatial_tile_sample_min_size
+        vae.tile_latent_min_size = args.vae_spatial_tile_sample_min_size // 8
+    elif args.vae_tiling:
+        vae.enable_spatial_tiling(True)
+
+    # Encode images
+    def encode(one_batch: list[ItemInfo]):
+        encode_and_save_batch(vae, one_batch)
+
+    encode_datasets(datasets, encode, args)
+
+
+def setup_parser_common() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dataset_config", type=str, required=True, help="path to dataset config .toml file")
+    parser.add_argument("--vae", type=str, required=False, default=None, help="path to vae checkpoint")
+    parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is float16")
+    parser.add_argument("--device", type=str, default=None, help="device to use, default is cuda if available")
+    parser.add_argument(
+        "--batch_size", type=int, default=None, help="batch size, override dataset config if dataset batch size > this"
+    )
+    parser.add_argument("--num_workers", type=int, default=None, help="number of workers for dataset. default is cpu count-1")
+    parser.add_argument("--skip_existing", action="store_true", help="skip existing cache files")
+    parser.add_argument("--keep_cache", action="store_true", help="keep cache files not in dataset")
+    parser.add_argument("--debug_mode", type=str, default=None, choices=["image", "console"], help="debug mode")
+    parser.add_argument("--console_width", type=int, default=80, help="debug mode: console width")
+    parser.add_argument(
+        "--console_back", type=str, default=None, help="debug mode: console background color, one of ascii_magic.Back"
+    )
+    parser.add_argument(
+        "--console_num_images",
+        type=int,
+        default=None,
+        help="debug mode: not interactive, number of images to show for each dataset",
+    )
+    return parser
+
+
+def hv_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    parser.add_argument(
+        "--vae_tiling",
+        action="store_true",
+        help="enable spatial tiling for VAE, default is False. If vae_spatial_tile_sample_min_size is set, this is automatically enabled",
+    )
+    parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
+    parser.add_argument(
+        "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    # Script entry point: combine the common options with the VAE-specific ones, then run the caching
+    parser = setup_parser_common()
+    parser = hv_setup_parser(parser)
+
+    args = parser.parse_args()
+    main(args)
diff --git a/cache_text_encoder_outputs.py b/cache_text_encoder_outputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f2de3dfb9ea42fb0979e7f6bc7ec4b92d70496b
--- /dev/null
+++ b/cache_text_encoder_outputs.py
@@ -0,0 +1,214 @@
+import argparse
+import os
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from dataset import config_utils
+from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
+import accelerate
+
+from dataset.image_video_dataset import ARCHITECTURE_HUNYUAN_VIDEO, BaseDataset, ItemInfo, save_text_encoder_output_cache
+from hunyuan_model import text_encoder as text_encoder_module
+from hunyuan_model.text_encoder import TextEncoder
+
+import logging
+
+from utils.model_utils import str_to_dtype
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def encode_prompt(text_encoder: TextEncoder, prompt: Union[str, list[str]]):
+    data_type = "video"  # video only, image is not supported
+    text_inputs = text_encoder.text2tokens(prompt, data_type=data_type)
+
+    with torch.no_grad():
+        prompt_outputs = text_encoder.encode(text_inputs, data_type=data_type)
+
+    return prompt_outputs.hidden_state, prompt_outputs.attention_mask
+
+
+def encode_and_save_batch(
+    text_encoder: TextEncoder, batch: list[ItemInfo], is_llm: bool, accelerator: Optional[accelerate.Accelerator]
+):
+    prompts = [item.caption for item in batch]
+    # print(prompts)
+
+    # encode prompt
+    if accelerator is not None:
+        with accelerator.autocast():
+            prompt_embeds, prompt_mask = encode_prompt(text_encoder, prompts)
+    else:
+        prompt_embeds, prompt_mask = encode_prompt(text_encoder, prompts)
+
+    # # convert to fp16 if needed
+    # if prompt_embeds.dtype == torch.float32 and text_encoder.dtype != torch.float32:
+    #     prompt_embeds = prompt_embeds.to(text_encoder.dtype)
+
+    # save prompt cache
+    for item, embed, mask in zip(batch, prompt_embeds, prompt_mask):
+        save_text_encoder_output_cache(item, embed, mask, is_llm)
+
+
+def prepare_cache_files_and_paths(datasets: list[BaseDataset]):
+    all_cache_files_for_dataset = []  # exisiting cache files
+    all_cache_paths_for_dataset = []  # all cache paths in the dataset
+    for dataset in datasets:
+        all_cache_files = [os.path.normpath(file) for file in dataset.get_all_text_encoder_output_cache_files()]
+        all_cache_files = set(all_cache_files)
+        all_cache_files_for_dataset.append(all_cache_files)
+
+        all_cache_paths_for_dataset.append(set())
+    return all_cache_files_for_dataset, all_cache_paths_for_dataset
+
+
+def process_text_encoder_batches(
+    num_workers: Optional[int],
+    skip_existing: bool,
+    batch_size: int,
+    datasets: list[BaseDataset],
+    all_cache_files_for_dataset: list[set],
+    all_cache_paths_for_dataset: list[set],
+    encode: callable,
+):
+    num_workers = num_workers if num_workers is not None else max(1, os.cpu_count() - 1)
+    for i, dataset in enumerate(datasets):
+        logger.info(f"Encoding dataset [{i}]")
+        all_cache_files = all_cache_files_for_dataset[i]
+        all_cache_paths = all_cache_paths_for_dataset[i]
+        for batch in tqdm(dataset.retrieve_text_encoder_output_cache_batches(num_workers)):
+            # update cache files (it's ok if we update it multiple times)
+            all_cache_paths.update([os.path.normpath(item.text_encoder_output_cache_path) for item in batch])
+
+            # skip existing cache files
+            if skip_existing:
+                filtered_batch = [
+                    item for item in batch if not os.path.normpath(item.text_encoder_output_cache_path) in all_cache_files
+                ]
+                # print(f"Filtered {len(batch) - len(filtered_batch)} existing cache files")
+                if len(filtered_batch) == 0:
+                    continue
+                batch = filtered_batch
+
+            bs = batch_size if batch_size is not None else len(batch)
+            for i in range(0, len(batch), bs):
+                encode(batch[i : i + bs])
+
+
+def post_process_cache_files(
+    datasets: list[BaseDataset], all_cache_files_for_dataset: list[set], all_cache_paths_for_dataset: list[set]
+):
+    for i, dataset in enumerate(datasets):
+        all_cache_files = all_cache_files_for_dataset[i]
+        all_cache_paths = all_cache_paths_for_dataset[i]
+        for cache_file in all_cache_files:
+            if cache_file not in all_cache_paths:
+                if args.keep_cache:
+                    logger.info(f"Keep cache file not in the dataset: {cache_file}")
+                else:
+                    os.remove(cache_file)
+                    logger.info(f"Removed old cache file: {cache_file}")
+
+
+def main(args):
+    """Cache the outputs of both text encoders for the datasets in ``args.dataset_config``.
+
+    Text Encoder 1 (LLM) is loaded, run over all datasets and deleted before
+    Text Encoder 2 is loaded and run. Finally, existing cache files that no dataset
+    item refers to are handed to ``post_process_cache_files``.
+    """
+    device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device(device)
+
+    # Load dataset config
+    blueprint_generator = BlueprintGenerator(ConfigSanitizer())
+    logger.info(f"Load dataset config from {args.dataset_config}")
+    user_config = config_utils.load_user_config(args.dataset_config)
+    blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_HUNYUAN_VIDEO)
+    train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
+
+    datasets = train_dataset_group.datasets
+
+    # define accelerator for fp8 inference (only Text Encoder 1 receives it below)
+    accelerator = None
+    if args.fp8_llm:
+        accelerator = accelerate.Accelerator(mixed_precision="fp16")
+
+    # prepare cache files and paths: all_cache_files_for_dataset = existing cache files, all_cache_paths_for_dataset = all cache paths in the dataset
+    all_cache_files_for_dataset, all_cache_paths_for_dataset = prepare_cache_files_and_paths(datasets)
+
+    # Load Text Encoder 1 (dtype defaults to float16 when --text_encoder_dtype is not given)
+    text_encoder_dtype = torch.float16 if args.text_encoder_dtype is None else str_to_dtype(args.text_encoder_dtype)
+    logger.info(f"loading text encoder 1: {args.text_encoder1}")
+    text_encoder_1 = text_encoder_module.load_text_encoder_1(args.text_encoder1, device, args.fp8_llm, text_encoder_dtype)
+    text_encoder_1.to(device=device)
+
+    # Encode with Text Encoder 1 (LLM)
+    logger.info("Encoding with Text Encoder 1")
+
+    def encode_for_text_encoder_1(batch: list[ItemInfo]):
+        encode_and_save_batch(text_encoder_1, batch, is_llm=True, accelerator=accelerator)
+
+    process_text_encoder_batches(
+        args.num_workers,
+        args.skip_existing,
+        args.batch_size,
+        datasets,
+        all_cache_files_for_dataset,
+        all_cache_paths_for_dataset,
+        encode_for_text_encoder_1,
+    )
+    # drop the local reference to the first encoder before loading the second one
+    del text_encoder_1
+
+    # Load Text Encoder 2
+    logger.info(f"loading text encoder 2: {args.text_encoder2}")
+    text_encoder_2 = text_encoder_module.load_text_encoder_2(args.text_encoder2, device, text_encoder_dtype)
+    text_encoder_2.to(device=device)
+
+    # Encode with Text Encoder 2 (no accelerator/autocast for this encoder)
+    logger.info("Encoding with Text Encoder 2")
+
+    def encode_for_text_encoder_2(batch: list[ItemInfo]):
+        encode_and_save_batch(text_encoder_2, batch, is_llm=False, accelerator=None)
+
+    process_text_encoder_batches(
+        args.num_workers,
+        args.skip_existing,
+        args.batch_size,
+        datasets,
+        all_cache_files_for_dataset,
+        all_cache_paths_for_dataset,
+        encode_for_text_encoder_2,
+    )
+    del text_encoder_2
+
+    # remove cache files not in dataset
+    post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset)
+
+
+def setup_parser_common():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dataset_config", type=str, required=True, help="path to dataset config .toml file")
+    parser.add_argument("--device", type=str, default=None, help="device to use, default is cuda if available")
+    parser.add_argument(
+        "--batch_size", type=int, default=None, help="batch size, override dataset config if dataset batch size > this"
+    )
+    parser.add_argument("--num_workers", type=int, default=None, help="number of workers for dataset. default is cpu count-1")
+    parser.add_argument("--skip_existing", action="store_true", help="skip existing cache files")
+    parser.add_argument("--keep_cache", action="store_true", help="keep cache files not in dataset")
+    return parser
+
+
+def hv_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory")
+    parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory")
+    parser.add_argument("--text_encoder_dtype", type=str, default=None, help="data type for Text Encoder, default is float16")
+    parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
+    return parser
+
+
+if __name__ == "__main__":
+    # Script entry point: combine the common options with the text encoder ones, then run the caching.
+    parser = setup_parser_common()
+    parser = hv_setup_parser(parser)
+
+    # NOTE: `args` becomes a module-level global here; post_process_cache_files reads args.keep_cache from it
+    args = parser.parse_args()
+    main(args)
diff --git a/convert_lora.py b/convert_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ba849067cf45ac7ebdd825fbe22c1ce01d7d287
--- /dev/null
+++ b/convert_lora.py
@@ -0,0 +1,131 @@
+import argparse
+import torch
+from safetensors.torch import load_file, save_file
+from safetensors import safe_open
+from utils import model_utils
+import logging
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def convert_from_diffusers(prefix, weights_sd):
+    # convert from diffusers(?) to default LoRA
+    # Diffusers format: {"diffusion_model.module.name.lora_A.weight": weight, "diffusion_model.module.name.lora_B.weight": weight, ...}
+    # default LoRA format: {"prefix_module_name.lora_down.weight": weight, "prefix_module_name.lora_up.weight": weight, ...}
+
+    # note: Diffusers has no alpha, so alpha is set to rank
+    new_weights_sd = {}
+    lora_dims = {}
+    for key, weight in weights_sd.items():
+        diffusers_prefix, key_body = key.split(".", 1)
+        if diffusers_prefix != "diffusion_model" and diffusers_prefix != "transformer":
+            logger.warning(f"unexpected key: {key} in diffusers format")
+            continue
+
+        new_key = f"{prefix}{key_body}".replace(".", "_").replace("_lora_A_", ".lora_down.").replace("_lora_B_", ".lora_up.")
+        new_weights_sd[new_key] = weight
+
+        lora_name = new_key.split(".")[0]  # before first dot
+        if lora_name not in lora_dims and "lora_down" in new_key:
+            lora_dims[lora_name] = weight.shape[0]
+
+    # add alpha with rank
+    for lora_name, dim in lora_dims.items():
+        new_weights_sd[f"{lora_name}.alpha"] = torch.tensor(dim)
+    return new_weights_sd
+
+
+def convert_to_diffusers(prefix, weights_sd):
+    # convert from default LoRA to diffusers
+
+    # get alphas
+    lora_alphas = {}
+    for key, weight in weights_sd.items():
+        if key.startswith(prefix):
+            lora_name = key.split(".", 1)[0]  # before first dot
+            if lora_name not in lora_alphas and "alpha" in key:
+                lora_alphas[lora_name] = weight
+
+    new_weights_sd = {}
+    for key, weight in weights_sd.items():
+        if key.startswith(prefix):
+            if "alpha" in key:
+                continue
+
+            lora_name = key.split(".", 1)[0]  # before first dot
+
+            module_name = lora_name[len(prefix) :]  # remove prefix
+            module_name = module_name.replace("_", ".")  # replace "_" with "."
+            if ".cross.attn." in module_name or ".self.attn." in module_name:
+                # Wan2.1 lora name to module name: ugly but works
+                module_name = module_name.replace("cross.attn", "cross_attn")
+                module_name = module_name.replace("self.attn", "self_attn")
+                module_name = module_name.replace("k.img", "k_img")
+                module_name = module_name.replace("v.img", "v_img")
+            else:
+                # HunyuanVideo lora name to module name: ugly but works
+                module_name = module_name.replace("double.blocks.", "double_blocks.")
+                module_name = module_name.replace("single.blocks.", "single_blocks.")
+                module_name = module_name.replace("img.", "img_")
+                module_name = module_name.replace("txt.", "txt_")
+                module_name = module_name.replace("attn.", "attn_")
+            diffusers_prefix = "diffusion_model"
+            if "lora_down" in key:
+                new_key = f"{diffusers_prefix}.{module_name}.lora_A.weight"
+                dim = weight.shape[0]
+            elif "lora_up" in key:
+                new_key = f"{diffusers_prefix}.{module_name}.lora_B.weight"
+                dim = weight.shape[1]
+            else:
+                logger.warning(f"unexpected key: {key} in default LoRA format")
+                continue
+
+            # scale weight by alpha using float16
+            if lora_name in lora_alphas:
+                scale = lora_alphas[lora_name].half() / dim
+                scale = scale.sqrt()
+                weight = weight.half() * scale
+            else:
+                logger.warning(f"missing alpha for {lora_name}")
+
+            new_weights_sd[new_key] = weight
+
+    return new_weights_sd
+
+
+def convert(input_file, output_file, target_format):
+    logger.info(f"loading {input_file}")
+    weights_sd = load_file(input_file)
+    with safe_open(input_file, framework="pt") as f:
+        metadata = f.metadata()
+
+    logger.info(f"converting to {target_format}")
+    prefix = "lora_unet_"
+    if target_format == "default":
+        new_weights_sd = convert_from_diffusers(prefix, weights_sd)
+        metadata = metadata or {}
+        model_utils.precalculate_safetensors_hashes(new_weights_sd, metadata)
+    elif target_format == "other":
+        new_weights_sd = convert_to_diffusers(prefix, weights_sd)
+    else:
+        raise ValueError(f"unknown target format: {target_format}")
+
+    logger.info(f"saving to {output_file}")
+    save_file(new_weights_sd, output_file, metadata=metadata)
+
+    logger.info("done")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Convert LoRA weights between default and other formats")
+    parser.add_argument("--input", type=str, required=True, help="input model file")
+    parser.add_argument("--output", type=str, required=True, help="output model file")
+    parser.add_argument("--target", type=str, required=True, choices=["other", "default"], help="target format")
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the CLI options and run the conversion
+    args = parse_args()
+    convert(args.input, args.output, args.target)
diff --git a/dataset/__init__.py b/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/dataset/config_utils.py b/dataset/config_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f211a335a911c9d22c566fc5b44246277886342
--- /dev/null
+++ b/dataset/config_utils.py
@@ -0,0 +1,372 @@
+import argparse
+from dataclasses import (
+    asdict,
+    dataclass,
+)
+import functools
+import random
+from textwrap import dedent, indent
+import json
+from pathlib import Path
+
+# from toolz import curry
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import toml
+import voluptuous
+from voluptuous import Any, ExactSequence, MultipleInvalid, Object, Schema
+
+from .image_video_dataset import DatasetGroup, ImageDataset, VideoDataset
+
+import logging
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+@dataclass
+class BaseDatasetParams:
+    """Dataset options shared by image and video datasets, with their default values."""
+
+    resolution: Tuple[int, int] = (960, 544)
+    enable_bucket: bool = False
+    bucket_no_upscale: bool = False
+    caption_extension: Optional[str] = None
+    batch_size: int = 1
+    num_repeats: int = 1
+    cache_directory: Optional[str] = None
+    debug_dataset: bool = False
+    # placeholder default; callers are expected to supply a real value
+    architecture: str = "no_default"  # short style like "hv" or "wan"
+
+
+@dataclass
+class ImageDatasetParams(BaseDatasetParams):
+    """Options of an image dataset: the base options plus the image source settings."""
+
+    image_directory: Optional[str] = None
+    image_jsonl_file: Optional[str] = None
+
+
+@dataclass
+class VideoDatasetParams(BaseDatasetParams):
+    """Options of a video dataset: the base options plus video source and frame settings."""
+
+    video_directory: Optional[str] = None
+    video_jsonl_file: Optional[str] = None
+    # the presence of "target_frames" in a dataset config is what marks it as a video dataset
+    target_frames: Sequence[int] = (1,)
+    frame_extraction: Optional[str] = "head"
+    frame_stride: Optional[int] = 1
+    frame_sample: Optional[int] = 1
+
+
+@dataclass
+class DatasetBlueprint:
+    """Resolved parameters of one dataset together with a flag telling its kind."""
+
+    # True selects ImageDataset, False selects VideoDataset when the dataset is built
+    is_image_dataset: bool
+    params: Union[ImageDatasetParams, VideoDatasetParams]
+
+
+@dataclass
+class DatasetGroupBlueprint:
+    """Ordered collection of the dataset blueprints that form one dataset group."""
+
+    datasets: Sequence[DatasetBlueprint]
+
+
+@dataclass
+class Blueprint:
+    """Top-level result of ``BlueprintGenerator.generate``; wraps the dataset group blueprint."""
+
+    dataset_group: DatasetGroupBlueprint
+
+
+class ConfigSanitizer:
+    # @curry
+    @staticmethod
+    def __validate_and_convert_twodim(klass, value: Sequence) -> Tuple:
+        Schema(ExactSequence([klass, klass]))(value)
+        return tuple(value)
+
+    # @curry
+    @staticmethod
+    def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]) -> Tuple:
+        Schema(Any(klass, ExactSequence([klass, klass])))(value)
+        try:
+            Schema(klass)(value)
+            return (value, value)
+        except:
+            return ConfigSanitizer.__validate_and_convert_twodim(klass, value)
+
+    # datasets schema
+    DATASET_ASCENDABLE_SCHEMA = {
+        "caption_extension": str,
+        "batch_size": int,
+        "num_repeats": int,
+        "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int),
+        "enable_bucket": bool,
+        "bucket_no_upscale": bool,
+    }
+    IMAGE_DATASET_DISTINCT_SCHEMA = {
+        "image_directory": str,
+        "image_jsonl_file": str,
+        "cache_directory": str,
+    }
+    VIDEO_DATASET_DISTINCT_SCHEMA = {
+        "video_directory": str,
+        "video_jsonl_file": str,
+        "target_frames": [int],
+        "frame_extraction": str,
+        "frame_stride": int,
+        "frame_sample": int,
+        "cache_directory": str,
+    }
+
+    # options handled by argparse but not handled by user config
+    ARGPARSE_SPECIFIC_SCHEMA = {
+        "debug_dataset": bool,
+    }
+
+    def __init__(self) -> None:
+        self.image_dataset_schema = self.__merge_dict(
+            self.DATASET_ASCENDABLE_SCHEMA,
+            self.IMAGE_DATASET_DISTINCT_SCHEMA,
+        )
+        self.video_dataset_schema = self.__merge_dict(
+            self.DATASET_ASCENDABLE_SCHEMA,
+            self.VIDEO_DATASET_DISTINCT_SCHEMA,
+        )
+
+        def validate_flex_dataset(dataset_config: dict):
+            if "target_frames" in dataset_config:
+                return Schema(self.video_dataset_schema)(dataset_config)
+            else:
+                return Schema(self.image_dataset_schema)(dataset_config)
+
+        self.dataset_schema = validate_flex_dataset
+
+        self.general_schema = self.__merge_dict(
+            self.DATASET_ASCENDABLE_SCHEMA,
+        )
+        self.user_config_validator = Schema(
+            {
+                "general": self.general_schema,
+                "datasets": [self.dataset_schema],
+            }
+        )
+        self.argparse_schema = self.__merge_dict(
+            self.ARGPARSE_SPECIFIC_SCHEMA,
+        )
+        self.argparse_config_validator = Schema(Object(self.argparse_schema), extra=voluptuous.ALLOW_EXTRA)
+
+    def sanitize_user_config(self, user_config: dict) -> dict:
+        try:
+            return self.user_config_validator(user_config)
+        except MultipleInvalid:
+            # TODO: clarify the error message
+            logger.error("Invalid user config / ユーザ設定の形式が正しくないようです")
+            raise
+
+    # NOTE: In nature, argument parser result is not needed to be sanitize
+    #   However this will help us to detect program bug
+    def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> argparse.Namespace:
+        try:
+            return self.argparse_config_validator(argparse_namespace)
+        except MultipleInvalid:
+            # XXX: this should be a bug
+            logger.error(
+                "Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。"
+            )
+            raise
+
+    # NOTE: value would be overwritten by latter dict if there is already the same key
+    @staticmethod
+    def __merge_dict(*dict_list: dict) -> dict:
+        merged = {}
+        for schema in dict_list:
+            # merged |= schema
+            for k, v in schema.items():
+                merged[k] = v
+        return merged
+
+
+class BlueprintGenerator:
+    BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME = {}
+
+    def __init__(self, sanitizer: ConfigSanitizer):
+        self.sanitizer = sanitizer
+
+    # runtime_params is for parameters which is only configurable on runtime, such as tokenizer
+    def generate(self, user_config: dict, argparse_namespace: argparse.Namespace, **runtime_params) -> Blueprint:
+        sanitized_user_config = self.sanitizer.sanitize_user_config(user_config)
+        sanitized_argparse_namespace = self.sanitizer.sanitize_argparse_namespace(argparse_namespace)
+
+        argparse_config = {k: v for k, v in vars(sanitized_argparse_namespace).items() if v is not None}
+        general_config = sanitized_user_config.get("general", {})
+
+        dataset_blueprints = []
+        for dataset_config in sanitized_user_config.get("datasets", []):
+            is_image_dataset = "target_frames" not in dataset_config
+            if is_image_dataset:
+                dataset_params_klass = ImageDatasetParams
+            else:
+                dataset_params_klass = VideoDatasetParams
+
+            params = self.generate_params_by_fallbacks(
+                dataset_params_klass, [dataset_config, general_config, argparse_config, runtime_params]
+            )
+            dataset_blueprints.append(DatasetBlueprint(is_image_dataset, params))
+
+        dataset_group_blueprint = DatasetGroupBlueprint(dataset_blueprints)
+
+        return Blueprint(dataset_group_blueprint)
+
+    @staticmethod
+    def generate_params_by_fallbacks(param_klass, fallbacks: Sequence[dict]):
+        name_map = BlueprintGenerator.BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME
+        search_value = BlueprintGenerator.search_value
+        default_params = asdict(param_klass())
+        param_names = default_params.keys()
+
+        params = {name: search_value(name_map.get(name, name), fallbacks, default_params.get(name)) for name in param_names}
+
+        return param_klass(**params)
+
+    @staticmethod
+    def search_value(key: str, fallbacks: Sequence[dict], default_value=None):
+        for cand in fallbacks:
+            value = cand.get(key)
+            if value is not None:
+                return value
+
+        return default_value
+
+
+# if training is True, it will return a dataset group for training, otherwise for caching
+def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint, training: bool = False) -> DatasetGroup:
+    """Instantiate the datasets described by ``dataset_group_blueprint`` and group them.
+
+    The cache directories are checked for uniqueness, a summary of every dataset is
+    logged, all datasets receive the same random seed and, when ``training`` is true,
+    ``prepare_for_training`` is called on each of them.
+
+    Raises:
+        ValueError: if two datasets share the same cache directory.
+    """
+    datasets: List[Union[ImageDataset, VideoDataset]] = []
+
+    for dataset_blueprint in dataset_group_blueprint.datasets:
+        if dataset_blueprint.is_image_dataset:
+            dataset_klass = ImageDataset
+        else:
+            dataset_klass = VideoDataset
+
+        # the dataclass fields are passed as keyword arguments to the dataset constructor
+        dataset = dataset_klass(**asdict(dataset_blueprint.params))
+        datasets.append(dataset)
+
+    # assertion: every dataset must have its own cache directory
+    cache_directories = [dataset.cache_directory for dataset in datasets]
+    num_of_unique_cache_directories = len(set(cache_directories))
+    if num_of_unique_cache_directories != len(cache_directories):
+        raise ValueError(
+            "cache directory should be unique for each dataset (note that cache directory is image/video directory if not specified)"
+            + " / cache directory は各データセットごとに異なる必要があります（指定されていない場合はimage/video directoryが使われるので注意）"
+        )
+
+    # print info: build one multi-line summary and log it in a single call
+    info = ""
+    for i, dataset in enumerate(datasets):
+        is_image_dataset = isinstance(dataset, ImageDataset)
+        info += dedent(
+            f"""\
+      [Dataset {i}]
+        is_image_dataset: {is_image_dataset}
+        resolution: {dataset.resolution}
+        batch_size: {dataset.batch_size}
+        num_repeats: {dataset.num_repeats}
+        caption_extension: "{dataset.caption_extension}"
+        enable_bucket: {dataset.enable_bucket}
+        bucket_no_upscale: {dataset.bucket_no_upscale}
+        cache_directory: "{dataset.cache_directory}"
+        debug_dataset: {dataset.debug_dataset}
+    """
+        )
+
+        # append the fields that only exist for this kind of dataset
+        if is_image_dataset:
+            info += indent(
+                dedent(
+                    f"""\
+        image_directory: "{dataset.image_directory}"
+        image_jsonl_file: "{dataset.image_jsonl_file}"
+    \n"""
+                ),
+                "    ",
+            )
+        else:
+            info += indent(
+                dedent(
+                    f"""\
+        video_directory: "{dataset.video_directory}"
+        video_jsonl_file: "{dataset.video_jsonl_file}"
+        target_frames: {dataset.target_frames}
+        frame_extraction: {dataset.frame_extraction}
+        frame_stride: {dataset.frame_stride}
+        frame_sample: {dataset.frame_sample}
+    \n"""
+                ),
+                "    ",
+            )
+    logger.info(f"{info}")
+
+    # make buckets first because it determines the length of dataset
+    # and set the same seed for all datasets
+    seed = random.randint(0, 2**31)  # actual seed is seed + epoch_no
+    for i, dataset in enumerate(datasets):
+        # logger.info(f"[Dataset {i}]")
+        dataset.set_seed(seed)
+        if training:
+            dataset.prepare_for_training()
+
+    return DatasetGroup(datasets)
+
+
+def load_user_config(file: str) -> dict:
+    file: Path = Path(file)
+    if not file.is_file():
+        raise ValueError(f"file not found / ファイルが見つかりません: {file}")
+
+    if file.name.lower().endswith(".json"):
+        try:
+            with open(file, "r", encoding="utf-8") as f:
+                config = json.load(f)
+        except Exception:
+            logger.error(
+                f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
+            )
+            raise
+    elif file.name.lower().endswith(".toml"):
+        try:
+            config = toml.load(file)
+        except Exception:
+            logger.error(
+                f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
+            )
+            raise
+    else:
+        raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file}")
+
+    return config
+
+
+# for config test
+if __name__ == "__main__":
+    # Manual config check: load a dataset config, log each processing stage and build the dataset group.
+    # The first parser takes the config path; everything it does not recognize goes to the second parser.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("dataset_config")
+    config_args, remain = parser.parse_known_args()
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--debug_dataset", action="store_true")
+    argparse_namespace = parser.parse_args(remain)
+
+    logger.info("[argparse_namespace]")
+    logger.info(f"{vars(argparse_namespace)}")
+
+    user_config = load_user_config(config_args.dataset_config)
+
+    logger.info("")
+    logger.info("[user_config]")
+    logger.info(f"{user_config}")
+
+    sanitizer = ConfigSanitizer()
+    sanitized_user_config = sanitizer.sanitize_user_config(user_config)
+
+    logger.info("")
+    logger.info("[sanitized_user_config]")
+    logger.info(f"{sanitized_user_config}")
+
+    # NOTE(review): no runtime params (e.g. architecture) are passed here, so the dataclass defaults apply
+    blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace)
+
+    logger.info("")
+    logger.info("[blueprint]")
+    logger.info(f"{blueprint}")
+
+    dataset_group = generate_dataset_group_by_blueprint(blueprint.dataset_group)
diff --git a/dataset/dataset_config.md b/dataset/dataset_config.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3bae1511174b43250342a85dbe0558b52d67df5
--- /dev/null
+++ b/dataset/dataset_config.md
@@ -0,0 +1,378 @@
+> 📝 Click on the language section to expand / 言語をクリックして展開
+
+## Dataset Configuration
+
+Please create a TOML file for dataset configuration.
+
+Image and video datasets are supported. The configuration file can include multiple datasets, either image or video datasets, with caption text files or metadata JSONL files.
+
+The cache directory must be different for each dataset.
+
+<details>
+<summary>日本語</summary>
+
+データセットの設定を行うためのTOMLファイルを作成してください。
+
+画像データセットと動画データセットがサポートされています。設定ファイルには、画像または動画データセットを複数含めることができます。キャプションテキストファイルまたはメタデータJSONLファイルを使用できます。
+
+キャッシュディレクトリは、各データセットごとに異なるディレクトリである必要があります。
+</details>
+
+### Sample for Image Dataset with Caption Text Files
+
+```toml
+# resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
+# otherwise, the default values will be used for each item
+
+# general configurations
+[general]
+resolution = [960, 544]
+caption_extension = ".txt"
+batch_size = 1
+enable_bucket = true
+bucket_no_upscale = false
+
+[[datasets]]
+image_directory = "/path/to/image_dir"
+cache_directory = "/path/to/cache_directory"
+num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
+
+# other datasets can be added here. each dataset can have different configurations
+```
+
+`cache_directory` is optional; the default is None, which uses the image directory itself as the cache directory. However, we recommend setting a separate cache directory to avoid accidentally sharing cache files between different datasets.
+
+`num_repeats` is also available. It is optional, default is 1 (no repeat). It repeats the images (or videos) that many times to expand the dataset. For example, if `num_repeats = 2` and there are 20 images in the dataset, each image is used twice (with the same caption), for a total of 40 images. It is useful for balancing multiple datasets of different sizes.
+
+<details>
+<summary>日本語</summary>
+
+`cache_directory` はオプションです。デフォルトは画像ディレクトリと同じディレクトリに設定されます。ただし、異なるデータセット間でキャッシュファイルが共有されるのを防ぐために、明示的に別のキャッシュディレクトリを設定することをお勧めします。
+
+`num_repeats` はオプションで、デフォルトは 1 です（繰り返しなし）。画像（や動画）を、その回数だけ単純に繰り返してデータセットを拡張します。たとえば`num_repeats = 2`としたとき、画像20枚のデータセットなら、各画像が2枚ずつ（同一のキャプションで）計40枚存在した場合と同じになります。異なるデータ数のデータセット間でバランスを取るために使用可能です。
+
+resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。省略時は各項目のデフォルト値が使用されます。
+
+`[[datasets]]`以下を追加することで、他のデータセットを追加できます。各データセットには異なる設定を持てます。
+</details>
+
+### Sample for Image Dataset with Metadata JSONL File
+
+```toml
+# resolution, batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
+# caption_extension is not required for metadata jsonl file
+# cache_directory is required for each dataset with metadata jsonl file
+
+# general configurations
+[general]
+resolution = [960, 544]
+batch_size = 1
+enable_bucket = true
+bucket_no_upscale = false
+
+[[datasets]]
+image_jsonl_file = "/path/to/metadata.jsonl"
+cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
+num_repeats = 1 # optional, default is 1. Same as above.
+
+# other datasets can be added here. each dataset can have different configurations
+```
+
+JSONL file format for metadata:
+
+```json
+{"image_path": "/path/to/image1.jpg", "caption": "A caption for image1"}
+{"image_path": "/path/to/image2.jpg", "caption": "A caption for image2"}
+```
+
+<details>
+<summary>日本語</summary>
+
+resolution, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。省略時は各項目のデフォルト値が使用されます。
+
+metadata jsonl ファイルを使用する場合、caption_extension は必要ありません。また、cache_directory は必須です。
+
+キャプションによるデータセットと同様に、複数のデータセットを追加できます。各データセットには異なる設定を持てます。
+</details>
+
+
+### Sample for Video Dataset with Caption Text Files
+
+```toml
+# resolution, caption_extension, target_frames, frame_extraction, frame_stride, frame_sample, 
+# batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
+# num_repeats is also available for video dataset, example is not shown here
+
+# general configurations
+[general]
+resolution = [960, 544]
+caption_extension = ".txt"
+batch_size = 1
+enable_bucket = true
+bucket_no_upscale = false
+
+[[datasets]]
+video_directory = "/path/to/video_dir"
+cache_directory = "/path/to/cache_directory" # recommended to set cache directory
+target_frames = [1, 25, 45]
+frame_extraction = "head"
+
+# other datasets can be added here. each dataset can have different configurations
+```
+
+__In HunyuanVideo and Wan2.1, each value in `target_frames` must be of the form "N*4+1" (N=0,1,2,...).__
+
+<details>
+<summary>日本語</summary>
+
+resolution, caption_extension, target_frames, frame_extraction, frame_stride, frame_sample, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。
+
+__HunyuanVideoおよびWan2.1では、target_framesの数値は「N*4+1」である必要があります。__
+
+他の注意事項は画像データセットと同様です。
+</details>
+
+### Sample for Video Dataset with Metadata JSONL File
+
+```toml
+# resolution, target_frames, frame_extraction, frame_stride, frame_sample, 
+# batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
+# caption_extension is not required for metadata jsonl file
+# cache_directory is required for each dataset with metadata jsonl file
+
+# general configurations
+[general]
+resolution = [960, 544]
+batch_size = 1
+enable_bucket = true
+bucket_no_upscale = false
+
+[[datasets]]
+video_jsonl_file = "/path/to/metadata.jsonl"
+target_frames = [1, 25, 45]
+frame_extraction = "head"
+cache_directory = "/path/to/cache_directory_head"
+
+# same metadata jsonl file can be used for multiple datasets
+[[datasets]]
+video_jsonl_file = "/path/to/metadata.jsonl"
+target_frames = [1]
+frame_stride = 10
+cache_directory = "/path/to/cache_directory_stride"
+
+# other datasets can be added here. each dataset can have different configurations
+```
+
+JSONL file format for metadata:
+
+```json
+{"video_path": "/path/to/video1.mp4", "caption": "A caption for video1"}
+{"video_path": "/path/to/video2.mp4", "caption": "A caption for video2"}
+```
+
+<details>
+<summary>日本語</summary>
+
+resolution, target_frames, frame_extraction, frame_stride, frame_sample, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。
+
+metadata jsonl ファイルを使用する場合、caption_extension は必要ありません。また、cache_directory は必須です。
+
+他の注意事項は今までのデータセットと同様です。
+</details>
+
+### frame_extraction Options
+
+- `head`: Extract the first N frames from the video.
+- `chunk`: Extract frames by splitting the video into chunks of N frames.
+- `slide`: Extract frames from the video with a stride of `frame_stride`.
+- `uniform`: Extract `frame_sample` samples uniformly from the video.
+
+For example, consider a video with 40 frames. The following diagrams illustrate each extraction:
+
+<details>
+<summary>日本語</summary>
+
+- `head`: 動画から最初のNフレームを抽出します。
+- `chunk`: 動画をNフレームずつに分割してフレームを抽出します。
+- `slide`: `frame_stride`に指定したフレームごとに動画からNフレームを抽出します。
+- `uniform`: 動画から一定間隔で、`frame_sample`個のNフレームを抽出します。
+
+例えば、40フレームの動画を例とした抽出について、以下の図で説明します。
+</details>
+
+```
+Original Video, 40 frames: x = extracted frame, o = frame not extracted
+oooooooooooooooooooooooooooooooooooooooo
+
+head, target_frames = [1, 13, 25] -> extract head frames:
+xooooooooooooooooooooooooooooooooooooooo
+xxxxxxxxxxxxxooooooooooooooooooooooooooo
+xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
+
+chunk, target_frames = [13, 25] -> extract frames by splitting into chunks, into 13 and 25 frames:
+xxxxxxxxxxxxxooooooooooooooooooooooooooo
+oooooooooooooxxxxxxxxxxxxxoooooooooooooo
+ooooooooooooooooooooooooooxxxxxxxxxxxxxo
+xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
+
+NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will cause all frames to be extracted.
+注: frame_extraction "chunk" を使用する場合、target_frames に 1 を含めないでください。全てのフレームが抽出されてしまいます。
+
+slide, target_frames = [1, 13, 25], frame_stride = 10 -> extract N frames with a stride of 10:
+xooooooooooooooooooooooooooooooooooooooo
+ooooooooooxooooooooooooooooooooooooooooo
+ooooooooooooooooooooxooooooooooooooooooo
+ooooooooooooooooooooooooooooooxooooooooo
+xxxxxxxxxxxxxooooooooooooooooooooooooooo
+ooooooooooxxxxxxxxxxxxxooooooooooooooooo
+ooooooooooooooooooooxxxxxxxxxxxxxooooooo
+xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
+ooooooooooxxxxxxxxxxxxxxxxxxxxxxxxxooooo
+
+uniform, target_frames =[1, 13, 25], frame_sample = 4 -> extract `frame_sample` samples uniformly, N frames each:
+xooooooooooooooooooooooooooooooooooooooo
+oooooooooooooxoooooooooooooooooooooooooo
+oooooooooooooooooooooooooxoooooooooooooo
+ooooooooooooooooooooooooooooooooooooooox
+xxxxxxxxxxxxxooooooooooooooooooooooooooo
+oooooooooxxxxxxxxxxxxxoooooooooooooooooo
+ooooooooooooooooooxxxxxxxxxxxxxooooooooo
+oooooooooooooooooooooooooooxxxxxxxxxxxxx
+xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
+oooooxxxxxxxxxxxxxxxxxxxxxxxxxoooooooooo
+ooooooooooxxxxxxxxxxxxxxxxxxxxxxxxxooooo
+oooooooooooooooxxxxxxxxxxxxxxxxxxxxxxxxx
+```
+
+## Specifications
+
+```toml
+# general configurations
+[general]
+resolution = [960, 544] # optional, [W, H], default is None. This is the default resolution for all datasets
+caption_extension = ".txt" # optional, default is None. This is the default caption extension for all datasets
+batch_size = 1 # optional, default is 1. This is the default batch size for all datasets
+num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
+enable_bucket = true # optional, default is false. Enable bucketing for datasets
+bucket_no_upscale = false # optional, default is false. Disable upscaling for bucketing. Ignored if enable_bucket is false
+
+### Image Dataset
+
+# sample image dataset with caption text files
+[[datasets]]
+image_directory = "/path/to/image_dir"
+caption_extension = ".txt" # required for caption text files, if general caption extension is not set
+resolution = [960, 544] # required if general resolution is not set
+batch_size = 4 # optional, overwrite the default batch size
+num_repeats = 1 # optional, overwrite the default num_repeats
+enable_bucket = false # optional, overwrite the default bucketing setting
+bucket_no_upscale = true # optional, overwrite the default bucketing setting
+cache_directory = "/path/to/cache_directory" # optional, default is None to use the same directory as the image directory. NOTE: caching is always enabled
+
+# sample image dataset with metadata **jsonl** file
+[[datasets]]
+image_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of image files and captions
+resolution = [960, 544] # required if general resolution is not set
+cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
+# caption_extension is not required for metadata jsonl file
+# batch_size, num_repeats, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
+
+### Video Dataset
+
+# sample video dataset with caption text files
+[[datasets]]
+video_directory = "/path/to/video_dir"
+caption_extension = ".txt" # required for caption text files, if general caption extension is not set
+resolution = [960, 544] # required if general resolution is not set
+
+target_frames = [1, 25, 79] # required for video dataset. list of video lengths to extract frames. each element must be N*4+1 (N=0,1,2,...)
+
+# NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will cause all frames to be extracted.
+
+frame_extraction = "head" # optional, "head" or "chunk", "slide", "uniform". Default is "head"
+frame_stride = 1 # optional, default is 1, available for "slide" frame extraction
+frame_sample = 4 # optional, default is 1 (same as "head"), available for "uniform" frame extraction
+# batch_size, num_repeats, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset
+
+# sample video dataset with metadata jsonl file
+[[datasets]]
+video_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of video files and captions
+
+target_frames = [1, 79]
+
+cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
+# frame_extraction, frame_stride, frame_sample are also available for metadata jsonl file
+```
+
+<!-- 
+# sample image dataset with lance
+[[datasets]]
+image_lance_dataset = "/path/to/lance_dataset"
+resolution = [960, 544] # required if general resolution is not set
+# batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for lance dataset
+-->
+
+Metadata in .json file format will be supported in the near future.
+
+
+
+<!--
+
+```toml
+# general configurations
+[general]
+resolution = [960, 544] # optional, [W, H], default is None. This is the default resolution for all datasets
+caption_extension = ".txt" # optional, default is None. This is the default caption extension for all datasets
+batch_size = 1 # optional, default is 1. This is the default batch size for all datasets
+enable_bucket = true # optional, default is false. Enable bucketing for datasets
+bucket_no_upscale = false # optional, default is false. Disable upscaling for bucketing. Ignored if enable_bucket is false
+
+# sample image dataset with caption text files
+[[datasets]]
+image_directory = "/path/to/image_dir"
+caption_extension = ".txt" # required for caption text files, if general caption extension is not set
+resolution = [960, 544] # required if general resolution is not set
+batch_size = 4 # optional, overwrite the default batch size
+enable_bucket = false # optional, overwrite the default bucketing setting
+bucket_no_upscale = true # optional, overwrite the default bucketing setting
+cache_directory = "/path/to/cache_directory" # optional, default is None to use the same directory as the image directory. NOTE: caching is always enabled
+
+# sample image dataset with metadata **jsonl** file
+[[datasets]]
+image_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of image files and captions
+resolution = [960, 544] # required if general resolution is not set
+cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
+# caption_extension is not required for metadata jsonl file
+# batch_size, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
+
+# sample video dataset with caption text files
+[[datasets]]
+video_directory = "/path/to/video_dir"
+caption_extension = ".txt" # required for caption text files, if general caption extension is not set
+resolution = [960, 544] # required if general resolution is not set
+target_frames = [1, 25, 79] # required for video dataset. list of video lengths to extract frames. each element must be N*4+1 (N=0,1,2,...)
+frame_extraction = "head" # optional, "head" or "chunk", "slide", "uniform". Default is "head"
+frame_stride = 1 # optional, default is 1, available for "slide" frame extraction
+frame_sample = 4 # optional, default is 1 (same as "head"), available for "uniform" frame extraction
+# batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset
+
+# sample video dataset with metadata jsonl file
+[[datasets]]
+video_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of video files and captions
+target_frames = [1, 79]
+cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
+# frame_extraction, frame_stride, frame_sample are also available for metadata jsonl file
+```
+
+# sample image dataset with lance
+[[datasets]]
+image_lance_dataset = "/path/to/lance_dataset"
+resolution = [960, 544] # required if general resolution is not set
+# batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for lance dataset
+
+The metadata with .json file will be supported in the near future.
+
+
+
+
+-->
\ No newline at end of file
diff --git a/dataset/dataset_example.toml b/dataset/dataset_example.toml
new file mode 100644
index 0000000000000000000000000000000000000000..e081ffaf5330bba74cb23049a9459d4c5bd456cc
--- /dev/null
+++ b/dataset/dataset_example.toml
@@ -0,0 +1,44 @@
+# resolution, caption_extension, target_frames, frame_extraction, frame_stride, frame_sample, 
+# batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
+
+
+# general configurations
+[general]
+caption_extension = ".txt"
+batch_size = 1
+enable_bucket = true
+bucket_no_upscale = false
+
+
+# dataset configurations
+[[datasets]]
+resolution = [160, 160]
+video_directory = "D:/musubi-tuner-wan-gui/dataset/My_Best_Lora_dataset/video"  # path to your video dataset
+cache_directory = "D:/musubi-tuner-wan-gui/dataset/My_Best_Lora_dataset/cache/video" # recommended to set cache directory
+target_frames = [17, 33, 65]
+frame_extraction = "chunk"
+num_repeats = 1
+
+# head: Extract the first N frames from the video.
+# chunk: Extract frames by splitting the video into chunks of N frames.
+# slide: Extract frames from the video with a stride of frame_stride.
+# uniform: Extract frame_sample samples uniformly from the video.
+# NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will cause all frames to be extracted.
+
+# More info here: https://github.com/Kvento/musubi-tuner-wan-gui/blob/main/dataset/dataset_config.md
+
+
+
+
+
+
+
+# other datasets can be added here. each dataset can have different configurations
+
+# If you don't need image training, remove this code:
+# dataset configurations
+[[datasets]]
+resolution = [256, 256]
+image_directory = "D:/musubi-tuner-wan-gui/dataset/My_Best_Lora_dataset/images" # path to your image dataset
+cache_directory = "D:/musubi-tuner-wan-gui/dataset/My_Best_Lora_dataset/cache/images" # recommended to set cache directory
+num_repeats = 1
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20190915_193922.jpg b/dataset/ebPhotos-001/20190915_193922.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1ad2b45e0b4cd0d97c924cd69e1c970e28a93f78
--- /dev/null
+++ b/dataset/ebPhotos-001/20190915_193922.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11fc9ec911df776045021f6c8660a8d02e8a4bd7028df5221e31665f1f41068c
+size 996903
diff --git a/dataset/ebPhotos-001/20190915_193922.txt b/dataset/ebPhotos-001/20190915_193922.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ea011e2e97b3de9720556f5bce26b8dd367b3bae
--- /dev/null
+++ b/dataset/ebPhotos-001/20190915_193922.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with curly hair smiling wearing a black top standing on a boat. In the background a group of people including children sit on a wooden bench wearing casual clothes. A green suspension bridge spans a river with a cloudy sky above. The woman is in the foreground with the group and bridge in the mid-ground. The boat has a green floor and a cylindrical black structure on the left.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20190921_182515.jpg b/dataset/ebPhotos-001/20190921_182515.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..679c8b5ebd073e615d808f7a986ba5526d15a479
--- /dev/null
+++ b/dataset/ebPhotos-001/20190921_182515.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62b1f3f3e0b60c133885d3222661241dc7e671695478debcb1ea4d3d7a3d3dbc
+size 2665085
diff --git a/dataset/ebPhotos-001/20190921_182515.txt b/dataset/ebPhotos-001/20190921_182515.txt
new file mode 100644
index 0000000000000000000000000000000000000000..16958f531806ddceb1b436b42ffc60b5d88cc75c
--- /dev/null
+++ b/dataset/ebPhotos-001/20190921_182515.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin smiling standing in a hotel room. She has her hair in a neat bun wearing a lace pink sleeveless dress that accentuates her medium-sized breasts and a large green bow at the chest. She accessorizes with a silver necklace bracelet and watch. The room has a patterned carpet wooden door green chair and metal trash can. She stands confidently one hand on her shoulder.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20190921_182517.jpg b/dataset/ebPhotos-001/20190921_182517.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..679936428d74ec0b0b9622d262f98d6252431e51
--- /dev/null
+++ b/dataset/ebPhotos-001/20190921_182517.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58aedcb9de3acb1f13501321f7f6d10171dbcfbf9310a4bb3712116f5c263aea
+size 2824053
diff --git a/dataset/ebPhotos-001/20190921_182517.txt b/dataset/ebPhotos-001/20190921_182517.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7fec9a91b0a445425a7fa362c1c81cf9cafe207b
--- /dev/null
+++ b/dataset/ebPhotos-001/20190921_182517.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin standing in a hotel room. She has a slim curvy figure wearing a pink lace dress that accentuates her medium-sized breasts. She's smiling with her right hand touching her shoulder and her left hand resting on her hip. She wears a white beaded necklace matching bracelet and a blue watch. Her hair is styled in a neat bun. The background includes a green chair a trash can and two wooden doors with gold handles. The carpet has a
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20220521_222809.jpg b/dataset/ebPhotos-001/20220521_222809.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6fa4a6cf2fb9bec3add7e76603d541a253b4ad87
--- /dev/null
+++ b/dataset/ebPhotos-001/20220521_222809.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a19a8d2c096250e6ddab028ace80c3fc3c6e69d7f9d4ee1123a781edadf840c
+size 4389429
diff --git a/dataset/ebPhotos-001/20220521_222809.txt b/dataset/ebPhotos-001/20220521_222809.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ba92f8c571261fdc21b8c2fdbe1fda460c652475
--- /dev/null
+++ b/dataset/ebPhotos-001/20220521_222809.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin dark brown eyes and red lipstick wearing a pink checkered shirt her hair in a high bun standing in a cluttered office. background includes a TV showing a collage of images snacks on a shelf a red box papers and a desk with a white plastic bag. she wears a gold pendant necklace. the office has beige walls and wooden furniture.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230427_082757.jpg b/dataset/ebPhotos-001/20230427_082757.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1a20a6d9437550cfc515987bc0ce442205a8ba7e
--- /dev/null
+++ b/dataset/ebPhotos-001/20230427_082757.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42a5dc8533130c279192f88200861d5aa3c247e83bf35fff17aab3ea982fbeaf
+size 1670352
diff --git a/dataset/ebPhotos-001/20230427_082757.txt b/dataset/ebPhotos-001/20230427_082757.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1a0b5e4bf008b380dee35eb1987ce677b04f4d14
--- /dev/null
+++ b/dataset/ebPhotos-001/20230427_082757.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin shoulder-length wavy black hair and brown eyes. She wears a deep purple sleeveless top looking directly at the camera with a neutral expression. The background is a dimly lit indoor space with beige walls and a dark curtain. The image is a close-up focusing on her face and upper torso.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230427_082800.jpg b/dataset/ebPhotos-001/20230427_082800.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2af2953abb3e89d3f28e6c01dce19b341d3a315d
--- /dev/null
+++ b/dataset/ebPhotos-001/20230427_082800.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:213e80f9fddc50f0a5733435b4f60b4426f19b87531ba6fdaff649934bcc09ec
+size 1723644
diff --git a/dataset/ebPhotos-001/20230427_082800.txt b/dataset/ebPhotos-001/20230427_082800.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e6168c606a0796c10f9cfcb112b3b96255ccb88
--- /dev/null
+++ b/dataset/ebPhotos-001/20230427_082800.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin shoulder-length wavy black hair and brown eyes. She wears a deep purple halter top looking directly at the camera with a slight confident smile. The background is a dimly lit indoor space with a dark curtain on the right and a beige wall on the left. The lighting highlights her natural skin texture and subtle makeup.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230427_082805.jpg b/dataset/ebPhotos-001/20230427_082805.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9a67d900c218287650ba5bee26f7478a3c1eacd7
--- /dev/null
+++ b/dataset/ebPhotos-001/20230427_082805.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:826b87123a8e51d755c4c781552b2606f600766d463d81822d2513277a3ef354
+size 1779687
diff --git a/dataset/ebPhotos-001/20230427_082805.txt b/dataset/ebPhotos-001/20230427_082805.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5445c22ad5586538104f4c1544b8f863e2f31f59
--- /dev/null
+++ b/dataset/ebPhotos-001/20230427_082805.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin shoulder-length wavy black hair and brown eyes. She wears a low-cut sleeveless purple top revealing a hint of cleavage. Her expression is neutral with slightly pursed lips. The background is a dimly lit indoor room with a dark curtain and a partially visible doorway. The lighting highlights her natural skin texture and subtle makeup.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230502_185323.jpg b/dataset/ebPhotos-001/20230502_185323.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..13cdb964a49bf165bfc7cc6f1804b18079a1929b
--- /dev/null
+++ b/dataset/ebPhotos-001/20230502_185323.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8e0d22c9d3a3b325a1bededcd9c6e56223bdce77e38160ed3753e3252a660ec
+size 2677300
diff --git a/dataset/ebPhotos-001/20230502_185323.txt b/dataset/ebPhotos-001/20230502_185323.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9fc556419a6903335a8d4e2a5f93ffefc7d9f857
--- /dev/null
+++ b/dataset/ebPhotos-001/20230502_185323.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a form-fitting white satin dress with thin straps standing in a purple-walled dressing room. She has shoulder-length black hair a necklace and is smiling while dancing. The room has a gray carpet a standing mirror and a pink and purple garment hanging on the left. An "EXIT" sign is visible on the ceiling.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230504_193610.jpg b/dataset/ebPhotos-001/20230504_193610.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6770cd95a7ce32efdfb6245047501d07e2478901
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193610.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7402ec5e5994031d6343fcc0b7636c6469b54e6e5d01ea6747a82817b4b8405b
+size 1795619
diff --git a/dataset/ebPhotos-001/20230504_193610.txt b/dataset/ebPhotos-001/20230504_193610.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eac81f173f8198965df9d851d0b2832f3972b189
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193610.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with curly hair wearing a gray and red track jacket gray leggings and red and black sneakers kneeling on a patterned carpet in a hallway her right hand on her chest left hand on the floor beige walls wooden floor and a door in the background.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230504_193624.jpg b/dataset/ebPhotos-001/20230504_193624.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..573176297ba6a615b744a2f808c9c146451e3849
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193624.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4ed95ed45601cec31e164da28dd1de6840615945b59370c748a4039ad0b1696
+size 2009848
diff --git a/dataset/ebPhotos-001/20230504_193624.txt b/dataset/ebPhotos-001/20230504_193624.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a7f8ed23a77aac75cecb55228825d9b98e32b53d
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193624.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with curly hair wearing a gray Fila jacket with red trim black top gray leggings and red and white Nike sneakers. She's posing in a hallway one leg raised hand on jacket. wooden floor patterned rug beige walls and white door in background. confident stylish athletic.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230504_193657.jpg b/dataset/ebPhotos-001/20230504_193657.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..18b3f084c7d2861a5bedf81d09c28052638b57f9
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193657.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:632a50223f8591339d03c63de042f8733973f7bc706426dc34dc1825e1473e8b
+size 1950078
diff --git a/dataset/ebPhotos-001/20230504_193657.txt b/dataset/ebPhotos-001/20230504_193657.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a0bdd5c4f120bb76f3bfee1ce68f9be3beddc124
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193657.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with wavy hair wearing a gray and red track jacket black top and black leggings kneeling on one leg in a hallway with wooden floors and a patterned rug. She's wearing red white and gray sneakers. The hallway has white doors and beige walls. She has a confident expression and her right hand is in her jacket pocket. The lighting is warm and soft.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230504_193734.jpg b/dataset/ebPhotos-001/20230504_193734.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ac4b294cc5f2dd2f1a539a9360d0187a45e9c3f2
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193734.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7e64f7ea78ef867478bc40c02e41a20386888f19a90b934e350fc328f011624
+size 1962640
diff --git a/dataset/ebPhotos-001/20230504_193734.txt b/dataset/ebPhotos-001/20230504_193734.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bac001595dc6579ceff101f2ef1e8b39322474fd
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193734.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with long wavy hair kneeling on a wooden floor in a hallway. she wears a gray and red track jacket black leggings and red and white sneakers. her right hand is in her jacket pocket. the hallway has beige walls white doors and a patterned gray rug. the lighting is warm and she looks down at the camera with a slight smile.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230504_193750.jpg b/dataset/ebPhotos-001/20230504_193750.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..10a21128e098edde1ae7c1804dbce7d3d9bf1a1c
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193750.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b903442db5c5f852b337d82e98977555a2430d72b3112a84978a79c583d2b4b
+size 2035652
diff --git a/dataset/ebPhotos-001/20230504_193750.txt b/dataset/ebPhotos-001/20230504_193750.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d7798aeb30395f3e7749d0200b595f3b8c5b31cc
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193750.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin long wavy black hair and a slender build kneeling in a hallway. She wears a gray track jacket with red and white accents black leggings and red and black sneakers. She has a necklace with a circular pendant. The hallway has wooden floors a patterned gray rug and white walls with a door and window blinds in the background. She smiles slightly looking at the camera.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230504_193805.jpg b/dataset/ebPhotos-001/20230504_193805.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ad3caa9527b4fbac575535411a5236ee8b9c6e43
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193805.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b86fad2aeff5931ca2ff92073e5b583e7f85a41faf6ac6cddd8662f6f65c0bfe
+size 1833701
diff --git a/dataset/ebPhotos-001/20230504_193805.txt b/dataset/ebPhotos-001/20230504_193805.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5dcbf6f7c97e49620ad94717d3b4e42c885ebfe7
--- /dev/null
+++ b/dataset/ebPhotos-001/20230504_193805.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with wavy hair wearing a gray and red jacket black top gray leggings and red and white sneakers kneeling in a hallway with wooden floors and beige walls holding her hair with a necklace visible smiling at the camera.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230505_194441.jpg b/dataset/ebPhotos-001/20230505_194441.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a5c402a5c668de791d6bab9b2d317efee256338b
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194441.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0696b16e283480c7b069770ae1065beb6076a90df6b43dc2abf8ed932599150b
+size 1945142
diff --git a/dataset/ebPhotos-001/20230505_194441.txt b/dataset/ebPhotos-001/20230505_194441.txt
new file mode 100644
index 0000000000000000000000000000000000000000..22a275db59efc319d1877e5a85316817832bd4cd
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194441.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a form-fitting long-sleeve red dress with a keyhole neckline standing in a narrow hallway. She has shoulder-length wavy black hair and is posing with one hand on her head and the other on the wall. She wears black high heels and has a tattoo on her left thigh. The hallway has beige walls white doors and a wooden step. She looks confident and alluring.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230505_194607.jpg b/dataset/ebPhotos-001/20230505_194607.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6d27397464f7db6237205c52c620ee0fca91fa6c
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194607.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:215bc59c6e1a69954f0963f72be24a399d485c71b2ca233bdfbe36c10698403c
+size 2080410
diff --git a/dataset/ebPhotos-001/20230505_194607.txt b/dataset/ebPhotos-001/20230505_194607.txt
new file mode 100644
index 0000000000000000000000000000000000000000..431f29ec22b5108a9d351c6d1728cc467751085e
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194607.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin standing in a narrow hallway. She has shoulder-length curly black hair wearing a tight long-sleeve red mini dress with a keyhole neckline revealing moderate cleavage. She is standing with arms outstretched touching the walls wearing black high-heeled shoes. The hallway has beige walls white doors and wooden floors with a patterned rug at the bottom. Recessed ceiling lights illuminate the scene.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230505_194707.jpg b/dataset/ebPhotos-001/20230505_194707.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..82a824bece7b84bc337e93bae56501889aec8393
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194707.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ebb33e8ba8924c5bb81606e6a627a4c3991cdba55abd767c04107369ea1330d
+size 2009043
diff --git a/dataset/ebPhotos-001/20230505_194707.txt b/dataset/ebPhotos-001/20230505_194707.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7855250a927f8830660516c07aba26a9d5f4fb88
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194707.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin standing in a narrow hallway. she has wavy black hair wears a tight red long-sleeve mini dress with a low neckline black fishnet stockings and black high-heeled sandals. she stands confidently one hand on the wall the other on her hip. the hallway has beige walls white trim and a patterned doormat. a ceiling light illuminates her from above.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230505_194729.jpg b/dataset/ebPhotos-001/20230505_194729.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a410a30603c8e29ea193b4a9de1b09b0f1b9d384
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194729.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94cc36ceeb33fa697653233c86183d3e6a24080106b42cfef0bdc76304cc0651
+size 2048436
diff --git a/dataset/ebPhotos-001/20230505_194729.txt b/dataset/ebPhotos-001/20230505_194729.txt
new file mode 100644
index 0000000000000000000000000000000000000000..008f0b23a7084680c84a2cdb6eb5dd00019e0aa4
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194729.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with curly hair wearing a tight red long-sleeve mini dress black high heels and a gold bracelet. She stands in a narrow hallway leaning against a white door showcasing a large tattoo on her right thigh. The hallway has beige walls wooden floor and patterned rug. The lighting is warm and the angle is low emphasizing her confident pose and curvy figure.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230505_194804.jpg b/dataset/ebPhotos-001/20230505_194804.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e6630eca5ee4a59e024e0bf10e92e7b95f2a135c
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194804.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0950723c73d2667356afe74ec21205e79bc079222ead2bd9b140bfacd04d0ba
+size 2025047
diff --git a/dataset/ebPhotos-001/20230505_194804.txt b/dataset/ebPhotos-001/20230505_194804.txt
new file mode 100644
index 0000000000000000000000000000000000000000..949c79efa17d3e608d485f7f466e0908b4fa43d7
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194804.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin wearing a tight red long-sleeve mini dress with a keyhole neckline revealing cleavage. She has shoulder-length wavy black hair and is standing in a narrow hallway with beige walls and white trim. She wears black high-heeled shoes and a gold bracelet on her left wrist. The wooden floor is partially covered by a gray and white patterned rug. She is looking down at the camera with a confident expression.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230505_194954.jpg b/dataset/ebPhotos-001/20230505_194954.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a12d8cc735df88864373ee203ef030f718f18ec1
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194954.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a9d19c748de9ef9c9eeab9cf3d69cb6c14db771b9aa09da61ddccad124489e5
+size 1247717
diff --git a/dataset/ebPhotos-001/20230505_194954.txt b/dataset/ebPhotos-001/20230505_194954.txt
new file mode 100644
index 0000000000000000000000000000000000000000..93b749ada9b48e1cf5abf34d586e797be19489f2
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_194954.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark curly hair wearing a tight long-sleeve red mini dress with a keyhole neckline showcasing her curvy figure and medium-sized breasts. She stands in a hallway leaning against a white door wearing black high-heeled sandals. Her skin is smooth and she has a gold bracelet on her right wrist. The hallway has beige walls white doors and a patterned gray rug on the wooden floor. The lighting is warm highlighting her confident pose.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230505_195052.jpg b/dataset/ebPhotos-001/20230505_195052.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..216a3daa4f46337459887337399706bacf480016
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_195052.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2488f12a45f79dd961d44c3a33c5a6742560020377e6f913bf40f350b4a1d090
+size 1257135
diff --git a/dataset/ebPhotos-001/20230505_195052.txt b/dataset/ebPhotos-001/20230505_195052.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ec4616931ca8cabcd7002168c38383a46c2ad211
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_195052.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark curly hair wearing a tight red long-sleeve mini dress with a keyhole cutout and black high heels. She stands in a narrow hallway with beige walls white doors and a patterned rug. She has a tattoo on her left thigh and a gold bracelet on her right wrist. Her confident pose and direct gaze add to her striking presence.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230505_195350.jpg b/dataset/ebPhotos-001/20230505_195350.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3b9e04bbb25642bec2d1c9dbb8d3ce767d44c396
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_195350.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d42cb2c37923c90258d4b7db2fa67339cdca122d3b081cc994bcdadf753a322f
+size 1835285
diff --git a/dataset/ebPhotos-001/20230505_195350.txt b/dataset/ebPhotos-001/20230505_195350.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e6613f0827ecf8efb029be423e826e829d82d5ea
--- /dev/null
+++ b/dataset/ebPhotos-001/20230505_195350.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin standing in a narrow hallway. she's wearing a tight red long-sleeve mini dress with a cutout at the chest and black high-heeled shoes. her black hair is styled in loose waves. she stands with one hand on the wall and the other touching her hair. the hallway has beige walls white trim and a patterned gray and white rug on the wooden floor. a closed door is visible in the background.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230531_150514.jpg b/dataset/ebPhotos-001/20230531_150514.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3080892cd2ff9b440bec57b73e0fc80d45f576d4
--- /dev/null
+++ b/dataset/ebPhotos-001/20230531_150514.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b478c4b933a6f0d0858de96dd7b801393e8ee08d3189714445580391c4e71d9b
+size 1445125
diff --git a/dataset/ebPhotos-001/20230531_150514.txt b/dataset/ebPhotos-001/20230531_150514.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ebee6b56c61b459603f08326f5063b487eecbf5f
--- /dev/null
+++ b/dataset/ebPhotos-001/20230531_150514.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin and a high ponytail wearing a leopard-print bikini smiling at the camera. She has a tattoo on her left arm and is standing in shallow clear turquoise water. In the background another black woman is lying in the water making a peace sign with her right hand. The sky is blue with scattered clouds. The beach is sandy and the ocean is calm.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230531_150552.jpg b/dataset/ebPhotos-001/20230531_150552.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..162a0494aa0ba968ba65e35f8ae40f94c766444d
--- /dev/null
+++ b/dataset/ebPhotos-001/20230531_150552.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f54756bb63341162ddddf4d5e0fd78e7ce5ed729e5f76fd99fc01f4b798e7b8
+size 897622
diff --git a/dataset/ebPhotos-001/20230531_150552.txt b/dataset/ebPhotos-001/20230531_150552.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9c1e2d3c3452bdc75023c09bd24b10f9afdbf7e8
--- /dev/null
+++ b/dataset/ebPhotos-001/20230531_150552.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin and a high bun wearing a leopard-print bikini smiling at the camera while taking a selfie in clear turquoise ocean water. She has a tattoo on her left arm and another person partially submerged is in the background. The sky is blue with scattered clouds.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230531_150610.jpg b/dataset/ebPhotos-001/20230531_150610.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2e281b70d0500e2ec6eda018661a43b892a1c2c3
--- /dev/null
+++ b/dataset/ebPhotos-001/20230531_150610.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d8020f60626f33dedc3f6f51498da11d7d1531617e41afd7c49bde523ac40cd
+size 1671927
diff --git a/dataset/ebPhotos-001/20230531_150610.txt b/dataset/ebPhotos-001/20230531_150610.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6798af4146dd5a70ce9e53217f3f42c75346d58f
--- /dev/null
+++ b/dataset/ebPhotos-001/20230531_150610.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin smiling widely taking a selfie at a beach. She has her hair in a high bun wearing a leopard-print halter swimsuit. Her right arm has a small tattoo. The background shows clear turquoise water with gentle waves and a person swimming in the distance. The sunlight highlights her glistening skin.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230603_201646.jpg b/dataset/ebPhotos-001/20230603_201646.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7c61bb7086e0a046521cb77a82b79bfb72b3595c
--- /dev/null
+++ b/dataset/ebPhotos-001/20230603_201646.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e632a20c896390e34ac623992c300eb1ce6ec49088bf01488ee9501926864f6
+size 1713027
diff --git a/dataset/ebPhotos-001/20230603_201646.txt b/dataset/ebPhotos-001/20230603_201646.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b5a1cbd5d2a0c0c251e7870b9d824161cbf6dda
--- /dev/null
+++ b/dataset/ebPhotos-001/20230603_201646.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a white ruched sleeveless dress and a sparkling necklace holding a black sign with gold text reading "I'm a Queen and I'm a Royal Disaster." She has wavy shoulder-length black hair and is smiling with white teeth. The background is a gold sequined curtain. The image has a slightly blurry vintage quality.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230810_145406.jpg b/dataset/ebPhotos-001/20230810_145406.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..412d2c1a68a14464f90506545a9462f79bfa3461
--- /dev/null
+++ b/dataset/ebPhotos-001/20230810_145406.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c863f82292aa2c81cf62f7c620b1e8ef4db9080aecd750cc0997bbd346a40bb2
+size 1503878
diff --git a/dataset/ebPhotos-001/20230810_145406.txt b/dataset/ebPhotos-001/20230810_145406.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7c0ee2c750b9b429d97f89167ef750e8c8c6dc02
--- /dev/null
+++ b/dataset/ebPhotos-001/20230810_145406.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with long straight black hair wearing a gray medical uniform and a silver necklace with a circular pendant standing in a beige-walled kitchen with red cabinets a microwave and a colorful painting of a European street scene in the background. She has a slight smile and is looking at the camera.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230810_145410.jpg b/dataset/ebPhotos-001/20230810_145410.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..acd00d1e2460cae898425abf4f9140476c8bfbc7
--- /dev/null
+++ b/dataset/ebPhotos-001/20230810_145410.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62d2a9d16f160e0bb905a61ab8a1382bf01200a5cc7c5d99affe0d82884caeb0
+size 1528267
diff --git a/dataset/ebPhotos-001/20230810_145410.txt b/dataset/ebPhotos-001/20230810_145410.txt
new file mode 100644
index 0000000000000000000000000000000000000000..da958ebcabcd9b62bd86854dc372168f520113d1
--- /dev/null
+++ b/dataset/ebPhotos-001/20230810_145410.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with straight shoulder-length black hair wearing a dark gray medical uniform with "TOMRA" and "PHLEBOTOMIST" text and a silver necklace with a circular pendant. She has a serious expression and is standing in a beige-walled room with wooden cabinets a black countertop and an art print on the wall. The image is a selfie.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230810_145413.jpg b/dataset/ebPhotos-001/20230810_145413.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5689e0ab37107b7f14c0563ebd8bed7b36118bdc
--- /dev/null
+++ b/dataset/ebPhotos-001/20230810_145413.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2af0b1286345533f0ead4cef0055aa903538c4b845c651db7b95d29b9e0d3b3b
+size 1555869
diff --git a/dataset/ebPhotos-001/20230810_145413.txt b/dataset/ebPhotos-001/20230810_145413.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0dc6248103677a2ebeb3b3672204d2d0ba290408
--- /dev/null
+++ b/dataset/ebPhotos-001/20230810_145413.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium-dark skin straight black hair and brown eyes wearing a dark gray uniform with "VOTTA" logo and a silver necklace with a round pendant. She is smiling slightly with a light beige wall and a colorful painting of a European street scene in the background. The setting appears to be an office or clinic with a brown cabinet and black countertop on the left.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230810_145423.jpg b/dataset/ebPhotos-001/20230810_145423.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d7369411dab02db90bc55368b1b203dfa3690202
--- /dev/null
+++ b/dataset/ebPhotos-001/20230810_145423.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:425651055eedf803dca69673d90247109b9463650a8f0a848ae36331ee22ad3c
+size 1548027
diff --git a/dataset/ebPhotos-001/20230810_145423.txt b/dataset/ebPhotos-001/20230810_145423.txt
new file mode 100644
index 0000000000000000000000000000000000000000..38eca5d2f54b642c080215b3645c026c090cbf06
--- /dev/null
+++ b/dataset/ebPhotos-001/20230810_145423.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin straight black hair and hoop earrings wearing a dark gray button-up shirt with "Zoamart" text and a silver necklace with a circular pendant. She is smiling slightly with a neutral background featuring beige walls and a brown cabinet. The image is a close-up selfie.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230810_145716(0).jpg b/dataset/ebPhotos-001/20230810_145716(0).jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6770d8c82009cfb353a99a43a4daae9adf7dc7fe
--- /dev/null
+++ b/dataset/ebPhotos-001/20230810_145716(0).jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7e27f4f4897a2e80a42ecabc3a5a2dab24cfccd763e1f8c9bbdd2ea8139adcb
+size 1598703
diff --git a/dataset/ebPhotos-001/20230810_145716(0).txt b/dataset/ebPhotos-001/20230810_145716(0).txt
new file mode 100644
index 0000000000000000000000000000000000000000..6578435040b2b1d42f783d4eb43aee9aa5c82d3e
--- /dev/null
+++ b/dataset/ebPhotos-001/20230810_145716(0).txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin and straight black hair wearing a dark gray uniform with "FEMA" and "Emergency Management" text and a silver necklace with a round medallion. She has hoop earrings and a neutral expression. Background includes a beige utility cabinet and a framed document. The setting appears to be an office.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125005.jpg b/dataset/ebPhotos-001/20230829_125005.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..14404005f59f80379f7c22100b617a0f42c18230
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125005.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d61165af867672691725546f028a620db0f62aaeacb2aa0abb4e99eb10d25f78
+size 1952883
diff --git a/dataset/ebPhotos-001/20230829_125005.txt b/dataset/ebPhotos-001/20230829_125005.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2b0909d969ce498ccdfb1d7363f64b76b0705c78
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125005.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a light blue button-up shirt black hair in a top knot small hoop earrings and a gold necklace with a circular pendant. She is smiling slightly standing by a calm lake with a wooden dock in the background surrounded by green trees under a clear blue sky. The image is a daytime outdoor selfie.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125017.jpg b/dataset/ebPhotos-001/20230829_125017.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2100c1f22e295aaed510353f4f6b6385934ab651
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125017.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61c02042ca2570ff8cd4e004820b08bd388a9a0e6c7a07f4f0100b757722fa41
+size 1778346
diff --git a/dataset/ebPhotos-001/20230829_125017.txt b/dataset/ebPhotos-001/20230829_125017.txt
new file mode 100644
index 0000000000000000000000000000000000000000..72f7c15dcb143920c093362823edcd2b1ac47634
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125017.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a light blue button-up shirt and gold necklace her hair in a high bun standing by a calm lake with a wooden dock in the background. She has hoop earrings and a subtle smile. The sky is clear blue with a few clouds and green trees line the lake's edge. The image is bright and sunny.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125018.jpg b/dataset/ebPhotos-001/20230829_125018.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d9cc6764dfa6909e654f54bf55bb8fdfabe70afe
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125018.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc3dfa4bc990cc4d72c87a66ba6ef98864e5a5e3fedc85ed301f16ff2fe0bcaa
+size 1946853
diff --git a/dataset/ebPhotos-001/20230829_125018.txt b/dataset/ebPhotos-001/20230829_125018.txt
new file mode 100644
index 0000000000000000000000000000000000000000..78821ba5725a73158f5f2de289441ed43c3b7fc3
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125018.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a light blue button-up shirt with small white polka dots and her black hair in a neat bun. She's standing on a wooden dock by a lake with a green railing and outdoor furniture in the background. She wears a green pendant necklace and small hoop earrings. The background includes trees a small boat and a red building. The image is a sunny clear day with vibrant colors.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125028.jpg b/dataset/ebPhotos-001/20230829_125028.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..458fcc6d284b98f20f66dde91a7d2c7bdaa69d5e
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125028.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:345f6554e35f38265dc53a3a5a84dfa806593039a87926b14cfe3f03a7ff04c0
+size 1687207
diff --git a/dataset/ebPhotos-001/20230829_125028.txt b/dataset/ebPhotos-001/20230829_125028.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6b72faff8abd6b9307ac5815dd7c620b2e19a7bf
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125028.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin and a neat bun hairstyle wearing a light blue button-up shirt standing on a wooden dock by a calm lake. She has a small smile and wears gold hoop earrings. The background includes a clear blue sky green trees and another wooden dock with a ladder. A turquoise pole is in the foreground. The image is bright and sunny.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125040.jpg b/dataset/ebPhotos-001/20230829_125040.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b5b32c9e6ae0c65e75e24136adcd22a80b147123
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125040.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17c15d690fc29229ac17752a6cb4b41dc721bb8986be9d6b7a9a5d425990adfd
+size 1814919
diff --git a/dataset/ebPhotos-001/20230829_125040.txt b/dataset/ebPhotos-001/20230829_125040.txt
new file mode 100644
index 0000000000000000000000000000000000000000..430fef8bb0f7b14b8c3b8525e1d55d790fd2bd14
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125040.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with a medium-dark skin tone wearing a light blue button-up shirt her hair in a neat bun smiling at the camera. She's standing on a green boat by a calm lake with a wooden dock and trees in the background. The sky is clear and blue and there are string lights overhead. The image is bright and sunny capturing a relaxed outdoor setting.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125041.jpg b/dataset/ebPhotos-001/20230829_125041.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ea6e3e8069e9309ee420ba05f08163575d160f95
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125041.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f8aedc2b511ee3cc66167fcc4d7bfd6acc285cb122fff38aa9cbba2318cdbaa
+size 2069509
diff --git a/dataset/ebPhotos-001/20230829_125041.txt b/dataset/ebPhotos-001/20230829_125041.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7a4d826b9f2265687aa6361fe72c3e22542c7f78
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125041.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a light blue button-up shirt and her hair in a neat bun. She's taking a selfie on a sunny day standing on a wooden dock by a greenish lake. In the background there are boats a sign and trees. She wears small hoop earrings and a subtle necklace. The dock has a turquoise railing and a beige chair is visible to the left.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125125.jpg b/dataset/ebPhotos-001/20230829_125125.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8e52db8d1ddb26e10f5aabf86f83d5c05f422448
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125125.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3d2b6ded056819a063b1a3c3faf82b76026ef09af8f197a22b38f2a2f58b5be
+size 1479078
diff --git a/dataset/ebPhotos-001/20230829_125125.txt b/dataset/ebPhotos-001/20230829_125125.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b45c178dca3326b51b4d354338d41a5cc6f9b37f
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125125.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a light blue button-up shirt her hair in a neat bun smiling at the camera standing on a wooden dock by a lake green water two small wooden docks in the background green and turquoise poles sunlight green bracelet on her wrist clear sky trees in the distance.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125815.jpg b/dataset/ebPhotos-001/20230829_125815.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a007e92633037da62d6b67b0926d43a0f8f27b8b
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125815.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:377f3b1e47d3cab2025704ab1481a4d92c92aaad6f92bf9076d2c92af7b52437
+size 1912975
diff --git a/dataset/ebPhotos-001/20230829_125815.txt b/dataset/ebPhotos-001/20230829_125815.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1d879e2625454fb90ce1c1db81647753f33713a3
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125815.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a light blue button-up shirt black hair in a top bun and hoop earrings taking a selfie by a lake. She's standing on a green-painted dock with a wooden pier and boats in the background. Trees and a clear blue sky are visible in the distance. She has a subtle smile and a green necklace. The water is calm and greenish.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125818.jpg b/dataset/ebPhotos-001/20230829_125818.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bc1d057931557599cdd52fde1ca3af0148458818
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125818.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12fa3616c0a634d2dee3c54ff969340f031d01f9c1f0901f2ce25c4b06725d7b
+size 1795770
diff --git a/dataset/ebPhotos-001/20230829_125818.txt b/dataset/ebPhotos-001/20230829_125818.txt
new file mode 100644
index 0000000000000000000000000000000000000000..baeb74a0516d0ef43d70e7c000bad63d6efcea5b
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125818.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin and a high bun hairstyle wearing a light blue button-up shirt and gold hoop earrings standing by a calm lake with a wooden dock in the background. She has a subtle smile and is looking at the camera. The sky is clear with light clouds and green trees line the lake's edge. The image is bright and sunny.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125837.jpg b/dataset/ebPhotos-001/20230829_125837.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..948355d872966a28c30499dffa26d35cc04343e8
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125837.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54a0feb82c5268ed886fe7a9d1f13c49a57705c10562cdf8491d9e5da0bf034c
+size 1896931
diff --git a/dataset/ebPhotos-001/20230829_125837.txt b/dataset/ebPhotos-001/20230829_125837.txt
new file mode 100644
index 0000000000000000000000000000000000000000..98dc169f1137519301045b795776329e81479787
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125837.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a light blue button-up shirt with small white dots and a black bun hairstyle. She has hoop earrings and a subtle smile. She's standing on a green-painted dock by a calm lake with a wooden pier and small boats in the background. Trees and a clear blue sky are visible in the distance. The image is a daytime outdoor selfie.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230829_125843.jpg b/dataset/ebPhotos-001/20230829_125843.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0f6116a37a07f1810a1ac6b2d9d7faed47ad0b7f
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125843.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be53fe1c6fd5cda5cee4b60f40d4b59922efa4295c21b82c1e5a4542d9d3ee16
+size 1838809
diff --git a/dataset/ebPhotos-001/20230829_125843.txt b/dataset/ebPhotos-001/20230829_125843.txt
new file mode 100644
index 0000000000000000000000000000000000000000..918186d764df5c9bf45a875a1d39ea8f2b63616d
--- /dev/null
+++ b/dataset/ebPhotos-001/20230829_125843.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a light blue button-up shirt her hair in a neat bun standing on a turquoise-painted wooden dock by a calm lake. She has hoop earrings and a slight smile. The background includes a clear blue sky green trees and another dock with small boats. The image is bright and sunny capturing a peaceful outdoor setting.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230904_222403.jpg b/dataset/ebPhotos-001/20230904_222403.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..44f5c32153cc644bfb96dcb888aa344dc4c2e3ec
--- /dev/null
+++ b/dataset/ebPhotos-001/20230904_222403.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d91d693050533558e8da2e0bf14d6f4b778436ade4125f8cdac1a5d3c2fdb53
+size 1562901
diff --git a/dataset/ebPhotos-001/20230904_222403.txt b/dataset/ebPhotos-001/20230904_222403.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e0ff56c9549d8234a036e71f90dd3dd3f5b4c869
--- /dev/null
+++ b/dataset/ebPhotos-001/20230904_222403.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a white lace-trimmed nightgown lying in bed pouting her lips. She has her hair in a high bun and her brown eyes look at the camera. The background includes wooden paneling and a white pillow. The lighting is soft highlighting her smooth skin and subtle makeup. The bed has a white sheet and a small pile of clothes is visible in the background.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230913_150216.jpg b/dataset/ebPhotos-001/20230913_150216.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..70965035330cddb522f931048d015ca492398462
--- /dev/null
+++ b/dataset/ebPhotos-001/20230913_150216.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2a9b1a0d234c1fa89029e48e6e4153ecf6fbac3a5c17393eb1886eb0807fb68
+size 1422403
diff --git a/dataset/ebPhotos-001/20230913_150216.txt b/dataset/ebPhotos-001/20230913_150216.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0f8b4c1c7724e58692f41432b0e355c661599b45
--- /dev/null
+++ b/dataset/ebPhotos-001/20230913_150216.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a black top and a gold chain necklace with a cross pendant her hair is in a neat bun. She's leaning on her right cheek smiling subtly. She wears a gold watch on her left wrist. The background shows stacked cardboard boxes and a window with greenery outside. The setting appears to be an office or storage room.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230922_214427.jpg b/dataset/ebPhotos-001/20230922_214427.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3c26734675441307cb70d3e77c14005fee74f12f
--- /dev/null
+++ b/dataset/ebPhotos-001/20230922_214427.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29662166963c387bd886dc160a1b6847bba1329a33971e64784cf444e0d53911
+size 3424248
diff --git a/dataset/ebPhotos-001/20230922_214427.txt b/dataset/ebPhotos-001/20230922_214427.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0dd6b83fec17d1ab5ab5380a9771bf24326e23b5
--- /dev/null
+++ b/dataset/ebPhotos-001/20230922_214427.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin and wavy black hair smiling widely while taking a mirror selfie in a bathroom. She wears a white tank top and white pants with a yellow bracelet on her right wrist. The bathroom has beige walls a granite countertop a white toilet and a black shower curtain. A black dress hangs on a door and various toiletries are on the counter. She holds a black smartphone in her right hand.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20230922_214533.jpg b/dataset/ebPhotos-001/20230922_214533.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0c9283de14792767ca65d1a29329491e2547ee55
--- /dev/null
+++ b/dataset/ebPhotos-001/20230922_214533.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b38c3603a214f00dc2d474ffd75d95858c9f3832c7694d36a43f55d6ff495cd6
+size 3478196
diff --git a/dataset/ebPhotos-001/20230922_214533.txt b/dataset/ebPhotos-001/20230922_214533.txt
new file mode 100644
index 0000000000000000000000000000000000000000..89c792bb51403694b8df512c3607e819c51b211e
--- /dev/null
+++ b/dataset/ebPhotos-001/20230922_214533.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with shoulder-length curly hair smiling widely taking a mirror selfie in a bathroom. She's wearing a white spaghetti strap top revealing a tattoo on her right arm. Her phone has a blue case. The bathroom counter is cluttered with toiletries including a clear plastic bag and a brown bottle. The background shows a toilet and a dark shower curtain. The woman's skin is smooth and she has a yellow bracelet on her left wrist. The image is brightly lit and captures a casual candid
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20231213_221512.jpg b/dataset/ebPhotos-001/20231213_221512.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fae719210a561f6e0c38205439d598d5972a4204
--- /dev/null
+++ b/dataset/ebPhotos-001/20231213_221512.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0bc4c4d494d97ee9ec9c011c4e968eb67677760b5fbb1879701f49a387004822
+size 2032075
diff --git a/dataset/ebPhotos-001/20231213_221512.txt b/dataset/ebPhotos-001/20231213_221512.txt
new file mode 100644
index 0000000000000000000000000000000000000000..be67d8c9408e2b0d207b3df6b753749ed0427dc2
--- /dev/null
+++ b/dataset/ebPhotos-001/20231213_221512.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with long black hair standing in a bathroom wearing a red plaid schoolgirl outfit with a short skirt and a matching tie partially lifting her white crop top to reveal her midriff smiling at the camera beige walls white door shower curtain and granite countertop in the background.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20231213_221540.jpg b/dataset/ebPhotos-001/20231213_221540.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cd5cba4ab7c8a069b39eafedbafa801d8f806628
--- /dev/null
+++ b/dataset/ebPhotos-001/20231213_221540.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ae6e9449395192dae021d01b0573bae775bb58c27d06fec6b3773a35d1dc9df
+size 1999043
diff --git a/dataset/ebPhotos-001/20231213_221540.txt b/dataset/ebPhotos-001/20231213_221540.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2b185e4290e6a889673e9fe6e8298c44ad41ddc0
--- /dev/null
+++ b/dataset/ebPhotos-001/20231213_221540.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin and long black hair standing in a bathroom. she's wearing a red plaid schoolgirl outfit with a tied white top revealing her midriff and cleavage and a matching short skirt. she holds the top smiling slightly. the background includes a white shower curtain a closed door and bathroom counter with toiletries. the lighting is soft and the overall mood is playful and confident.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20231213_221623.jpg b/dataset/ebPhotos-001/20231213_221623.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3d8c82d37bdc9dd35af48bce065dd3ca99b799e8
--- /dev/null
+++ b/dataset/ebPhotos-001/20231213_221623.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c2fae17aa84ac8b665917910f8db360fe48d04a085b8ee7860b8ab29677d64f
+size 1765801
diff --git a/dataset/ebPhotos-001/20231213_221623.txt b/dataset/ebPhotos-001/20231213_221623.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9b609131ed0648e2900a2e4590e41c6342ae58e8
--- /dev/null
+++ b/dataset/ebPhotos-001/20231213_221623.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin and shoulder-length black hair wearing a red plaid schoolgirl outfit with a tied white top revealing her cleavage and a short plaid skirt. She stands in a bathroom with white doors a shower curtain and granite countertop. She smiles slightly holding her top with a green bracelet on her wrist. The lighting is soft and the overall mood is playful and seductive.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20231213_221740.jpg b/dataset/ebPhotos-001/20231213_221740.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8c6148003c9139f929b48eb0760e6628dd56906a
--- /dev/null
+++ b/dataset/ebPhotos-001/20231213_221740.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a17af05e2736563edd1ec9296f331486161e4d8d863d0f13ae2c92157a09eb9
+size 1815408
diff --git a/dataset/ebPhotos-001/20231213_221740.txt b/dataset/ebPhotos-001/20231213_221740.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6495748d6bc9ba679858662c547733d9db996ba4
--- /dev/null
+++ b/dataset/ebPhotos-001/20231213_221740.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with short curly hair wearing a red plaid schoolgirl outfit with a tied white top exposing her midriff and small breasts. She stands in a bedroom with beige walls white door and white curtain hands behind her head smiling confidently. The outfit includes a short skirt and suspenders. The lighting is soft highlighting her smooth dark skin.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20231213_221801.jpg b/dataset/ebPhotos-001/20231213_221801.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..25c5191f4c385b2c639292f8e95e1e6ff645dd92
--- /dev/null
+++ b/dataset/ebPhotos-001/20231213_221801.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ea92658d09057358ef9a086c4796e05e1df4b4a31dedca78a17fe98713e46ad
+size 1960043
diff --git a/dataset/ebPhotos-001/20231213_221801.txt b/dataset/ebPhotos-001/20231213_221801.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0a3241960a84705152ee40cfce9b27dd3ec35983
--- /dev/null
+++ b/dataset/ebPhotos-001/20231213_221801.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin wearing a red plaid schoolgirl outfit with a white tied crop top short pleated skirt and suspenders. She has black hair in a side ponytail one arm raised and a playful expression. Background includes a white door beige walls and white curtain. She wears a green bracelet on her right wrist. The setting is a bathroom.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20240108_182603.jpg b/dataset/ebPhotos-001/20240108_182603.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9accfd46f2fe292768bff99f7c9e734911af19f5
--- /dev/null
+++ b/dataset/ebPhotos-001/20240108_182603.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf58b9e65f06ac6db8ea27b35326777b5c4d687f8c504797cc2b08359c20c475
+size 2128505
diff --git a/dataset/ebPhotos-001/20240108_182603.txt b/dataset/ebPhotos-001/20240108_182603.txt
new file mode 100644
index 0000000000000000000000000000000000000000..931d7dc823bb2728de95706119243843953570aa
--- /dev/null
+++ b/dataset/ebPhotos-001/20240108_182603.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with braided hair wearing a green and black patterned shawl over a black sweater standing in a shoe store. She has a playful expression sticking out her tongue. The background shows rows of neatly arranged shoes on red shelves. She wears small hoop earrings and has a light brown skin tone. The image is a selfie capturing her from the waist up.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20240108_182606.jpg b/dataset/ebPhotos-001/20240108_182606.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3a418bee74abcab60f497db2e47ed7e52d2d8dc9
--- /dev/null
+++ b/dataset/ebPhotos-001/20240108_182606.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f658a4f09fd5d998a6a6eebffbdeb877dd470d53e4ec29826515adddc8910d77
+size 2373918
diff --git a/dataset/ebPhotos-001/20240108_182606.txt b/dataset/ebPhotos-001/20240108_182606.txt
new file mode 100644
index 0000000000000000000000000000000000000000..56486715a228f9b5094e121e4d98552ff2aa69f7
--- /dev/null
+++ b/dataset/ebPhotos-001/20240108_182606.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with a medium-dark skin tone wearing a green and brown patterned shawl smiling with her tongue slightly out. She has her hair pulled back and is standing in a hardware store with a wooden pegboard background displaying various tools. Signs in red and white are visible above the pegboard. She is wearing a dark textured sweater. The image is a selfie capturing her from the waist up.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20240120_204005.jpg b/dataset/ebPhotos-001/20240120_204005.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b9bf630776a14ec7265c8d23923c1cc135174c31
--- /dev/null
+++ b/dataset/ebPhotos-001/20240120_204005.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a292d71ad9327ce2c4b21dcd3550538436b9f174e5fce23f8e583acb553fa02
+size 2045225
diff --git a/dataset/ebPhotos-001/20240120_204005.txt b/dataset/ebPhotos-001/20240120_204005.txt
new file mode 100644
index 0000000000000000000000000000000000000000..72961a449ce584ff6a8fab89ed0bf4ee9eee7b6f
--- /dev/null
+++ b/dataset/ebPhotos-001/20240120_204005.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin black shoulder-length hair and a tattoo on her right arm. She is wearing a red and orange striped sari with one shoulder exposed. She has a playful expression with puckered lips. The background shows a beige-walled room with a white door and ceiling lights. A black mirror frame is visible on the left. The image is a selfie.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20240126_210211.jpg b/dataset/ebPhotos-001/20240126_210211.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2514eda7e7f48a8bbb3e8fb3c77e446643e48b5f
--- /dev/null
+++ b/dataset/ebPhotos-001/20240126_210211.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57206ec4549a3a26d301fef369d29dd75e6ca9081364d79b21c6f4a48c4e7956
+size 1945215
diff --git a/dataset/ebPhotos-001/20240126_210211.txt b/dataset/ebPhotos-001/20240126_210211.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a06ff06621204a716164fd9e15b84a273a2773a1
--- /dev/null
+++ b/dataset/ebPhotos-001/20240126_210211.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin and curly black hair standing in a narrow hallway. She wears a form-fitting strapless coral-red evening gown that highlights her curvy figure and medium-sized breasts. She poses with one hand touching her hair and the other on her head. The hallway has beige walls white doors and a patterned gray and white rug on the wooden floor. She stands in front of a closed white door with a glimpse of a brightly lit room in the background.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20240126_211631.jpg b/dataset/ebPhotos-001/20240126_211631.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7d780af41f5c67ca89abda465d0454ebdf1c1462
--- /dev/null
+++ b/dataset/ebPhotos-001/20240126_211631.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:550bf7f36fd4028a504afaa0f262ed3f38711aeb98884b72eb3bd75d33f46f7b
+size 1754997
diff --git a/dataset/ebPhotos-001/20240126_211631.txt b/dataset/ebPhotos-001/20240126_211631.txt
new file mode 100644
index 0000000000000000000000000000000000000000..38986e2cbab8cf97fc18a529d48fc8d39ba2d547
--- /dev/null
+++ b/dataset/ebPhotos-001/20240126_211631.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin standing in a narrow hallway. she has wavy black hair wearing a tight black leather mini dress with spaghetti straps accentuating her slim toned physique and small breasts. she's posing with one hand on the wall and the other on her hip showing a tattoo on her left thigh. the hallway has beige walls white doors and a wooden floor with a patterned rug. a ceiling light illuminates her.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20240126_211810.jpg b/dataset/ebPhotos-001/20240126_211810.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0c77775e1eeec08ac0b63ef27ed05576289c7869
--- /dev/null
+++ b/dataset/ebPhotos-001/20240126_211810.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ed5b615b7d75a08297a291da7d8ea4248fd692cde797851fd06139b9e3016a6
+size 1927213
diff --git a/dataset/ebPhotos-001/20240126_211810.txt b/dataset/ebPhotos-001/20240126_211810.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bf220deb96c0480717babd9ba8604227e3fff996
--- /dev/null
+++ b/dataset/ebPhotos-001/20240126_211810.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with curly hair wearing a tight black leather dress and black high-heeled sandals standing in a narrow hallway with beige walls and wooden floor. She has a tattoo on her left thigh and is leaning against the wall with one hand. A patterned rug is under her feet and a door with white blinds is visible in the background. A ceiling light illuminates the scene.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20240403_130236.jpg b/dataset/ebPhotos-001/20240403_130236.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..591cfdf4bf2d7f05dc46b7b1625cc3af36e9ef58
--- /dev/null
+++ b/dataset/ebPhotos-001/20240403_130236.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:589f292193fc34fb90988c645d0b345596b48607f8e965c07e8d0777a053abb3
+size 1572812
diff --git a/dataset/ebPhotos-001/20240403_130236.txt b/dataset/ebPhotos-001/20240403_130236.txt
new file mode 100644
index 0000000000000000000000000000000000000000..efebaa896aaacf91c0a593943fda09dd7d5c762c
--- /dev/null
+++ b/dataset/ebPhotos-001/20240403_130236.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with long braided hair wearing a white sleeveless top standing by a clear blue swimming pool. She has hoop earrings colorful bracelets and a silver necklace. In the background there's a rustic house with a shingled roof green fence and white pool chairs. She's smiling slightly looking at the camera with her right hand touching her hair. The sunlight casts shadows enhancing the bright sunny day.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20240403_130247.jpg b/dataset/ebPhotos-001/20240403_130247.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d88830c436d741aaef1eb262927ce3fac133d5e0
--- /dev/null
+++ b/dataset/ebPhotos-001/20240403_130247.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c02a038e4f2360ce10fa068b1442e092083157b8e93a51c0d28fb863bcf7b995
+size 1823126
diff --git a/dataset/ebPhotos-001/20240403_130247.txt b/dataset/ebPhotos-001/20240403_130247.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1b8ac22b481cc6c090fac43ce85316fe08989224
--- /dev/null
+++ b/dataset/ebPhotos-001/20240403_130247.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with long braided hair wearing a white sleeveless top taking a selfie by a clear blue pool. She has a neutral expression wearing colorful beaded bracelets. In the background there's a charming two-story house with a shingled roof surrounded by tall palm trees and greenery. The bright sunlight casts clear shadows enhancing the sunny relaxed atmosphere.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/20240403_130313.jpg b/dataset/ebPhotos-001/20240403_130313.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e95bf3287be439f03174c902d6542a49e0b4a274
--- /dev/null
+++ b/dataset/ebPhotos-001/20240403_130313.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab72d3bbb1dcb0440168bd9ba681758558a93dd055554bf2ea97be68a459d9e1
+size 1934690
diff --git a/dataset/ebPhotos-001/20240403_130313.txt b/dataset/ebPhotos-001/20240403_130313.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d0098738c100ac6131e1b796d086a4a36b72e4ea
--- /dev/null
+++ b/dataset/ebPhotos-001/20240403_130313.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with braided hair wearing a white top standing by a clear blue swimming pool. She smiles at the camera touching her hair with her right hand. In the background there's a charming gray-roofed house with tall palm trees and a green fence. The bright sunlight casts shadows enhancing the sunny relaxed atmosphere.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/Ebony2.JPG b/dataset/ebPhotos-001/Ebony2.JPG
new file mode 100644
index 0000000000000000000000000000000000000000..82b0557db2f382ce66ab0ca65769d85e479e0400
--- /dev/null
+++ b/dataset/ebPhotos-001/Ebony2.JPG
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e510630606515758bc87766eab33e47f68f6e8b51083bbcf143b64b7e0c03996
+size 53988
diff --git a/dataset/ebPhotos-001/Ebony2.txt b/dataset/ebPhotos-001/Ebony2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..393c92f79cf4b0101fb92a019a128d841e5f557e
--- /dev/null
+++ b/dataset/ebPhotos-001/Ebony2.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark brown skin curly black hair and a bright smile. She wears a purple top with a red and yellow beaded necklace. The background is a soft cloudy sky. The image is a close-up portrait capturing her cheerful expression and natural beauty.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/Ebony2upscaled.JPG b/dataset/ebPhotos-001/Ebony2upscaled.JPG
new file mode 100644
index 0000000000000000000000000000000000000000..9e5a77625ef9029aa393a80bfbb28a734c9faa68
--- /dev/null
+++ b/dataset/ebPhotos-001/Ebony2upscaled.JPG
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1b923891fc6511840b93b797272f7796dcafbb05eb0dbd61a8c63876035955d
+size 134026
diff --git a/dataset/ebPhotos-001/Ebony2upscaled.txt b/dataset/ebPhotos-001/Ebony2upscaled.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0d79032294cb0f1024342496f1be313d5954e9c7
--- /dev/null
+++ b/dataset/ebPhotos-001/Ebony2upscaled.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark brown skin curly hair and a bright smile. She wears a purple top with a red and yellow beaded necklace. The background is a cloudy light blue sky. The image is a close-up portrait capturing her cheerful expression and natural beauty.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/Ebony3upscaled.JPG b/dataset/ebPhotos-001/Ebony3upscaled.JPG
new file mode 100644
index 0000000000000000000000000000000000000000..552b3090837e5eef3b891f25aafd89137d57b1dc
--- /dev/null
+++ b/dataset/ebPhotos-001/Ebony3upscaled.JPG
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d656bda36048c9dceef5aac244b3f7ed59caf342d32d136715a17475e0b7ea30
+size 255807
diff --git a/dataset/ebPhotos-001/Ebony3upscaled.txt b/dataset/ebPhotos-001/Ebony3upscaled.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2ae4c4203cefccbc8e3433133670d9c7bd98667c
--- /dev/null
+++ b/dataset/ebPhotos-001/Ebony3upscaled.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark brown skin curly black hair and a bright smile. She wears a purple top with a red and yellow beaded necklace. The background is a cloudy light blue sky. The image is a close-up portrait capturing her cheerful expression and natural beauty.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/IMG_1056 - Copy.JPG b/dataset/ebPhotos-001/IMG_1056 - Copy.JPG
new file mode 100644
index 0000000000000000000000000000000000000000..1950342bcc10d1b98930939fb1807281171e8b42
--- /dev/null
+++ b/dataset/ebPhotos-001/IMG_1056 - Copy.JPG	
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92675d51208f815f5bc9ec3d615d764c48d91e5684e868200fe3f938cefd26cc
+size 512160
diff --git a/dataset/ebPhotos-001/IMG_1056 - Copy.txt b/dataset/ebPhotos-001/IMG_1056 - Copy.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f3248d78f2f8eea0a15c90c9bf72909e0be7e9ab
--- /dev/null
+++ b/dataset/ebPhotos-001/IMG_1056 - Copy.txt	
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin wearing a white bandage headwrap and a white button-up shirt smiling softly. She has a slender build and is standing against a plain light beige background. The image is well-lit with a soft natural light highlighting her face and upper body.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/IMG_1061.JPG b/dataset/ebPhotos-001/IMG_1061.JPG
new file mode 100644
index 0000000000000000000000000000000000000000..3c9ccd7d1941e67428064024af1ff2ddfc6d17f6
--- /dev/null
+++ b/dataset/ebPhotos-001/IMG_1061.JPG
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6c0effafa7e973dbb078b34e8a7e76f5d52748b7eb2ebca069e5a61ab93dcaf
+size 527185
diff --git a/dataset/ebPhotos-001/IMG_1061.txt b/dataset/ebPhotos-001/IMG_1061.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b254b1c6c2aa70864b5ab1d05ba325bbcad5febf
--- /dev/null
+++ b/dataset/ebPhotos-001/IMG_1061.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin wearing a white knit cap and a white button-up shirt smiling at the camera. She has straight black hair small silver braces on her teeth and is standing against a plain light gray background. The image is well-lit with a simple and clean composition.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/PXL_20240227_181252113a.jpg b/dataset/ebPhotos-001/PXL_20240227_181252113a.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3425f3c4403adc05875002a8dcd6d19f1635c2da
--- /dev/null
+++ b/dataset/ebPhotos-001/PXL_20240227_181252113a.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4761ddf6ac5cacf94c4ca24c24d7bb68c9e3f284988aaaacfdaaa880fc31e21b
+size 2324372
diff --git a/dataset/ebPhotos-001/PXL_20240227_181252113a.txt b/dataset/ebPhotos-001/PXL_20240227_181252113a.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d28d5f7393a9e7a8738bb91fc6ab2a56ed7031cb
--- /dev/null
+++ b/dataset/ebPhotos-001/PXL_20240227_181252113a.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin smiling standing on a wooden dock by a calm lake. She has shoulder-length curly black hair wears a yellow sweater black knitted cardigan dark purple leggings and black sneakers with yellow socks. Leafless trees and a clear blue sky are in the background. She is mid-step right hand raised left hand on her hip. The scene is sunny and serene.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/eb1.png b/dataset/ebPhotos-001/eb1.png
new file mode 100644
index 0000000000000000000000000000000000000000..74e3c857dace8d58d75832550918b650137a7b19
--- /dev/null
+++ b/dataset/ebPhotos-001/eb1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a56760e95dae70dd0fad56084317607c0c7820cd29afe1093e005c97d9a2383
+size 723369
diff --git a/dataset/ebPhotos-001/eb1.txt b/dataset/ebPhotos-001/eb1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0a9cccdc8b46197f99adc5deff9d8b57be0f7258
--- /dev/null
+++ b/dataset/ebPhotos-001/eb1.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin shoulder-length wavy black hair standing outdoors on a sunny day. She wears a beige sleeveless top with black vertical stripes black shiny leather leggings and black strappy high heels. She holds up one sleeve with her right hand and touches her hair with her left. She has a green bracelet on her left wrist and a watch on her right wrist. The background shows a tree-lined sidewalk with a blurred building and greenery. She looks confident and stylish.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/eb_jpg - Copy.jpg b/dataset/ebPhotos-001/eb_jpg - Copy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c700b1ee25bf6b8ed55911f447f8450f2f93875d
--- /dev/null
+++ b/dataset/ebPhotos-001/eb_jpg - Copy.jpg	
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc58afe3fb485fb78dc1415fd2885ce4eda13ddafd27143e63f8cf177e08cd14
+size 3100
diff --git a/dataset/ebPhotos-001/eb_jpg - Copy.txt b/dataset/ebPhotos-001/eb_jpg - Copy.txt
new file mode 100644
index 0000000000000000000000000000000000000000..06f7130606871041b56e0a75a03394ecbb1dc917
--- /dev/null
+++ b/dataset/ebPhotos-001/eb_jpg - Copy.txt	
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin and wavy black hair wearing a black short-sleeved button-up shirt standing outdoors in a blurred green forest background. She has a thoughtful expression with her right hand gently touching her chin. She wears a blue wristwatch on her left wrist. The image is sharp with a soft-focus background.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/eb_jpg.txt b/dataset/ebPhotos-001/eb_jpg.txt
new file mode 100644
index 0000000000000000000000000000000000000000..06f7130606871041b56e0a75a03394ecbb1dc917
--- /dev/null
+++ b/dataset/ebPhotos-001/eb_jpg.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin and wavy black hair wearing a black short-sleeved button-up shirt standing outdoors in a blurred green forest background. She has a thoughtful expression with her right hand gently touching her chin. She wears a blue wristwatch on her left wrist. The image is sharp with a soft-focus background.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/eb_jpg.webp b/dataset/ebPhotos-001/eb_jpg.webp
new file mode 100644
index 0000000000000000000000000000000000000000..c700b1ee25bf6b8ed55911f447f8450f2f93875d
--- /dev/null
+++ b/dataset/ebPhotos-001/eb_jpg.webp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc58afe3fb485fb78dc1415fd2885ce4eda13ddafd27143e63f8cf177e08cd14
+size 3100
diff --git a/dataset/ebPhotos-001/ebony5.jpg b/dataset/ebPhotos-001/ebony5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..543c70b03942166e8ae1b34b87a1a7c205d13641
--- /dev/null
+++ b/dataset/ebPhotos-001/ebony5.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6af0ff17730636be15676d027c9497434f747efb72219bc1a957bc452b598ba
+size 7718
diff --git a/dataset/ebPhotos-001/ebony5.txt b/dataset/ebPhotos-001/ebony5.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3be97a67416d45b53fd80e377f81ed80130686c5
--- /dev/null
+++ b/dataset/ebPhotos-001/ebony5.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with dark skin smiling wearing a gray patterned dress with one shoulder down silver earrings and a beaded necklace. She has braided hair and stands against a beige textured wall in a dimly lit room. The background is dark and indistinct.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/ebsxy.jpg b/dataset/ebPhotos-001/ebsxy.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4b159ab169377d9f55ca7fbcfdb65d449eecf4fc
--- /dev/null
+++ b/dataset/ebPhotos-001/ebsxy.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0755fc7b7ca3e6d41722472290fa8bec8529ba2d7bd69bf89c10b4bcf70b3922
+size 94571
diff --git a/dataset/ebPhotos-001/ebsxy.txt b/dataset/ebPhotos-001/ebsxy.txt
new file mode 100644
index 0000000000000000000000000000000000000000..49b3a4ade999c47db5212aec7e70820332ba5ee7
--- /dev/null
+++ b/dataset/ebPhotos-001/ebsxy.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with medium brown skin and long black hair standing in a bathroom. she's wearing a revealing red plaid schoolgirl outfit with a tied white top exposing her midriff and cleavage and a matching short skirt. she holds the top smiling slightly. the bathroom has a white shower curtain beige walls and white door in the background. a mirror and countertop are partially visible on the right.
\ No newline at end of file
diff --git a/dataset/ebPhotos-001/ebsxy2.jpg b/dataset/ebPhotos-001/ebsxy2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ceb8ef096e7d5cc5392f5cc62f01ef126337781b
--- /dev/null
+++ b/dataset/ebPhotos-001/ebsxy2.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dba4569a1c2209edb7cde62b386e7fadcfbfd83fb675e25549fcdfe37051933
+size 112587
diff --git a/dataset/ebPhotos-001/ebsxy2.txt b/dataset/ebPhotos-001/ebsxy2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3b6dc82e3eba24bf9d6455d83b9c987e81b72a9d
--- /dev/null
+++ b/dataset/ebPhotos-001/ebsxy2.txt
@@ -0,0 +1 @@
+photo of a beautiful black woman with short curly hair standing in a bedroom. She wears a red plaid schoolgirl outfit with a tied white crop top revealing her midriff and matching skirt. Her hands are behind her head and she smiles at the camera. The background includes a white door beige walls and a white curtain. The lighting is soft and the overall mood is playful and confident.
\ No newline at end of file
diff --git a/dataset/huggingfacetoml.toml b/dataset/huggingfacetoml.toml
new file mode 100644
index 0000000000000000000000000000000000000000..0b302c606b930d6ebc3ded896474832fa6365f40
--- /dev/null
+++ b/dataset/huggingfacetoml.toml
@@ -0,0 +1,11 @@
+[general]
+resolution = [960, 544]
+caption_extension = ".txt"
+batch_size = 1
+enable_bucket = true
+bucket_no_upscale = false
+
+[[datasets]]
+image_directory = "dataset/ebPhotos-001"
+cache_directory = "/cache_directory"
+num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
diff --git a/dataset/image_video_dataset.py b/dataset/image_video_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce8eb52a6b580b9b2d5755ee2b4c168714cc3ad7
--- /dev/null
+++ b/dataset/image_video_dataset.py
@@ -0,0 +1,1400 @@
+from concurrent.futures import ThreadPoolExecutor
+import glob
+import json
+import math
+import os
+import random
+import time
+from typing import Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+from safetensors.torch import save_file, load_file
+from safetensors import safe_open
+from PIL import Image
+import cv2
+import av
+
+from utils import safetensors_utils
+from utils.model_utils import dtype_to_str
+
+import logging
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# NOTE(review): this runs at import time and sets up root logging at INFO level;
+# per the logging docs it does nothing if the root logger already has handlers.
+logging.basicConfig(level=logging.INFO)
+
+
+IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".PNG", ".JPG", ".JPEG", ".WEBP", ".BMP"]
+
+try:
+    import pillow_avif
+
+    IMAGE_EXTENSIONS.extend([".avif", ".AVIF"])
+except:
+    pass
+
+# JPEG-XL on Linux
+try:
+    from jxlpy import JXLImagePlugin
+
+    IMAGE_EXTENSIONS.extend([".jxl", ".JXL"])
+except:
+    pass
+
+# JPEG-XL on Windows
+try:
+    import pillow_jxl
+
+    IMAGE_EXTENSIONS.extend([".jxl", ".JXL"])
+except:
+    pass
+
+VIDEO_EXTENSIONS = [
+    ".mp4",
+    ".webm",
+    ".avi",
+    ".mkv",
+    ".mov",
+    ".flv",
+    ".wmv",
+    ".m4v",
+    ".mpg",
+    ".mpeg",
+    ".MP4",
+    ".WEBM",
+    ".AVI",
+    ".MKV",
+    ".MOV",
+    ".FLV",
+    ".WMV",
+    ".M4V",
+    ".MPG",
+    ".MPEG",
+]  # some of them are not tested
+
+# Architecture identifiers.
+# The short names are what BucketSelector compares its `architecture` argument against.
+# The *_FULL names are written to the "architecture" metadata entry of the cache files.
+ARCHITECTURE_HUNYUAN_VIDEO = "hv"
+ARCHITECTURE_HUNYUAN_VIDEO_FULL = "hunyuan_video"
+ARCHITECTURE_WAN = "wan"
+ARCHITECTURE_WAN_FULL = "wan"
+
+
+def glob_images(directory, base="*"):
+    img_paths = []
+    for ext in IMAGE_EXTENSIONS:
+        if base == "*":
+            img_paths.extend(glob.glob(os.path.join(glob.escape(directory), base + ext)))
+        else:
+            img_paths.extend(glob.glob(glob.escape(os.path.join(directory, base + ext))))
+    img_paths = list(set(img_paths))  # remove duplicates
+    img_paths.sort()
+    return img_paths
+
+
+def glob_videos(directory, base="*"):
+    video_paths = []
+    for ext in VIDEO_EXTENSIONS:
+        if base == "*":
+            video_paths.extend(glob.glob(os.path.join(glob.escape(directory), base + ext)))
+        else:
+            video_paths.extend(glob.glob(glob.escape(os.path.join(directory, base + ext))))
+    video_paths = list(set(video_paths))  # remove duplicates
+    video_paths.sort()
+    return video_paths
+
+
+def divisible_by(num: int, divisor: int) -> int:
+    return num - num % divisor
+
+
+def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: tuple[int, int]) -> np.ndarray:
+    """
+    Resize the image to the bucket resolution.
+    """
+    is_pil_image = isinstance(image, Image.Image)
+    if is_pil_image:
+        image_width, image_height = image.size
+    else:
+        image_height, image_width = image.shape[:2]
+
+    if bucket_reso == (image_width, image_height):
+        return np.array(image) if is_pil_image else image
+
+    bucket_width, bucket_height = bucket_reso
+    if bucket_width == image_width or bucket_height == image_height:
+        image = np.array(image) if is_pil_image else image
+    else:
+        # resize the image to the bucket resolution to match the short side
+        scale_width = bucket_width / image_width
+        scale_height = bucket_height / image_height
+        scale = max(scale_width, scale_height)
+        image_width = int(image_width * scale + 0.5)
+        image_height = int(image_height * scale + 0.5)
+
+        if scale > 1:
+            image = Image.fromarray(image) if not is_pil_image else image
+            image = image.resize((image_width, image_height), Image.LANCZOS)
+            image = np.array(image)
+        else:
+            image = np.array(image) if is_pil_image else image
+            image = cv2.resize(image, (image_width, image_height), interpolation=cv2.INTER_AREA)
+
+    # crop the image to the bucket resolution
+    crop_left = (image_width - bucket_width) // 2
+    crop_top = (image_height - bucket_height) // 2
+    image = image[crop_top : crop_top + bucket_height, crop_left : crop_left + bucket_width]
+    return image
+
+
+class ItemInfo:
+    def __init__(
+        self,
+        item_key: str,
+        caption: str,
+        original_size: tuple[int, int],
+        bucket_size: Optional[Union[tuple[int, int], tuple[int, int, int]]] = None,
+        frame_count: Optional[int] = None,
+        content: Optional[np.ndarray] = None,
+        latent_cache_path: Optional[str] = None,
+    ) -> None:
+        self.item_key = item_key
+        self.caption = caption
+        self.original_size = original_size
+        self.bucket_size = bucket_size
+        self.frame_count = frame_count
+        self.content = content
+        self.latent_cache_path = latent_cache_path
+        self.text_encoder_output_cache_path: Optional[str] = None
+
+    def __str__(self) -> str:
+        return (
+            f"ItemInfo(item_key={self.item_key}, caption={self.caption}, "
+            + f"original_size={self.original_size}, bucket_size={self.bucket_size}, "
+            + f"frame_count={self.frame_count}, latent_cache_path={self.latent_cache_path})"
+        )
+
+
+# We use simple if-else approach to support multiple architectures.
+# Maybe we can use a plugin system in the future.
+
+# the keys of the dict are `<content_type>_FxHxW_<dtype>` for latents
+# and `<content_type>_<dtype|mask>` for other tensors
+
+
+def save_latent_cache(item_info: ItemInfo, latent: torch.Tensor):
+    """HunyuanVideo architecture only"""
+    assert latent.dim() == 4, "latent should be 4D tensor (frame, channel, height, width)"
+
+    _, F, H, W = latent.shape
+    dtype_str = dtype_to_str(latent.dtype)
+    sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu()}
+
+    save_latent_cache_common(item_info, sd, ARCHITECTURE_HUNYUAN_VIDEO_FULL)
+
+
+def save_latent_cache_wan(
+    item_info: ItemInfo, latent: torch.Tensor, clip_embed: Optional[torch.Tensor], image_latent: Optional[torch.Tensor]
+):
+    """Wan architecture only"""
+    assert latent.dim() == 4, "latent should be 4D tensor (frame, channel, height, width)"
+
+    _, F, H, W = latent.shape
+    dtype_str = dtype_to_str(latent.dtype)
+    sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu()}
+
+    if clip_embed is not None:
+        sd[f"clip_{dtype_str}"] = clip_embed.detach().cpu()
+
+    if image_latent is not None:
+        sd[f"latents_image_{F}x{H}x{W}_{dtype_str}"] = image_latent.detach().cpu()
+
+    save_latent_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
+
+
+def save_latent_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
+    metadata = {
+        "architecture": arch_fullname,
+        "width": f"{item_info.original_size[0]}",
+        "height": f"{item_info.original_size[1]}",
+        "format_version": "1.0.1",
+    }
+    if item_info.frame_count is not None:
+        metadata["frame_count"] = f"{item_info.frame_count}"
+
+    for key, value in sd.items():
+        # NaN check and show warning, replace NaN with 0
+        if torch.isnan(value).any():
+            logger.warning(f"{key} tensor has NaN: {item_info.item_key}, replace NaN with 0")
+            value[torch.isnan(value)] = 0
+
+    latent_dir = os.path.dirname(item_info.latent_cache_path)
+    os.makedirs(latent_dir, exist_ok=True)
+
+    save_file(sd, item_info.latent_cache_path, metadata=metadata)
+
+
+def save_text_encoder_output_cache(item_info: ItemInfo, embed: torch.Tensor, mask: Optional[torch.Tensor], is_llm: bool):
+    """HunyuanVideo architecture only"""
+    assert (
+        embed.dim() == 1 or embed.dim() == 2
+    ), f"embed should be 2D tensor (feature, hidden_size) or (hidden_size,), got {embed.shape}"
+    assert mask is None or mask.dim() == 1, f"mask should be 1D tensor (feature), got {mask.shape}"
+
+    sd = {}
+    dtype_str = dtype_to_str(embed.dtype)
+    text_encoder_type = "llm" if is_llm else "clipL"
+    sd[f"{text_encoder_type}_{dtype_str}"] = embed.detach().cpu()
+    if mask is not None:
+        sd[f"{text_encoder_type}_mask"] = mask.detach().cpu()
+
+    save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_HUNYUAN_VIDEO_FULL)
+
+
+def save_text_encoder_output_cache_wan(item_info: ItemInfo, embed: torch.Tensor):
+    """Wan architecture only. Wan2.1 only has a single text encoder"""
+
+    sd = {}
+    dtype_str = dtype_to_str(embed.dtype)
+    text_encoder_type = "t5"
+    sd[f"varlen_{text_encoder_type}_{dtype_str}"] = embed.detach().cpu()
+
+    save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
+
+
+def save_text_encoder_output_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
+    """
+    Write text encoder outputs to `item_info.text_encoder_output_cache_path`, merging with an existing file.
+
+    Args:
+        item_info: provides the cache path, the caption stored in the metadata and the key used in warnings.
+        sd: tensors to save, keyed by name. Modified in place: NaNs are zeroed, and tensors that
+            exist only in an already present cache file are added.
+        arch_fullname: value written to the "architecture" metadata entry.
+
+    Raises:
+        AssertionError: if an existing cache file records a different architecture.
+    """
+    for key, value in sd.items():
+        # NaN check and show warning, replace NaN with 0
+        if torch.isnan(value).any():
+            logger.warning(f"{key} tensor has NaN: {item_info.item_key}, replace NaN with 0")
+            value[torch.isnan(value)] = 0
+
+    metadata = {
+        "architecture": arch_fullname,
+        "caption1": item_info.caption,
+        "format_version": "1.0.1",
+    }
+
+    if os.path.exists(item_info.text_encoder_output_cache_path):
+        # load existing cache and update metadata
+        with safetensors_utils.MemoryEfficientSafeOpen(item_info.text_encoder_output_cache_path) as f:
+            existing_metadata = f.metadata()
+            for key in f.keys():
+                if key not in sd:  # avoid overwriting by existing cache, we keep the new one
+                    sd[key] = f.get_tensor(key)
+
+        assert existing_metadata["architecture"] == metadata["architecture"], "architecture mismatch"
+        # a changed caption is only reported; the new caption is the one that gets written
+        if existing_metadata["caption1"] != metadata["caption1"]:
+            logger.warning(f"caption mismatch: existing={existing_metadata['caption1']}, new={metadata['caption1']}, overwrite")
+        # TODO verify format_version
+
+        existing_metadata.pop("caption1", None)
+        existing_metadata.pop("format_version", None)
+        metadata.update(existing_metadata)  # copy existing metadata except caption and format_version
+    else:
+        # first write for this item: make sure the target directory exists
+        text_encoder_output_dir = os.path.dirname(item_info.text_encoder_output_cache_path)
+        os.makedirs(text_encoder_output_dir, exist_ok=True)
+
+    safetensors_utils.mem_eff_save_file(sd, item_info.text_encoder_output_cache_path, metadata=metadata)
+
+
+class BucketSelector:
+    RESOLUTION_STEPS_HUNYUAN = 16
+    RESOLUTION_STEPS_WAN = 16
+
+    def __init__(
+        self, resolution: Tuple[int, int], enable_bucket: bool = True, no_upscale: bool = False, architecture: str = "no_default"
+    ):
+        self.resolution = resolution
+        self.bucket_area = resolution[0] * resolution[1]
+        self.architecture = architecture
+
+        if self.architecture == ARCHITECTURE_HUNYUAN_VIDEO:
+            self.reso_steps = BucketSelector.RESOLUTION_STEPS_HUNYUAN
+        elif self.architecture == ARCHITECTURE_WAN:
+            self.reso_steps = BucketSelector.RESOLUTION_STEPS_WAN
+        else:
+            raise ValueError(f"Invalid architecture: {self.architecture}")
+
+        if not enable_bucket:
+            # only define one bucket
+            self.bucket_resolutions = [resolution]
+            self.no_upscale = False
+        else:
+            # prepare bucket resolution
+            self.no_upscale = no_upscale
+            sqrt_size = int(math.sqrt(self.bucket_area))
+            min_size = divisible_by(sqrt_size // 2, self.reso_steps)
+            self.bucket_resolutions = []
+            for w in range(min_size, sqrt_size + self.reso_steps, self.reso_steps):
+                h = divisible_by(self.bucket_area // w, self.reso_steps)
+                self.bucket_resolutions.append((w, h))
+                self.bucket_resolutions.append((h, w))
+
+            self.bucket_resolutions = list(set(self.bucket_resolutions))
+            self.bucket_resolutions.sort()
+
+        # calculate aspect ratio to find the nearest resolution
+        self.aspect_ratios = np.array([w / h for w, h in self.bucket_resolutions])
+
+    def get_bucket_resolution(self, image_size: tuple[int, int]) -> tuple[int, int]:
+        """
+        return the bucket resolution for the given image size, (width, height)
+        """
+        area = image_size[0] * image_size[1]
+        if self.no_upscale and area <= self.bucket_area:
+            w, h = image_size
+            w = divisible_by(w, self.reso_steps)
+            h = divisible_by(h, self.reso_steps)
+            return w, h
+
+        aspect_ratio = image_size[0] / image_size[1]
+        ar_errors = self.aspect_ratios - aspect_ratio
+        bucket_id = np.abs(ar_errors).argmin()
+        return self.bucket_resolutions[bucket_id]
+
+
+def load_video(
+    video_path: str,
+    start_frame: Optional[int] = None,
+    end_frame: Optional[int] = None,
+    bucket_selector: Optional[BucketSelector] = None,
+    bucket_reso: Optional[tuple[int, int]] = None,
+) -> list[np.ndarray]:
+    """
+    bucket_reso: if given, resize the video to the bucket resolution, (width, height)
+    """
+    container = av.open(video_path)
+    video = []
+    for i, frame in enumerate(container.decode(video=0)):
+        if start_frame is not None and i < start_frame:
+            continue
+        if end_frame is not None and i >= end_frame:
+            break
+        frame = frame.to_image()
+
+        if bucket_selector is not None and bucket_reso is None:
+            bucket_reso = bucket_selector.get_bucket_resolution(frame.size)
+
+        if bucket_reso is not None:
+            frame = resize_image_to_bucket(frame, bucket_reso)
+        else:
+            frame = np.array(frame)
+
+        video.append(frame)
+    container.close()
+    return video
+
+
+class BucketBatchManager:
+
+    def __init__(self, bucketed_item_info: dict[tuple[int, int], list[ItemInfo]], batch_size: int):
+        self.batch_size = batch_size
+        self.buckets = bucketed_item_info
+        self.bucket_resos = list(self.buckets.keys())
+        self.bucket_resos.sort()
+
+        self.bucket_batch_indices = []
+        for bucket_reso in self.bucket_resos:
+            bucket = self.buckets[bucket_reso]
+            num_batches = math.ceil(len(bucket) / self.batch_size)
+            for i in range(num_batches):
+                self.bucket_batch_indices.append((bucket_reso, i))
+
+        self.shuffle()
+
+    def show_bucket_info(self):
+        for bucket_reso in self.bucket_resos:
+            bucket = self.buckets[bucket_reso]
+            logger.info(f"bucket: {bucket_reso}, count: {len(bucket)}")
+
+        logger.info(f"total batches: {len(self)}")
+
+    def shuffle(self):
+        for bucket in self.buckets.values():
+            random.shuffle(bucket)
+        random.shuffle(self.bucket_batch_indices)
+
+    def __len__(self):
+        return len(self.bucket_batch_indices)
+
+    def __getitem__(self, idx):
+        bucket_reso, batch_idx = self.bucket_batch_indices[idx]
+        bucket = self.buckets[bucket_reso]
+        start = batch_idx * self.batch_size
+        end = min(start + self.batch_size, len(bucket))
+
+        batch_tensor_data = {}
+        varlen_keys = set()
+        for item_info in bucket[start:end]:
+            sd_latent = load_file(item_info.latent_cache_path)
+            sd_te = load_file(item_info.text_encoder_output_cache_path)
+            sd = {**sd_latent, **sd_te}
+
+            # TODO refactor this
+            for key in sd.keys():
+                is_varlen_key = key.startswith("varlen_")  # varlen keys are not stacked
+                content_key = key
+
+                if is_varlen_key:
+                    content_key = content_key.replace("varlen_", "")
+
+                if content_key.endswith("_mask"):
+                    pass
+                else:
+                    content_key = content_key.rsplit("_", 1)[0]  # remove dtype
+                    if content_key.startswith("latents_"):
+                        content_key = content_key.rsplit("_", 1)[0]  # remove FxHxW
+
+                if content_key not in batch_tensor_data:
+                    batch_tensor_data[content_key] = []
+                batch_tensor_data[content_key].append(sd[key])
+
+                if is_varlen_key:
+                    varlen_keys.add(content_key)
+
+        for key in batch_tensor_data.keys():
+            if key not in varlen_keys:
+                batch_tensor_data[key] = torch.stack(batch_tensor_data[key])
+
+        return batch_tensor_data
+
+
+class ContentDatasource:
+    """
+    Base class of the datasources in this module.
+
+    Subclasses in this file implement the iterator protocol so that each step yields a
+    zero-argument fetcher; `caption_only` selects whether it fetches captions or full content.
+    """
+
+    def __init__(self):
+        # when True, the subclasses' fetchers return only (path, caption)
+        self.caption_only = False
+
+    def set_caption_only(self, caption_only: bool):
+        """Switch between caption-only fetching and full content fetching."""
+        self.caption_only = caption_only
+
+    def is_indexable(self):
+        """Return whether index based access is supported; False here."""
+        return False
+
+    def get_caption(self, idx: int) -> tuple[str, str]:
+        """
+        Returns caption. May not be called if is_indexable() returns False.
+        """
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+    def __iter__(self):
+        raise NotImplementedError
+
+    def __next__(self):
+        raise NotImplementedError
+
+
+class ImageDatasource(ContentDatasource):
+    """Base class of the image datasources; adds the `get_image_data` accessor."""
+
+    def __init__(self):
+        super().__init__()
+
+    def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
+        """
+        Returns image data as a tuple of image path, image, and caption for the given index.
+        Key must be unique and valid as a file name.
+        May not be called if is_indexable() returns False.
+        """
+        raise NotImplementedError
+
+
+class ImageDirectoryDatasource(ImageDatasource):
+    def __init__(self, image_directory: str, caption_extension: Optional[str] = None):
+        super().__init__()
+        self.image_directory = image_directory
+        self.caption_extension = caption_extension
+        self.current_idx = 0
+
+        # glob images
+        logger.info(f"glob images in {self.image_directory}")
+        self.image_paths = glob_images(self.image_directory)
+        logger.info(f"found {len(self.image_paths)} images")
+
+    def is_indexable(self):
+        return True
+
+    def __len__(self):
+        return len(self.image_paths)
+
+    def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
+        image_path = self.image_paths[idx]
+        image = Image.open(image_path).convert("RGB")
+
+        _, caption = self.get_caption(idx)
+
+        return image_path, image, caption
+
+    def get_caption(self, idx: int) -> tuple[str, str]:
+        image_path = self.image_paths[idx]
+        caption_path = os.path.splitext(image_path)[0] + self.caption_extension if self.caption_extension else ""
+        with open(caption_path, "r", encoding="utf-8") as f:
+            caption = f.read().strip()
+        return image_path, caption
+
+    def __iter__(self):
+        self.current_idx = 0
+        return self
+
+    def __next__(self) -> callable:
+        """
+        Returns a fetcher function that returns image data.
+        """
+        if self.current_idx >= len(self.image_paths):
+            raise StopIteration
+
+        if self.caption_only:
+
+            def create_caption_fetcher(index):
+                return lambda: self.get_caption(index)
+
+            fetcher = create_caption_fetcher(self.current_idx)
+        else:
+
+            def create_image_fetcher(index):
+                return lambda: self.get_image_data(index)
+
+            fetcher = create_image_fetcher(self.current_idx)
+
+        self.current_idx += 1
+        return fetcher
+
+
+class ImageJsonlDatasource(ImageDatasource):
+    def __init__(self, image_jsonl_file: str):
+        super().__init__()
+        self.image_jsonl_file = image_jsonl_file
+        self.current_idx = 0
+
+        # load jsonl
+        logger.info(f"load image jsonl from {self.image_jsonl_file}")
+        self.data = []
+        with open(self.image_jsonl_file, "r", encoding="utf-8") as f:
+            for line in f:
+                try:
+                    data = json.loads(line)
+                except json.JSONDecodeError:
+                    logger.error(f"failed to load json: {line} @ {self.image_jsonl_file}")
+                    raise
+                self.data.append(data)
+        logger.info(f"loaded {len(self.data)} images")
+
+    def is_indexable(self):
+        return True
+
+    def __len__(self):
+        return len(self.data)
+
+    def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
+        data = self.data[idx]
+        image_path = data["image_path"]
+        image = Image.open(image_path).convert("RGB")
+
+        caption = data["caption"]
+
+        return image_path, image, caption
+
+    def get_caption(self, idx: int) -> tuple[str, str]:
+        data = self.data[idx]
+        image_path = data["image_path"]
+        caption = data["caption"]
+        return image_path, caption
+
+    def __iter__(self):
+        self.current_idx = 0
+        return self
+
+    def __next__(self) -> callable:
+        if self.current_idx >= len(self.data):
+            raise StopIteration
+
+        if self.caption_only:
+
+            def create_caption_fetcher(index):
+                return lambda: self.get_caption(index)
+
+            fetcher = create_caption_fetcher(self.current_idx)
+
+        else:
+
+            def create_fetcher(index):
+                return lambda: self.get_image_data(index)
+
+            fetcher = create_fetcher(self.current_idx)
+
+        self.current_idx += 1
+        return fetcher
+
+
+class VideoDatasource(ContentDatasource):
+    def __init__(self):
+        super().__init__()
+
+        # None means all frames
+        self.start_frame = None
+        self.end_frame = None
+
+        self.bucket_selector = None
+
+    def __len__(self):
+        raise NotImplementedError
+
+    def get_video_data_from_path(
+        self,
+        video_path: str,
+        start_frame: Optional[int] = None,
+        end_frame: Optional[int] = None,
+        bucket_selector: Optional[BucketSelector] = None,
+    ) -> tuple[str, list[Image.Image], str]:
+        # this method can resize the video if bucket_selector is given to reduce the memory usage
+
+        start_frame = start_frame if start_frame is not None else self.start_frame
+        end_frame = end_frame if end_frame is not None else self.end_frame
+        bucket_selector = bucket_selector if bucket_selector is not None else self.bucket_selector
+
+        video = load_video(video_path, start_frame, end_frame, bucket_selector)
+        return video
+
+    def set_start_and_end_frame(self, start_frame: Optional[int], end_frame: Optional[int]):
+        self.start_frame = start_frame
+        self.end_frame = end_frame
+
+    def set_bucket_selector(self, bucket_selector: BucketSelector):
+        self.bucket_selector = bucket_selector
+
+    def __iter__(self):
+        raise NotImplementedError
+
+    def __next__(self):
+        raise NotImplementedError
+
+
+class VideoDirectoryDatasource(VideoDatasource):
+    def __init__(self, video_directory: str, caption_extension: Optional[str] = None):
+        super().__init__()
+        self.video_directory = video_directory
+        self.caption_extension = caption_extension
+        self.current_idx = 0
+
+        # glob images
+        logger.info(f"glob images in {self.video_directory}")
+        self.video_paths = glob_videos(self.video_directory)
+        logger.info(f"found {len(self.video_paths)} videos")
+
+    def is_indexable(self):
+        return True
+
+    def __len__(self):
+        return len(self.video_paths)
+
+    def get_video_data(
+        self,
+        idx: int,
+        start_frame: Optional[int] = None,
+        end_frame: Optional[int] = None,
+        bucket_selector: Optional[BucketSelector] = None,
+    ) -> tuple[str, list[Image.Image], str]:
+        video_path = self.video_paths[idx]
+        video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)
+
+        _, caption = self.get_caption(idx)
+
+        return video_path, video, caption
+
+    def get_caption(self, idx: int) -> tuple[str, str]:
+        video_path = self.video_paths[idx]
+        caption_path = os.path.splitext(video_path)[0] + self.caption_extension if self.caption_extension else ""
+        with open(caption_path, "r", encoding="utf-8") as f:
+            caption = f.read().strip()
+        return video_path, caption
+
+    def __iter__(self):
+        self.current_idx = 0
+        return self
+
+    def __next__(self):
+        if self.current_idx >= len(self.video_paths):
+            raise StopIteration
+
+        if self.caption_only:
+
+            def create_caption_fetcher(index):
+                return lambda: self.get_caption(index)
+
+            fetcher = create_caption_fetcher(self.current_idx)
+
+        else:
+
+            def create_fetcher(index):
+                return lambda: self.get_video_data(index)
+
+            fetcher = create_fetcher(self.current_idx)
+
+        self.current_idx += 1
+        return fetcher
+
+
+class VideoJsonlDatasource(VideoDatasource):
+    def __init__(self, video_jsonl_file: str):
+        super().__init__()
+        self.video_jsonl_file = video_jsonl_file
+        self.current_idx = 0
+
+        # load jsonl
+        logger.info(f"load video jsonl from {self.video_jsonl_file}")
+        self.data = []
+        with open(self.video_jsonl_file, "r", encoding="utf-8") as f:
+            for line in f:
+                data = json.loads(line)
+                self.data.append(data)
+        logger.info(f"loaded {len(self.data)} videos")
+
+    def is_indexable(self):
+        return True
+
+    def __len__(self):
+        return len(self.data)
+
+    def get_video_data(
+        self,
+        idx: int,
+        start_frame: Optional[int] = None,
+        end_frame: Optional[int] = None,
+        bucket_selector: Optional[BucketSelector] = None,
+    ) -> tuple[str, list[Image.Image], str]:
+        data = self.data[idx]
+        video_path = data["video_path"]
+        video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)
+
+        caption = data["caption"]
+
+        return video_path, video, caption
+
+    def get_caption(self, idx: int) -> tuple[str, str]:
+        data = self.data[idx]
+        video_path = data["video_path"]
+        caption = data["caption"]
+        return video_path, caption
+
+    def __iter__(self):
+        self.current_idx = 0
+        return self
+
+    def __next__(self):
+        if self.current_idx >= len(self.data):
+            raise StopIteration
+
+        if self.caption_only:
+
+            def create_caption_fetcher(index):
+                return lambda: self.get_caption(index)
+
+            fetcher = create_caption_fetcher(self.current_idx)
+
+        else:
+
+            def create_fetcher(index):
+                return lambda: self.get_video_data(index)
+
+            fetcher = create_fetcher(self.current_idx)
+
+        self.current_idx += 1
+        return fetcher
+
+
+class BaseDataset(torch.utils.data.Dataset):
+    def __init__(
+        self,
+        resolution: Tuple[int, int] = (960, 544),
+        caption_extension: Optional[str] = None,
+        batch_size: int = 1,
+        num_repeats: int = 1,
+        enable_bucket: bool = False,
+        bucket_no_upscale: bool = False,
+        cache_directory: Optional[str] = None,
+        debug_dataset: bool = False,
+        architecture: str = "no_default",
+    ):
+        self.resolution = resolution
+        self.caption_extension = caption_extension
+        self.batch_size = batch_size
+        self.num_repeats = num_repeats
+        self.enable_bucket = enable_bucket
+        self.bucket_no_upscale = bucket_no_upscale
+        self.cache_directory = cache_directory
+        self.debug_dataset = debug_dataset
+        self.architecture = architecture
+        self.seed = None
+        self.current_epoch = 0
+
+        if not self.enable_bucket:
+            self.bucket_no_upscale = False
+
+    def get_metadata(self) -> dict:
+        metadata = {
+            "resolution": self.resolution,
+            "caption_extension": self.caption_extension,
+            "batch_size_per_device": self.batch_size,
+            "num_repeats": self.num_repeats,
+            "enable_bucket": bool(self.enable_bucket),
+            "bucket_no_upscale": bool(self.bucket_no_upscale),
+        }
+        return metadata
+
+    def get_all_latent_cache_files(self):
+        return glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))
+
+    def get_all_text_encoder_output_cache_files(self):
+        return glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}_te.safetensors"))
+
+    def get_latent_cache_path(self, item_info: ItemInfo) -> str:
+        """
+        Returns the cache path for the latent tensor.
+
+        item_info: ItemInfo object
+
+        Returns:
+            str: cache path
+
+        cache_path is based on the item_key and the resolution.
+        """
+        w, h = item_info.original_size
+        basename = os.path.splitext(os.path.basename(item_info.item_key))[0]
+        assert self.cache_directory is not None, "cache_directory is required / cache_directoryは必須です"
+        return os.path.join(self.cache_directory, f"{basename}_{w:04d}x{h:04d}_{self.architecture}.safetensors")
+
+    def get_text_encoder_output_cache_path(self, item_info: ItemInfo) -> str:
+        basename = os.path.splitext(os.path.basename(item_info.item_key))[0]
+        assert self.cache_directory is not None, "cache_directory is required / cache_directoryは必須です"
+        return os.path.join(self.cache_directory, f"{basename}_{self.architecture}_te.safetensors")
+
+    def retrieve_latent_cache_batches(self, num_workers: int):
+        """Abstract hook for producing latent-caching batches; subclasses must override it.
+
+        Raises:
+            NotImplementedError: always, in this base implementation.
+        """
+        raise NotImplementedError
+
+    def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
+        """Abstract hook for producing text-encoder-caching batches; subclasses must override it.
+
+        Raises:
+            NotImplementedError: always, in this base implementation.
+        """
+        raise NotImplementedError
+
+    def prepare_for_training(self):
+        """Hook called to get the dataset ready for training; the base implementation does nothing."""
+        pass
+
+    def set_seed(self, seed: int):
+        """Store the base random seed on the dataset (kept in ``self.seed``)."""
+        self.seed = seed
+
+    def set_current_epoch(self, epoch):
+        if not self.current_epoch == epoch:  # shuffle buckets when epoch is incremented
+            if epoch > self.current_epoch:
+                logger.info("epoch is incremented. current_epoch: {}, epoch: {}".format(self.current_epoch, epoch))
+                num_epochs = epoch - self.current_epoch
+                for _ in range(num_epochs):
+                    self.current_epoch += 1
+                    self.shuffle_buckets()
+                # self.current_epoch seem to be set to 0 again in the next epoch. it may be caused by skipped_dataloader?
+            else:
+                logger.warning("epoch is not incremented. current_epoch: {}, epoch: {}".format(self.current_epoch, epoch))
+                self.current_epoch = epoch
+
+    def set_current_step(self, step):
+        """Record the current training step on the dataset (kept in ``self.current_step``)."""
+        self.current_step = step
+
+    def set_max_train_steps(self, max_train_steps):
+        """Record the total number of training steps on the dataset (kept in ``self.max_train_steps``)."""
+        self.max_train_steps = max_train_steps
+
+    def shuffle_buckets(self):
+        """Abstract hook invoked by ``set_current_epoch`` for each epoch advanced; subclasses must override it.
+
+        Raises:
+            NotImplementedError: always, in this base implementation.
+        """
+        raise NotImplementedError
+
+    def __len__(self):
+        return NotImplementedError
+
+    def __getitem__(self, idx):
+        """Abstract item accessor; subclasses must override it.
+
+        Raises:
+            NotImplementedError: always, in this base implementation.
+        """
+        raise NotImplementedError
+
+    def _default_retrieve_text_encoder_output_cache_batches(self, datasource: ContentDatasource, batch_size: int, num_workers: int):
+        datasource.set_caption_only(True)
+        executor = ThreadPoolExecutor(max_workers=num_workers)
+
+        data: list[ItemInfo] = []
+        futures = []
+
+        def aggregate_future(consume_all: bool = False):
+            while len(futures) >= num_workers or (consume_all and len(futures) > 0):
+                completed_futures = [future for future in futures if future.done()]
+                if len(completed_futures) == 0:
+                    if len(futures) >= num_workers or consume_all:  # to avoid adding too many futures
+                        time.sleep(0.1)
+                        continue
+                    else:
+                        break  # submit batch if possible
+
+                for future in completed_futures:
+                    item_key, caption = future.result()
+                    item_info = ItemInfo(item_key, caption, (0, 0), (0, 0))
+                    item_info.text_encoder_output_cache_path = self.get_text_encoder_output_cache_path(item_info)
+                    data.append(item_info)
+
+                    futures.remove(future)
+
+        def submit_batch(flush: bool = False):
+            nonlocal data
+            if len(data) >= batch_size or (len(data) > 0 and flush):
+                batch = data[0:batch_size]
+                if len(data) > batch_size:
+                    data = data[batch_size:]
+                else:
+                    data = []
+                return batch
+            return None
+
+        for fetch_op in datasource:
+            future = executor.submit(fetch_op)
+            futures.append(future)
+            aggregate_future()
+            while True:
+                batch = submit_batch()
+                if batch is None:
+                    break
+                yield batch
+
+        aggregate_future(consume_all=True)
+        while True:
+            batch = submit_batch(flush=True)
+            if batch is None:
+                break
+            yield batch
+
+        executor.shutdown()
+
+
+class ImageDataset(BaseDataset):
+    def __init__(
+        self,
+        resolution: Tuple[int, int],
+        caption_extension: Optional[str],
+        batch_size: int,
+        num_repeats: int,
+        enable_bucket: bool,
+        bucket_no_upscale: bool,
+        image_directory: Optional[str] = None,
+        image_jsonl_file: Optional[str] = None,
+        cache_directory: Optional[str] = None,
+        debug_dataset: bool = False,
+        architecture: str = "no_default",
+    ):
+        super(ImageDataset, self).__init__(
+            resolution,
+            caption_extension,
+            batch_size,
+            num_repeats,
+            enable_bucket,
+            bucket_no_upscale,
+            cache_directory,
+            debug_dataset,
+            architecture,
+        )
+        self.image_directory = image_directory
+        self.image_jsonl_file = image_jsonl_file
+        if image_directory is not None:
+            self.datasource = ImageDirectoryDatasource(image_directory, caption_extension)
+        elif image_jsonl_file is not None:
+            self.datasource = ImageJsonlDatasource(image_jsonl_file)
+        else:
+            raise ValueError("image_directory or image_jsonl_file must be specified")
+
+        if self.cache_directory is None:
+            self.cache_directory = self.image_directory
+
+        self.batch_manager = None
+        self.num_train_items = 0
+
+    def get_metadata(self):
+        metadata = super().get_metadata()
+        if self.image_directory is not None:
+            metadata["image_directory"] = os.path.basename(self.image_directory)
+        if self.image_jsonl_file is not None:
+            metadata["image_jsonl_file"] = os.path.basename(self.image_jsonl_file)
+        return metadata
+
+    def get_total_image_count(self):
+        return len(self.datasource) if self.datasource.is_indexable() else None
+
+    def retrieve_latent_cache_batches(self, num_workers: int):
+        buckset_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)
+        executor = ThreadPoolExecutor(max_workers=num_workers)
+
+        batches: dict[tuple[int, int], list[ItemInfo]] = {}  # (width, height) -> [ItemInfo]
+        futures = []
+
+        # aggregate futures and sort by bucket resolution
+        def aggregate_future(consume_all: bool = False):
+            while len(futures) >= num_workers or (consume_all and len(futures) > 0):
+                completed_futures = [future for future in futures if future.done()]
+                if len(completed_futures) == 0:
+                    if len(futures) >= num_workers or consume_all:  # to avoid adding too many futures
+                        time.sleep(0.1)
+                        continue
+                    else:
+                        break  # submit batch if possible
+
+                for future in completed_futures:
+                    original_size, item_key, image, caption = future.result()
+                    bucket_height, bucket_width = image.shape[:2]
+                    bucket_reso = (bucket_width, bucket_height)
+
+                    item_info = ItemInfo(item_key, caption, original_size, bucket_reso, content=image)
+                    item_info.latent_cache_path = self.get_latent_cache_path(item_info)
+
+                    if bucket_reso not in batches:
+                        batches[bucket_reso] = []
+                    batches[bucket_reso].append(item_info)
+
+                    futures.remove(future)
+
+        # submit batch if some bucket has enough items
+        def submit_batch(flush: bool = False):
+            for key in batches:
+                if len(batches[key]) >= self.batch_size or flush:
+                    batch = batches[key][0 : self.batch_size]
+                    if len(batches[key]) > self.batch_size:
+                        batches[key] = batches[key][self.batch_size :]
+                    else:
+                        del batches[key]
+                    return key, batch
+            return None, None
+
+        for fetch_op in self.datasource:
+
+            # fetch and resize image in a separate thread
+            def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, Image.Image, str]:
+                image_key, image, caption = op()
+                image: Image.Image
+                image_size = image.size
+
+                bucket_reso = buckset_selector.get_bucket_resolution(image_size)
+                image = resize_image_to_bucket(image, bucket_reso)
+                return image_size, image_key, image, caption
+
+            future = executor.submit(fetch_and_resize, fetch_op)
+            futures.append(future)
+            aggregate_future()
+            while True:
+                key, batch = submit_batch()
+                if key is None:
+                    break
+                yield key, batch
+
+        aggregate_future(consume_all=True)
+        while True:
+            key, batch = submit_batch(flush=True)
+            if key is None:
+                break
+            yield key, batch
+
+        executor.shutdown()
+
+    def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
+        return self._default_retrieve_text_encoder_output_cache_batches(self.datasource, self.batch_size, num_workers)
+
+    def prepare_for_training(self):
+        bucket_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)
+
+        # glob cache files
+        latent_cache_files = glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))
+
+        # assign cache files to item info
+        bucketed_item_info: dict[tuple[int, int], list[ItemInfo]] = {}  # (width, height) -> [ItemInfo]
+        for cache_file in latent_cache_files:
+            tokens = os.path.basename(cache_file).split("_")
+
+            image_size = tokens[-2]  # 0000x0000
+            image_width, image_height = map(int, image_size.split("x"))
+            image_size = (image_width, image_height)
+
+            item_key = "_".join(tokens[:-2])
+            text_encoder_output_cache_file = os.path.join(self.cache_directory, f"{item_key}_{self.architecture}_te.safetensors")
+            if not os.path.exists(text_encoder_output_cache_file):
+                logger.warning(f"Text encoder output cache file not found: {text_encoder_output_cache_file}")
+                continue
+
+            bucket_reso = bucket_selector.get_bucket_resolution(image_size)
+            item_info = ItemInfo(item_key, "", image_size, bucket_reso, latent_cache_path=cache_file)
+            item_info.text_encoder_output_cache_path = text_encoder_output_cache_file
+
+            bucket = bucketed_item_info.get(bucket_reso, [])
+            for _ in range(self.num_repeats):
+                bucket.append(item_info)
+            bucketed_item_info[bucket_reso] = bucket
+
+        # prepare batch manager
+        self.batch_manager = BucketBatchManager(bucketed_item_info, self.batch_size)
+        self.batch_manager.show_bucket_info()
+
+        self.num_train_items = sum([len(bucket) for bucket in bucketed_item_info.values()])
+
+    def shuffle_buckets(self):
+        # set random seed for this epoch
+        random.seed(self.seed + self.current_epoch)
+        self.batch_manager.shuffle()
+
+    def __len__(self):
+        if self.batch_manager is None:
+            return 100  # dummy value
+        return len(self.batch_manager)
+
+    def __getitem__(self, idx):
+        return self.batch_manager[idx]
+
+
+class VideoDataset(BaseDataset):
+    def __init__(
+        self,
+        resolution: Tuple[int, int],
+        caption_extension: Optional[str],
+        batch_size: int,
+        num_repeats: int,
+        enable_bucket: bool,
+        bucket_no_upscale: bool,
+        frame_extraction: Optional[str] = "head",
+        frame_stride: Optional[int] = 1,
+        frame_sample: Optional[int] = 1,
+        target_frames: Optional[list[int]] = None,
+        video_directory: Optional[str] = None,
+        video_jsonl_file: Optional[str] = None,
+        cache_directory: Optional[str] = None,
+        debug_dataset: bool = False,
+        architecture: str = "no_default",
+    ):
+        super(VideoDataset, self).__init__(
+            resolution,
+            caption_extension,
+            batch_size,
+            num_repeats,
+            enable_bucket,
+            bucket_no_upscale,
+            cache_directory,
+            debug_dataset,
+            architecture,
+        )
+        self.video_directory = video_directory
+        self.video_jsonl_file = video_jsonl_file
+        self.target_frames = target_frames
+        self.frame_extraction = frame_extraction
+        self.frame_stride = frame_stride
+        self.frame_sample = frame_sample
+
+        if video_directory is not None:
+            self.datasource = VideoDirectoryDatasource(video_directory, caption_extension)
+        elif video_jsonl_file is not None:
+            self.datasource = VideoJsonlDatasource(video_jsonl_file)
+
+        if self.frame_extraction == "uniform" and self.frame_sample == 1:
+            self.frame_extraction = "head"
+            logger.warning("frame_sample is set to 1 for frame_extraction=uniform. frame_extraction is changed to head.")
+        if self.frame_extraction == "head":
+            # head extraction. we can limit the number of frames to be extracted
+            self.datasource.set_start_and_end_frame(0, max(self.target_frames))
+
+        if self.cache_directory is None:
+            self.cache_directory = self.video_directory
+
+        self.batch_manager = None
+        self.num_train_items = 0
+
+    def get_metadata(self):
+        metadata = super().get_metadata()
+        if self.video_directory is not None:
+            metadata["video_directory"] = os.path.basename(self.video_directory)
+        if self.video_jsonl_file is not None:
+            metadata["video_jsonl_file"] = os.path.basename(self.video_jsonl_file)
+        metadata["frame_extraction"] = self.frame_extraction
+        metadata["frame_stride"] = self.frame_stride
+        metadata["frame_sample"] = self.frame_sample
+        metadata["target_frames"] = self.target_frames
+        return metadata
+
+    def retrieve_latent_cache_batches(self, num_workers: int):
+        buckset_selector = BucketSelector(self.resolution, architecture=self.architecture)
+        self.datasource.set_bucket_selector(buckset_selector)
+
+        executor = ThreadPoolExecutor(max_workers=num_workers)
+
+        # key: (width, height, frame_count), value: [ItemInfo]
+        batches: dict[tuple[int, int, int], list[ItemInfo]] = {}
+        futures = []
+
+        def aggregate_future(consume_all: bool = False):
+            while len(futures) >= num_workers or (consume_all and len(futures) > 0):
+                completed_futures = [future for future in futures if future.done()]
+                if len(completed_futures) == 0:
+                    if len(futures) >= num_workers or consume_all:  # to avoid adding too many futures
+                        time.sleep(0.1)
+                        continue
+                    else:
+                        break  # submit batch if possible
+
+                for future in completed_futures:
+                    original_frame_size, video_key, video, caption = future.result()
+
+                    frame_count = len(video)
+                    video = np.stack(video, axis=0)
+                    height, width = video.shape[1:3]
+                    bucket_reso = (width, height)  # already resized
+
+                    crop_pos_and_frames = []
+                    if self.frame_extraction == "head":
+                        for target_frame in self.target_frames:
+                            if frame_count >= target_frame:
+                                crop_pos_and_frames.append((0, target_frame))
+                    elif self.frame_extraction == "chunk":
+                        # split by target_frames
+                        for target_frame in self.target_frames:
+                            for i in range(0, frame_count, target_frame):
+                                if i + target_frame <= frame_count:
+                                    crop_pos_and_frames.append((i, target_frame))
+                    elif self.frame_extraction == "slide":
+                        # slide window
+                        for target_frame in self.target_frames:
+                            if frame_count >= target_frame:
+                                for i in range(0, frame_count - target_frame + 1, self.frame_stride):
+                                    crop_pos_and_frames.append((i, target_frame))
+                    elif self.frame_extraction == "uniform":
+                        # select N frames uniformly
+                        for target_frame in self.target_frames:
+                            if frame_count >= target_frame:
+                                frame_indices = np.linspace(0, frame_count - target_frame, self.frame_sample, dtype=int)
+                                for i in frame_indices:
+                                    crop_pos_and_frames.append((i, target_frame))
+                    else:
+                        raise ValueError(f"frame_extraction {self.frame_extraction} is not supported")
+
+                    for crop_pos, target_frame in crop_pos_and_frames:
+                        cropped_video = video[crop_pos : crop_pos + target_frame]
+                        body, ext = os.path.splitext(video_key)
+                        item_key = f"{body}_{crop_pos:05d}-{target_frame:03d}{ext}"
+                        batch_key = (*bucket_reso, target_frame)  # bucket_reso with frame_count
+
+                        item_info = ItemInfo(
+                            item_key, caption, original_frame_size, batch_key, frame_count=target_frame, content=cropped_video
+                        )
+                        item_info.latent_cache_path = self.get_latent_cache_path(item_info)
+
+                        batch = batches.get(batch_key, [])
+                        batch.append(item_info)
+                        batches[batch_key] = batch
+
+                    futures.remove(future)
+
+        def submit_batch(flush: bool = False):
+            for key in batches:
+                if len(batches[key]) >= self.batch_size or flush:
+                    batch = batches[key][0 : self.batch_size]
+                    if len(batches[key]) > self.batch_size:
+                        batches[key] = batches[key][self.batch_size :]
+                    else:
+                        del batches[key]
+                    return key, batch
+            return None, None
+
+        for operator in self.datasource:
+
+            def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, list[np.ndarray], str]:
+                video_key, video, caption = op()
+                video: list[np.ndarray]
+                frame_size = (video[0].shape[1], video[0].shape[0])
+
+                # resize if necessary
+                bucket_reso = buckset_selector.get_bucket_resolution(frame_size)
+                video = [resize_image_to_bucket(frame, bucket_reso) for frame in video]
+
+                return frame_size, video_key, video, caption
+
+            future = executor.submit(fetch_and_resize, operator)
+            futures.append(future)
+            aggregate_future()
+            while True:
+                key, batch = submit_batch()
+                if key is None:
+                    break
+                yield key, batch
+
+        aggregate_future(consume_all=True)
+        while True:
+            key, batch = submit_batch(flush=True)
+            if key is None:
+                break
+            yield key, batch
+
+        executor.shutdown()
+
+    def retrieve_text_encoder_output_cache_batches(self, num_workers: int):
+        return self._default_retrieve_text_encoder_output_cache_batches(self.datasource, self.batch_size, num_workers)
+
+    def prepare_for_training(self):
+        bucket_selector = BucketSelector(self.resolution, self.enable_bucket, self.bucket_no_upscale, self.architecture)
+
+        # glob cache files
+        latent_cache_files = glob.glob(os.path.join(self.cache_directory, f"*_{self.architecture}.safetensors"))
+
+        # assign cache files to item info
+        bucketed_item_info: dict[tuple[int, int, int], list[ItemInfo]] = {}  # (width, height, frame_count) -> [ItemInfo]
+        for cache_file in latent_cache_files:
+            tokens = os.path.basename(cache_file).split("_")
+
+            image_size = tokens[-2]  # 0000x0000
+            image_width, image_height = map(int, image_size.split("x"))
+            image_size = (image_width, image_height)
+
+            frame_pos, frame_count = tokens[-3].split("-")
+            frame_pos, frame_count = int(frame_pos), int(frame_count)
+
+            item_key = "_".join(tokens[:-3])
+            text_encoder_output_cache_file = os.path.join(self.cache_directory, f"{item_key}_{self.architecture}_te.safetensors")
+            if not os.path.exists(text_encoder_output_cache_file):
+                logger.warning(f"Text encoder output cache file not found: {text_encoder_output_cache_file}")
+                continue
+
+            bucket_reso = bucket_selector.get_bucket_resolution(image_size)
+            bucket_reso = (*bucket_reso, frame_count)
+            item_info = ItemInfo(item_key, "", image_size, bucket_reso, frame_count=frame_count, latent_cache_path=cache_file)
+            item_info.text_encoder_output_cache_path = text_encoder_output_cache_file
+
+            bucket = bucketed_item_info.get(bucket_reso, [])
+            for _ in range(self.num_repeats):
+                bucket.append(item_info)
+            bucketed_item_info[bucket_reso] = bucket
+
+        # prepare batch manager
+        self.batch_manager = BucketBatchManager(bucketed_item_info, self.batch_size)
+        self.batch_manager.show_bucket_info()
+
+        self.num_train_items = sum([len(bucket) for bucket in bucketed_item_info.values()])
+
+    def shuffle_buckets(self):
+        # set random seed for this epoch
+        random.seed(self.seed + self.current_epoch)
+        self.batch_manager.shuffle()
+
+    def __len__(self):
+        if self.batch_manager is None:
+            return 100  # dummy value
+        return len(self.batch_manager)
+
+    def __getitem__(self, idx):
+        return self.batch_manager[idx]
+
+
+class DatasetGroup(torch.utils.data.ConcatDataset):
+    def __init__(self, datasets: Sequence[Union[ImageDataset, VideoDataset]]):
+        super().__init__(datasets)
+        self.datasets: list[Union[ImageDataset, VideoDataset]] = datasets
+        self.num_train_items = 0
+        for dataset in self.datasets:
+            self.num_train_items += dataset.num_train_items
+
+    def set_current_epoch(self, epoch):
+        for dataset in self.datasets:
+            dataset.set_current_epoch(epoch)
+
+    def set_current_step(self, step):
+        for dataset in self.datasets:
+            dataset.set_current_step(step)
+
+    def set_max_train_steps(self, max_train_steps):
+        for dataset in self.datasets:
+            dataset.set_max_train_steps(max_train_steps)
diff --git a/dataset/testtoml.toml b/dataset/testtoml.toml
new file mode 100644
index 0000000000000000000000000000000000000000..5e8964d5b1ce43b484c1a372a9014fee97d5aabe
--- /dev/null
+++ b/dataset/testtoml.toml
@@ -0,0 +1,11 @@
+[general]
+resolution = [960, 544]
+caption_extension = ".txt"
+batch_size = 1
+enable_bucket = true
+bucket_no_upscale = false
+
+[[datasets]]
+image_directory = "C:/Users/accou/Pictures/Saved Pictures/ebs/Photos-001/ebPhotos-001"
+cache_directory = "/cache_directory"
+num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
diff --git a/docs/Preview.png b/docs/Preview.png
new file mode 100644
index 0000000000000000000000000000000000000000..3ee47629e166883c400fcaad028273578db0b312
--- /dev/null
+++ b/docs/Preview.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11c5f67769069bc7be7e6ca3ea36674eb32c4d63b9f70eb6b8519e8799cf1c3d
+size 67012
diff --git a/docs/advanced_config.md b/docs/advanced_config.md
new file mode 100644
index 0000000000000000000000000000000000000000..1585df79c95e37841b053662c1b451a801843d77
--- /dev/null
+++ b/docs/advanced_config.md
@@ -0,0 +1,220 @@
+> 📝 Click on the language section to expand / 言語をクリックして展開
+
+# Advanced configuration / 高度な設定
+
+## Table of contents / 目次
+
+- [How to specify `network_args`](#how-to-specify-network_args--network_argsの指定方法)
+- [LoRA+](#lora)
+- [Select the target modules of LoRA](#select-the-target-modules-of-lora--loraの対象モジュールを選択する)
+- [Save and view logs in TensorBoard format](#save-and-view-logs-in-tensorboard-format--tensorboard形式のログの保存と参照)
+- [Save and view logs in wandb](#save-and-view-logs-in-wandb--wandbでログの保存と参照)
+- [FP8 weight optimization for models](#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化)
+
+## How to specify `network_args` / `network_args`の指定方法
+
+The `--network_args` option is an option for specifying detailed arguments to LoRA. Specify the arguments in the form of `key=value` in `--network_args`.
+
+<details>
+<summary>日本語</summary>
+`--network_args`オプションは、LoRAへの詳細な引数を指定するためのオプションです。`--network_args`には、`key=value`の形式で引数を指定します。
+</details>
+
+### Example / 記述例
+
+If you specify it on the command line, write as follows. / コマンドラインで指定する場合は以下のように記述します。
+
+```bash
+accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 hv_train_network.py --dit ... 
+    --network_module networks.lora --network_dim 32 
+    --network_args "key1=value1" "key2=value2" ...
+```
+
+If you specify it in the configuration file, write as follows. / 設定ファイルで指定する場合は以下のように記述します。
+
+```toml
+network_args = ["key1=value1", "key2=value2", ...]
+```
+
+If you specify `"verbose=True"`, detailed information of LoRA will be displayed. / `"verbose=True"`を指定するとLoRAの詳細な情報が表示されます。
+
+```bash
+--network_args "verbose=True" "key1=value1" "key2=value2" ...
+```
+
+## LoRA+
+
+LoRA+ is a method that improves training speed by increasing the learning rate of the UP side (LoRA-B) of LoRA. Specify the multiplier for the learning rate. The original paper recommends 16, but adjust it as needed; starting from around 4 seems to work well. For details, please refer to the [related PR of sd-scripts](https://github.com/kohya-ss/sd-scripts/pull/1233).
+
+Specify `loraplus_lr_ratio` with `--network_args`.
+
+<details>
+<summary>日本語</summary>
+
+LoRA+は、LoRAのUP側（LoRA-B）の学習率を上げることで学習速度を向上させる手法です。学習率に対する倍率を指定します。元論文では16を推奨していますが、必要に応じて調整してください。4程度から始めるとよいようです。詳細は[sd-scriptsの関連PR](https://github.com/kohya-ss/sd-scripts/pull/1233)を参照してください。
+
+`--network_args`で`loraplus_lr_ratio`を指定します。
+</details>
+
+### Example / 記述例
+
+```bash
+accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 hv_train_network.py --dit ... 
+    --network_module networks.lora --network_dim 32 --network_args "loraplus_lr_ratio=4" ...
+```
+
+## Select the target modules of LoRA / LoRAの対象モジュールを選択する
+
+*This feature is highly experimental and the specification may change. / この機能は特に実験的なもので、仕様は変更される可能性があります。*
+
+By specifying `exclude_patterns` and `include_patterns` with `--network_args`, you can select the target modules of LoRA.
+
+`exclude_patterns` excludes modules that match the specified pattern. `include_patterns` targets only modules that match the specified pattern.
+
+Specify the values as a list. For example, `"exclude_patterns=[r'.*single_blocks.*', r'.*double_blocks\.[0-9]\..*']"`.
+
+The pattern is a regular expression for the module name. The module name is in the form of `double_blocks.0.img_mod.linear` or `single_blocks.39.modulation.linear`. The regular expression is not a partial match but a complete match.
+
+The patterns are applied in the order of `exclude_patterns`→`include_patterns`. By default, the Linear layers of `img_mod`, `txt_mod`, and `modulation` of double blocks and single blocks are excluded.
+
+(`.*(img_mod|txt_mod|modulation).*` is specified.)
+
+<details>
+<summary>日本語</summary>
+
+`--network_args`で`exclude_patterns`と`include_patterns`を指定することで、LoRAの対象モジュールを選択することができます。
+
+`exclude_patterns`は、指定したパターンに一致するモジュールを除外します。`include_patterns`は、指定したパターンに一致するモジュールのみを対象とします。
+
+値は、リストで指定します。`"exclude_patterns=[r'.*single_blocks.*', r'.*double_blocks\.[0-9]\..*']"`のようになります。
+
+パターンは、モジュール名に対する正規表現です。モジュール名は、たとえば`double_blocks.0.img_mod.linear`や`single_blocks.39.modulation.linear`のような形式です。正規表現は部分一致ではなく完全一致です。
+
+パターンは、`exclude_patterns`→`include_patterns`の順で適用されます。デフォルトは、double blocksとsingle blocksのLinear層のうち、`img_mod`、`txt_mod`、`modulation`が除外されています。
+
+（`.*(img_mod|txt_mod|modulation).*`が指定されています。）
+</details>
+
+### Example / 記述例
+
+Only the modules of double blocks / double blocksのモジュールのみを対象とする場合:
+
+```bash
+--network_args "exclude_patterns=[r'.*single_blocks.*']"
+```
+
+Only the Linear modules of single blocks from the 10th onward / single blocksの10番目以降のLinearモジュールのみを対象とする場合:
+
+```bash
+--network_args "exclude_patterns=[r'.*']" "include_patterns=[r'.*single_blocks\.\d{2}\.linear.*']"
+```
+
+## Save and view logs in TensorBoard format / TensorBoard形式のログの保存と参照
+
+Specify the folder to save the logs with the `--logging_dir` option. Logs in TensorBoard format will be saved.
+
+For example, if you specify `--logging_dir=logs`, a `logs` folder will be created in the working folder, and logs will be saved in the date folder inside it.
+
+Also, if you specify the `--log_prefix` option, the specified string will be added before the date. For example, use `--logging_dir=logs --log_prefix=lora_setting1_` for identification.
+
+To view logs in TensorBoard, open another command prompt and activate the virtual environment. Then enter the following in the working folder.
+
+```powershell
+tensorboard --logdir=logs
+```
+
+(tensorboard installation is required.)
+
+Then open a browser and access http://localhost:6006/ to display it.
+
+<details>
+<summary>日本語</summary>
+`--logging_dir`オプションにログ保存先フォルダを指定してください。TensorBoard形式のログが保存されます。
+
+たとえば`--logging_dir=logs`と指定すると、作業フォルダにlogsフォルダが作成され、その中の日時フォルダにログが保存されます。
+
+また`--log_prefix`オプションを指定すると、日時の前に指定した文字列が追加されます。`--logging_dir=logs --log_prefix=lora_setting1_`などとして識別用にお使いください。
+
+TensorBoardでログを確認するには、別のコマンドプロンプトを開き、仮想環境を有効にしてから、作業フォルダで以下のように入力します。
+
+```powershell
+tensorboard --logdir=logs
+```
+
+（tensorboardのインストールが必要です。）
+
+その後ブラウザを開き、http://localhost:6006/ へアクセスすると表示されます。
+</details>
+
+## Save and view logs in wandb / wandbでログの保存と参照
+
+`--log_with wandb` option is available to save logs in wandb format. `tensorboard` or `all` is also available. The default is `tensorboard`.
+
+Specify the project name with `--log_tracker_name` when using wandb.
+
+<details>
+<summary>日本語</summary>
+`--log_with wandb`オプションを指定するとwandb形式でログを保存することができます。`tensorboard`や`all`も指定可能です。デフォルトは`tensorboard`です。
+
+wandbを使用する場合は、`--log_tracker_name`でプロジェクト名を指定してください。
+</details>
+
+## FP8 weight optimization for models / モデルの重みのFP8への最適化
+
+The `--fp8_scaled` option is available to quantize the weights of the model to FP8 (E4M3) format with appropriate scaling. This reduces the VRAM usage while maintaining precision. Important weights are kept in FP16/BF16/FP32 format.
+
+The model weights must be in fp16 or bf16. Weights that have been pre-converted to float8_e4m3 cannot be used.
+
+Wan2.1 inference and training are supported.
+
+Specify the `--fp8_scaled` option in addition to the `--fp8` option during inference.
+
+Specify the `--fp8_scaled` option in addition to the `--fp8_base` option during training.
+
+Acknowledgments: This feature is based on the [implementation](https://github.com/Tencent/HunyuanVideo/blob/7df4a45c7e424a3f6cd7d653a7ff1f60cddc1eb1/hyvideo/modules/fp8_optimization.py) of [HunyuanVideo](https://github.com/Tencent/HunyuanVideo). The selection of high-precision modules is based on the [implementation](https://github.com/tdrussell/diffusion-pipe/blob/407c04fdae1c9ab5e67b54d33bef62c3e0a8dbc7/models/wan.py) of [diffusion-pipe](https://github.com/tdrussell/diffusion-pipe). I would like to thank these repositories.
+
+<details>
+<summary>日本語</summary>
+重みを単純にFP8へcastするのではなく、適切なスケーリングでFP8形式に量子化することで、精度を維持しつつVRAM使用量を削減します。また、重要な重みはFP16/BF16/FP32形式で保持します。
+
+モデルの重みは、fp16またはbf16が必要です。あらかじめfloat8_e4m3に変換された重みは使用できません。
+
+Wan2.1の推論、学習のみ対応しています。
+
+推論時は`--fp8`オプションに加えて `--fp8_scaled`オプションを指定してください。
+
+学習時は`--fp8_base`オプションに加えて `--fp8_scaled`オプションを指定してください。
+
+謝辞：この機能は、[HunyuanVideo](https://github.com/Tencent/HunyuanVideo)の[実装](https://github.com/Tencent/HunyuanVideo/blob/7df4a45c7e424a3f6cd7d653a7ff1f60cddc1eb1/hyvideo/modules/fp8_optimization.py)を参考にしました。また、高精度モジュールの選択においては[diffusion-pipe](https://github.com/tdrussell/diffusion-pipe)の[実装](https://github.com/tdrussell/diffusion-pipe/blob/407c04fdae1c9ab5e67b54d33bef62c3e0a8dbc7/models/wan.py)を参考にしました。これらのリポジトリに感謝します。
+
+</details>
+
+### Key features and implementation details / 主な特徴と実装の詳細
+
+- Implements FP8 (E4M3) weight quantization for Linear layers
+- Reduces VRAM requirements by using 8-bit weights for storage (VRAM usage is slightly higher than with the existing `--fp8` / `--fp8_base` options)
+- Quantizes weights to FP8 format with appropriate scaling instead of simple cast to FP8
+- Maintains computational precision by dequantizing to original precision (FP16/BF16/FP32) during forward pass
+- Preserves important weights in FP16/BF16/FP32 format
+
+The implementation:
+
+1. Quantizes weights to FP8 format with appropriate scaling
+2. Replaces weights by FP8 quantized weights and stores scale factors in model state dict
+3. Applies monkey patching to Linear layers for transparent dequantization during computation
+
+<details>
+<summary>日本語</summary>
+
+- Linear層のFP8（E4M3）重み量子化を実装
+- 8ビットの重みを使用することでVRAM使用量を削減（既存の`--fp8` `--fp8_base` オプションに比べて微増）
+- 単純なFP8へのcastではなく、適切な値でスケールして重みをFP8形式に量子化
+- forward時に元の精度（FP16/BF16/FP32）に逆量子化して計算精度を維持
+- 精度が重要な重みはFP16/BF16/FP32のまま保持
+
+実装:
+
+1. 精度を維持できる適切な倍率で重みをFP8形式に量子化
+2. 重みをFP8量子化重みに置き換え、倍率をモデルのstate dictに保存
+3. Linear層にmonkey patchingすることでモデルを変更せずに逆量子化
+</details>
\ No newline at end of file
diff --git a/docs/sampling_during_training.md b/docs/sampling_during_training.md
new file mode 100644
index 0000000000000000000000000000000000000000..04201e201ab9147ed4fa83dcf63b0c2c394192f7
--- /dev/null
+++ b/docs/sampling_during_training.md
@@ -0,0 +1,108 @@
+> 📝 Click on the language section to expand / 言語をクリックして展開
+
+# Sampling during training / 学習中のサンプル画像生成
+
+By preparing a prompt file, you can generate sample images during training.
+
+Please be aware that it consumes a considerable amount of VRAM, so be careful when generating sample images for videos with a large number of frames. Also, since it takes time to generate, adjust the frequency of sample image generation as needed.
+
+<details>
+<summary>日本語</summary>
+
+プロンプトファイルを用意することで、学習中にサンプル画像を生成することができます。
+
+VRAMをそれなりに消費しますので、特にフレーム数が多い動画を生成する場合は注意してください。また生成には時間がかかりますので、サンプル画像生成の頻度は適宜調整してください。
+</details>
+
+## How to use / 使い方
+
+### Command line options for training with sampling / サンプル画像生成に関連する学習時のコマンドラインオプション
+
+Example of command line options for training with sampling / 記述例:  
+
+```bash
+--vae path/to/ckpts/hunyuan-video-t2v-720p/vae/pytorch_model.pt 
+--vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128
+--text_encoder1 path/to/ckpts/text_encoder 
+--text_encoder2 path/to/ckpts/text_encoder_2 
+--sample_prompts /path/to/prompt_file.txt 
+--sample_every_n_epochs 1 --sample_every_n_steps 1000 --sample_at_first
+```
+
+`--vae`, `--vae_chunk_size`, `--vae_spatial_tile_sample_min_size`, `--text_encoder1`, `--text_encoder2` are the same as when generating images, so please refer to [here](/README.md#inference) for details. `--fp8_llm` can also be specified.
+
+`--sample_prompts` specifies the path to the prompt file used for sample image generation. Details are described below.
+
+`--sample_every_n_epochs` specifies how often to generate sample images in epochs, and `--sample_every_n_steps` specifies how often to generate sample images in steps.
+
+`--sample_at_first` is specified when generating sample images at the beginning of training.
+
+Sample images and videos are saved in the `sample` directory in the directory specified by `--output_dir`. They are saved as `.png` for still images and `.mp4` for videos.
+
+<details>
+<summary>日本語</summary>
+
+`--vae`、`--vae_chunk_size`、`--vae_spatial_tile_sample_min_size`、`--text_encoder1`、`--text_encoder2`は、画像生成時と同様ですので、詳細は[こちら](/README.ja.md#推論)を参照してください。`--fp8_llm`も指定可能です。
+
+`--sample_prompts`は、サンプル画像生成に使用するプロンプトファイルのパスを指定します。詳細は後述します。
+
+`--sample_every_n_epochs`は、何エポックごとにサンプル画像を生成するかを、`--sample_every_n_steps`は、何ステップごとにサンプル画像を生成するかを指定します。
+
+`--sample_at_first`は、学習開始時にサンプル画像を生成する場合に指定します。
+
+サンプル画像、動画は、`--output_dir`で指定したディレクトリ内の、`sample`ディレクトリに保存されます。静止画の場合は`.png`、動画の場合は`.mp4`で保存されます。
+</details>
+
+### Prompt file / プロンプトファイル
+
+The prompt file is a text file that contains the prompts for generating sample images. The example is as follows. / プロンプトファイルは、サンプル画像生成のためのプロンプトを記述したテキストファイルです。例は以下の通りです。
+
+```
+# prompt 1: for generating a cat video
+A cat walks on the grass, realistic style. --w 640 --h 480 --f 25 --d 1 --s 20
+
+# prompt 2: for generating a dog image
+A dog runs on the beach, realistic style. --w 960 --h 544 --f 1 --d 2 --s 20
+```
+
+A line starting with `#` is a comment.
+
+* `--w` specifies the width of the generated image or video. The default is 256.
+* `--h` specifies the height. The default is 256.
+* `--f` specifies the number of frames. The default is 1, which generates a still image.
+* `--d` specifies the seed. The default is random.
+* `--s` specifies the number of steps in generation. The default is 20.
+* `--g` specifies the guidance scale. The default is 6.0, which is the default value during inference of HunyuanVideo. Specify 1.0 for SkyReels V1 models. Ignore this option for Wan2.1 models.
+* `--fs` specifies the discrete flow shift. The default is 14.5, which corresponds to the number of steps 20. In the HunyuanVideo paper, 7.0 is recommended for 50 steps, and 17.0 is recommended for less than 20 steps (e.g. 10).
+
+If you train I2V models, you can use the additional options below. 
+
+* `--i path/to/image.png`: the image path for image2video inference.
+
+If you train the model with classifier free guidance, you can use the additional options below.
+
+* `--n negative prompt...`: the negative prompt for the classifier free guidance.
+* `--l 6.0`: the classifier free guidance scale. Should be set to 6.0 for SkyReels V1 models. 5.0 is the default value for Wan2.1 (if omitted).
+
+<details>
+<summary>日本語</summary>
+
+`#` で始まる行はコメントです。
+
+* `--w` 生成画像、動画の幅を指定します。省略時は256です。
+* `--h` 高さを指定します。省略時は256です。
+* `--f` フレーム数を指定します。省略時は1で、静止画を生成します。
+* `--d` シードを指定します。省略時はランダムです。
+* `--s` 生成におけるステップ数を指定します。省略時は20です。
+* `--g` guidance scaleを指定します。省略時は6.0で、HunyuanVideoの推論時のデフォルト値です。
+* `--fs` discrete flow shiftを指定します。省略時は14.5で、ステップ数20の場合に対応した値です。HunyuanVideoの論文では、ステップ数50の場合は7.0、ステップ数20未満（10など）で17.0が推奨されています。
+
+I2Vモデルを学習する場合、以下の追加オプションを使用できます。
+
+* `--i path/to/image.png`: image2video推論用の画像パス。
+
+classifier free guidance（ネガティブプロンプト）を必要とするモデルを学習する場合、以下の追加オプションを使用できます。
+
+* `--n negative prompt...`: classifier free guidance用のネガティブプロンプト。
+* `--l 6.0`: classifier free guidance scale。SkyReels V1モデルの場合は6.0に設定してください。Wan2.1の場合はデフォルト値が5.0です（省略時）。
+</details>
diff --git a/docs/wan.md b/docs/wan.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa23077e7e7019f4bf833895f7a2aacee05e02c8
--- /dev/null
+++ b/docs/wan.md
@@ -0,0 +1,285 @@
+> 📝 Click on the language section to expand / 言語をクリックして展開
+
+# Wan 2.1
+
+## Overview / 概要
+
+This is an unofficial training and inference script for [Wan2.1](https://github.com/Wan-Video/Wan2.1). The features are as follows.
+
+- fp8 support and memory reduction by block swap: inference of 720x1280x81-frame videos with 24GB VRAM, training on 720x1280 images with 24GB VRAM
+- Inference without installing Flash attention (using PyTorch's scaled dot product attention)
+- Supports xformers and Sage attention
+
+This feature is experimental.
+
+<details>
+<summary>日本語</summary>
+[Wan2.1](https://github.com/Wan-Video/Wan2.1) の非公式の学習および推論スクリプトです。
+
+以下の特徴があります。
+
+- fp8対応およびblock swapによる省メモリ化：720x1280x81framesの動画を24GB VRAMで推論可能、720x1280の画像での学習が24GB VRAMで可能
+- Flash attentionのインストールなしでの実行（PyTorchのscaled dot product attentionを使用）
+- xformersおよびSage attention対応
+
+この機能は実験的なものです。
+</details>
+
+## Download the model / モデルのダウンロード
+
+Download the T5 `models_t5_umt5-xxl-enc-bf16.pth` and CLIP `models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` from the following page: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P/tree/main
+
+Download the VAE `Wan2.1_VAE.pth` from the above page, or download `split_files/vae/wan_2.1_vae.safetensors` from the following page: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/vae
+
+Download the DiT weights from the following page: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
+
+Please select the appropriate weights according to T2V, I2V, resolution, model size, etc. 
+
+`fp16` and `bf16` models can be used, and `fp8_e4m3fn` models can be used if `--fp8` (or `--fp8_base`) is specified without specifying `--fp8_scaled`. **Please note that `fp8_scaled` models are not supported even with `--fp8_scaled`.**
+
+(Thanks to Comfy-Org for providing the repackaged weights.)
+
+### Model support matrix / モデルサポートマトリックス
+
+* columns: training dtype (列：学習時のデータ型)
+* rows: model dtype (行：モデルのデータ型)
+
+| model \ training |bf16|fp16|--fp8_base|--fp8_base & --fp8_scaled|
+|--|--|--|--|--|
+|bf16|✓|--|✓|✓|
+|fp16|--|✓|✓|✓|
+|fp8_e4m3fn|--|--|✓|--|
+|fp8_scaled|--|--|--|--|
+
+<details>
+<summary>日本語</summary>
+T5 `models_t5_umt5-xxl-enc-bf16.pth` およびCLIP `models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を、次のページからダウンロードしてください：https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P/tree/main
+
+VAEは上のページから `Wan2.1_VAE.pth` をダウンロードするか、次のページから `split_files/vae/wan_2.1_vae.safetensors` をダウンロードしてください：https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/vae
+
+DiTの重みを次のページからダウンロードしてください：https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
+
+T2VやI2V、解像度、モデルサイズなどにより適切な重みを選択してください。
+
+`fp16` および `bf16` モデルを使用できます。また、`--fp8` （または`--fp8_base`）を指定し`--fp8_scaled`を指定をしないときには `fp8_e4m3fn` モデルを使用できます。**`fp8_scaled` モデルはいずれの場合もサポートされていませんのでご注意ください。**
+
+（repackaged版の重みを提供してくださっているComfy-Orgに感謝いたします。）
+</details>
+
+## Pre-caching / 事前キャッシュ
+
+### Latent Pre-caching
+
+Latent pre-caching is almost the same as in HunyuanVideo. Create the cache using the following command:
+
+```bash
+python wan_cache_latents.py --dataset_config path/to/toml --vae path/to/wan_2.1_vae.safetensors
+```
+
+If you train I2V models, add `--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` to specify the CLIP model. If not specified, the training will raise an error.
+
+If you're running low on VRAM, specify `--vae_cache_cpu` to use the CPU for the VAE internal cache, which will reduce VRAM usage somewhat.
+
+<details>
+<summary>日本語</summary>
+latentの事前キャッシングはHunyuanVideoとほぼ同じです。上のコマンド例を使用してキャッシュを作成してください。
+
+I2Vモデルを学習する場合は、`--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を追加してCLIPモデルを指定してください。指定しないと学習時にエラーが発生します。
+
+VRAMが不足している場合は、`--vae_cache_cpu` を指定するとVAEの内部キャッシュにCPUを使うことで、使用VRAMを多少削減できます。
+</details>
+
+### Text Encoder Output Pre-caching
+
+Text encoder output pre-caching is also almost the same as in HunyuanVideo. Create the cache using the following command:
+
+```bash
+python wan_cache_text_encoder_outputs.py --dataset_config path/to/toml  --t5 path/to/models_t5_umt5-xxl-enc-bf16.pth --batch_size 16 
+```
+
+Adjust `--batch_size` according to your available VRAM.
+
+For systems with limited VRAM (less than ~16GB), use `--fp8_t5` to run the T5 in fp8 mode.
+
+<details>
+<summary>日本語</summary>
+テキストエンコーダ出力の事前キャッシングもHunyuanVideoとほぼ同じです。上のコマンド例を使用してキャッシュを作成してください。
+
+使用可能なVRAMに合わせて `--batch_size` を調整してください。
+
+VRAMが限られているシステム（約16GB未満）の場合は、T5をfp8モードで実行するために `--fp8_t5` を使用してください。
+</details>
+
+## Training / 学習
+
+### Training
+
+Start training using the following command (input as a single line):
+
+```bash
+accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 wan_train_network.py 
+    --task t2v-1.3B 
+    --dit path/to/wan2.1_xxx_bf16.safetensors 
+    --dataset_config path/to/toml --sdpa --mixed_precision bf16 --fp8_base 
+    --optimizer_type adamw8bit --learning_rate 2e-4 --gradient_checkpointing 
+    --max_data_loader_n_workers 2 --persistent_data_loader_workers 
+    --network_module networks.lora_wan --network_dim 32 
+    --timestep_sampling shift --discrete_flow_shift 3.0 
+    --max_train_epochs 16 --save_every_n_epochs 1 --seed 42
+    --output_dir path/to/output_dir --output_name name-of-lora
+```
+The above is an example. The appropriate values for `timestep_sampling` and `discrete_flow_shift` need to be determined by experimentation.
+
+For additional options, use `python wan_train_network.py --help` (note that many options are unverified).
+
+`--task` is one of `t2v-1.3B`, `t2v-14B`, `i2v-14B` and `t2i-14B`. Specify the DiT weights for the task with `--dit`.
+
+Don't forget to specify `--network_module networks.lora_wan`.
+
+Other options are mostly the same as `hv_train_network.py`.
+
+Use `convert_lora.py` for converting the LoRA weights after training, as in HunyuanVideo.
+
+<details>
+<summary>日本語</summary>
+`timestep_sampling`や`discrete_flow_shift`は一例です。どのような値が適切かは実験が必要です。
+
+その他のオプションについては `python wan_train_network.py --help` を使用してください（多くのオプションは未検証です）。
+
+`--task` には `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` のいずれかを指定します。`--dit`に、taskに応じたDiTの重みを指定してください。
+
+ `--network_module` に `networks.lora_wan` を指定することを忘れないでください。
+
+その他のオプションは、ほぼ`hv_train_network.py`と同様です。
+
+学習後のLoRAの重みの変換は、HunyuanVideoと同様に`convert_lora.py`を使用してください。
+</details>
+
+### Command line options for training with sampling / サンプル画像生成に関連する学習時のコマンドラインオプション
+
+Example of command line options for training with sampling / 記述例:  
+
+```bash
+--vae path/to/wan_2.1_vae.safetensors 
+--t5 path/to/models_t5_umt5-xxl-enc-bf16.pth 
+--sample_prompts /path/to/prompt_file.txt 
+--sample_every_n_epochs 1 --sample_every_n_steps 1000 --sample_at_first
+```
+Each option is the same as when generating images or as HunyuanVideo. Please refer to [here](/docs/sampling_during_training.md) for details.
+
+If you train I2V models, add `--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` to specify the CLIP model. 
+
+You can specify the initial image and negative prompts in the prompt file. Please refer to [here](/docs/sampling_during_training.md#prompt-file--プロンプトファイル).
+
+<details>
+<summary>日本語</summary>
+各オプションは推論時、およびHunyuanVideoの場合と同様です。[こちら](/docs/sampling_during_training.md)を参照してください。
+
+I2Vモデルを学習する場合は、`--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を追加してCLIPモデルを指定してください。
+
+プロンプトファイルで、初期画像やネガティブプロンプト等を指定できます。[こちら](/docs/sampling_during_training.md#prompt-file--プロンプトファイル)を参照してください。
+</details>
+
+
+## Inference / 推論
+
+### Inference Options Comparison / 推論オプション比較
+
+#### Speed Comparison (Faster → Slower) / 速度比較（速い→遅い）
+*Note: Results may vary depending on GPU type*
+
+fp8_fast > bf16/fp16 (no block swap) > fp8 > fp8_scaled > bf16/fp16 (block swap)
+
+#### Quality Comparison (Higher → Lower) / 品質比較（高→低）
+
+bf16/fp16 > fp8_scaled > fp8 >> fp8_fast
+
+### T2V Inference / T2V推論
+
+The following is an example of T2V inference (input as a single line):
+
+```bash
+python wan_generate_video.py --fp8 --task t2v-1.3B --video_size  832 480 --video_length 81 --infer_steps 20 
+--prompt "prompt for the video" --save_path path/to/save.mp4 --output_type both 
+--dit path/to/wan2.1_t2v_1.3B_bf16_etc.safetensors --vae path/to/wan_2.1_vae.safetensors 
+--t5 path/to/models_t5_umt5-xxl-enc-bf16.pth 
+--attn_mode torch
+```
+
+`--task` is one of `t2v-1.3B`, `t2v-14B`, `i2v-14B` and `t2i-14B`.
+
+`--attn_mode` is `torch`, `sdpa` (same as `torch`), `xformers`, `sageattn`, `flash2`, `flash` (same as `flash2`) or `flash3`. `torch` is the default. Other options require the corresponding library to be installed. `flash3` (Flash attention 3) is not tested.
+
+Specifying `--fp8` runs DiT in fp8 mode. fp8 can significantly reduce memory consumption but may impact output quality.
+
+`--fp8_scaled` can be specified in addition to `--fp8` to run the model with fp8 weight optimization. This slightly increases memory consumption and slightly reduces speed, but improves output quality. See [here](advanced_config.md#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化) for details.
+
+`--fp8_fast` option is also available for faster inference on RTX 40x0 GPUs. This option requires `--fp8_scaled` option. **This option seems to degrade the output quality.**
+
+`--fp8_t5` can be used to specify the T5 model in fp8 format. This option reduces memory usage for the T5 model.  
+
+`--negative_prompt` can be used to specify a negative prompt. If omitted, the default negative prompt is used.
+
+`--flow_shift` can be used to specify the flow shift (default 3.0 for I2V with 480p, 5.0 for others).
+
+`--guidance_scale` can be used to specify the guidance scale for classifier free guidance (default 5.0).
+
+`--blocks_to_swap` is the number of blocks to swap during inference. The default value is None (no block swap). The maximum value is 39 for 14B model and 29 for 1.3B model.
+
+`--vae_cache_cpu` enables VAE cache in main memory. This reduces VRAM usage slightly but processing is slower.
+
+`--compile` enables torch.compile. See [here](/README.md#inference) for details.
+
+Other options are same as `hv_generate_video.py` (some options are not supported, please check the help).
+
+<details>
+<summary>日本語</summary>
+`--task` には `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` のいずれかを指定します。
+
+`--attn_mode` には `torch`, `sdpa`（`torch`と同じ）、`xformers`, `sageattn`, `flash2`, `flash`（`flash2`と同じ）, `flash3` のいずれかを指定します。デフォルトは `torch` です。その他のオプションを使用する場合は、対応するライブラリをインストールする必要があります。`flash3`（Flash attention 3）は未テストです。
+
+`--fp8` を指定するとDiTモデルをfp8形式で実行します。fp8はメモリ消費を大幅に削減できますが、出力品質に影響を与える可能性があります。
+    
+`--fp8_scaled` を `--fp8` と併用すると、fp8への重み量子化を行います。メモリ消費と速度はわずかに悪化しますが、出力品質が向上します。詳しくは[こちら](advanced_config.md#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化)を参照してください。
+
+`--fp8_fast` オプションはRTX 40x0 GPUでの高速推論に使用されるオプションです。このオプションは `--fp8_scaled` オプションが必要です。**出力品質が劣化するようです。**
+
+`--fp8_t5` を指定するとT5モデルをfp8形式で実行します。T5モデル呼び出し時のメモリ使用量を削減します。
+
+`--negative_prompt` でネガティブプロンプトを指定できます。省略した場合はデフォルトのネガティブプロンプトが使用されます。
+
+`--flow_shift` でflow shiftを指定できます（480pのI2Vの場合はデフォルト3.0、それ以外は5.0）。
+
+`--guidance_scale` でclassifier free guidanceのガイダンススケールを指定できます（デフォルト5.0）。
+
+`--blocks_to_swap` は推論時のblock swapの数です。デフォルト値はNone（block swapなし）です。最大値は14Bモデルの場合39、1.3Bモデルの場合29です。
+
+`--vae_cache_cpu` を有効にすると、VAEのキャッシュをメインメモリに保持します。VRAM使用量が多少減りますが、処理は遅くなります。
+
+`--compile`でtorch.compileを有効にします。詳細については[こちら](/README.md#inference)を参照してください。
+
+その他のオプションは `hv_generate_video.py` と同じです（一部のオプションはサポートされていないため、ヘルプを確認してください）。
+</details>
+
+### I2V Inference / I2V推論
+
+The following is an example of I2V inference (input as a single line):
+
+```bash
+python wan_generate_video.py --fp8 --task i2v-14B --video_size 832 480 --video_length 81 --infer_steps 20 
+--prompt "prompt for the video" --save_path path/to/save.mp4 --output_type both 
+--dit path/to/wan2.1_i2v_480p_14B_bf16_etc.safetensors --vae path/to/wan_2.1_vae.safetensors 
+--t5 path/to/models_t5_umt5-xxl-enc-bf16.pth --clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth 
+--attn_mode torch --image_path path/to/image.jpg
+```
+
+Add `--clip` to specify the CLIP model. `--image_path` is the path to the image to be used as the initial frame.
+
+Other options are same as T2V inference.
+
+<details>
+<summary>日本語</summary>
+`--clip` を追加してCLIPモデルを指定します。`--image_path` は初期フレームとして使用する画像のパスです。
+
+その他のオプションはT2V推論と同じです。
+</details>
diff --git a/ebtest.json b/ebtest.json
new file mode 100644
index 0000000000000000000000000000000000000000..e12aa5ae4ff194ab1e4bf41dd2c6bb00f2c562c1
--- /dev/null
+++ b/ebtest.json
@@ -0,0 +1,44 @@
+{
+    "DATASET_CONFIG": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner-wan-gui/dataset/testtoml.toml",
+    "VAE_MODEL": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner/Wan2.1-I2V-14B-720P/Wan2.1_VAE.pth",
+    "CLIP_MODEL": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner/Wan2.1-I2V-14B-720P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth",
+    "T5_MODEL": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner/Wan2.1-I2V-14B-720P/models_t5_umt5-xxl-enc-bf16.pth",
+    "DIT_MODEL": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner/Wan2.1-I2V-14B-720P/wan2.1_i2v_720p_14B_fp8_e4m3fn.safetensors",
+    "LORA_OUTPUT_DIR": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner-wan-gui/Output_LoRAs",
+    "LORA_NAME": "My_Test_Lora_v2",
+    "MODEL_TYPE": "i2v-14B",
+    "FLOW_SHIFT": "3.0",
+    "LEARNING_RATE": "2e-5",
+    "LORA_LR_RATIO": "4",
+    "NETWORK_DIM": "32",
+    "NETWORK_ALPHA": "4",
+    "MAX_TRAIN_EPOCHS": "70",
+    "SAVE_EVERY_N_EPOCHS": "10",
+    "SEED": "1234",
+    "BLOCKS_SWAP": "16",
+    "RESUME_TRAINING": "",
+    "OPTIMIZER_TYPE": "adamw8bit",
+    "OPTIMIZER_ARGS": "",
+    "ATTENTION_MECHANISM": "none",
+    "LOGGING_DIR": "",
+    "LOG_WITH": "none",
+    "LOG_PREFIX": "",
+    "IMG_IN_TXT_IN_OFFLOADING": false,
+    "LR_SCHEDULER": "constant",
+    "LR_WARMUP_STEPS": "",
+    "LR_DECAY_STEPS": "",
+    "TIMESTEP_SAMPLING": "shift",
+    "DISCRETE_FLOW_SHIFT": "3.0",
+    "WEIGHTING_SCHEME": "none",
+    "METADATA_TITLE": "",
+    "METADATA_AUTHOR": "",
+    "METADATA_DESCRIPTION": "",
+    "METADATA_LICENSE": "",
+    "METADATA_TAGS": "",
+    "INPUT_LORA": "",
+    "OUTPUT_DIR": "",
+    "CONVERTED_LORA_NAME": "",
+    "FP8": false,
+    "SCALED": false,
+    "ENABLE_CACHE": true
+}
\ No newline at end of file
diff --git a/hunyuan_model/__init__.py b/hunyuan_model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/hunyuan_model/activation_layers.py b/hunyuan_model/activation_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8774c26ceef6081482ca0dbbf930b207d4ac03b
--- /dev/null
+++ b/hunyuan_model/activation_layers.py
@@ -0,0 +1,23 @@
+import torch.nn as nn
+
+
+def get_activation_layer(act_type):
+    """get activation layer
+
+    Args:
+        act_type (str): the activation type
+
+    Returns:
+        torch.nn.functional: the activation layer
+    """
+    if act_type == "gelu":
+        return lambda: nn.GELU()
+    elif act_type == "gelu_tanh":
+        # Approximate `tanh` requires torch >= 1.13
+        return lambda: nn.GELU(approximate="tanh")
+    elif act_type == "relu":
+        return nn.ReLU
+    elif act_type == "silu":
+        return nn.SiLU
+    else:
+        raise ValueError(f"Unknown activation type: {act_type}")
diff --git a/hunyuan_model/attention.py b/hunyuan_model/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..e94253df0aceb11e4f5812b728df75b9d38bf8c2
--- /dev/null
+++ b/hunyuan_model/attention.py
@@ -0,0 +1,295 @@
+import importlib.metadata
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+try:
+    import flash_attn
+    from flash_attn.flash_attn_interface import _flash_attn_forward
+    from flash_attn.flash_attn_interface import flash_attn_varlen_func
+    from flash_attn.flash_attn_interface import flash_attn_func
+except ImportError:
+    flash_attn = None
+    flash_attn_varlen_func = None
+    _flash_attn_forward = None
+    flash_attn_func = None
+
+try:
+    print(f"Trying to import sageattention")
+    from sageattention import sageattn_varlen, sageattn
+
+    print("Successfully imported sageattention")
+except ImportError:
+    print(f"Failed to import sageattention")
+    sageattn_varlen = None
+    sageattn = None
+
+try:
+    import xformers.ops as xops
+except ImportError:
+    xops = None
+
+MEMORY_LAYOUT = {  # mode -> (layout applied to q/k/v before attention, layout applied to the output)
+    "flash": (  # merge dims 0 and 1 of the input, leave the output as is
+        lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
+        lambda x: x,
+    ),
+    "flash_fixlen": (  # no layout change
+        lambda x: x,
+        lambda x: x,
+    ),
+    "sageattn": (  # merge dims 0 and 1 of the input, leave the output as is
+        lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
+        lambda x: x,
+    ),
+    "sageattn_fixlen": (  # swap dims 1 and 2 before, swap them back after
+        lambda x: x.transpose(1, 2),
+        lambda x: x.transpose(1, 2),
+    ),
+    "torch": (  # swap dims 1 and 2 before, swap them back after
+        lambda x: x.transpose(1, 2),
+        lambda x: x.transpose(1, 2),
+    ),
+    "xformers": (  # no layout change
+        lambda x: x,
+        lambda x: x,
+    ),
+    "vanilla": (  # swap dims 1 and 2 before, swap them back after
+        lambda x: x.transpose(1, 2),
+        lambda x: x.transpose(1, 2),
+    ),
+}
+
+
+def get_cu_seqlens(text_mask, img_len):
+    """Calculate cu_seqlens_q, cu_seqlens_kv using text_mask and img_len
+
+    Args:
+        text_mask (torch.Tensor): the mask of text
+        img_len (int): the length of image
+
+    Returns:
+        torch.Tensor: the calculated cu_seqlens for flash attention
+    """
+    batch_size = text_mask.shape[0]
+    text_len = text_mask.sum(dim=1)
+    max_len = text_mask.shape[1] + img_len
+
+    cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda")
+
+    for i in range(batch_size):
+        s = text_len[i] + img_len
+        s1 = i * max_len + s
+        s2 = (i + 1) * max_len
+        cu_seqlens[2 * i + 1] = s1
+        cu_seqlens[2 * i + 2] = s2
+
+    return cu_seqlens
+
+
+def attention(
+    q_or_qkv_list,
+    k=None,
+    v=None,
+    mode="flash",
+    drop_rate=0,
+    attn_mask=None,
+    total_len=None,
+    causal=False,
+    cu_seqlens_q=None,
+    cu_seqlens_kv=None,
+    max_seqlen_q=None,
+    max_seqlen_kv=None,
+    batch_size=1,
+):
+    """
+    Perform QKV attention with the backend selected by `mode`.
+
+    Args:
+        q_or_qkv_list (torch.Tensor or list): Query tensor with shape [b, s, a, d], where a is the number of heads,
+            or a list [q, k, v]. A list is cleared after unpacking, dropping its references to the tensors.
+        k (torch.Tensor): Key tensor with shape [b, s1, a, d]. Ignored when a list is passed.
+        v (torch.Tensor): Value tensor with shape [b, s1, a, d]. Ignored when a list is passed.
+        mode (str): 'flash', 'sageattn', 'torch', 'xformers' or 'vanilla'. When `total_len` is given, 'flash' and
+            'sageattn' switch to their '*_fixlen' variants; 'xformers' requires it and 'vanilla' rejects it.
+        drop_rate (float): Dropout rate; only the torch, xformers, flash_fixlen and vanilla paths use it. (default: 0)
+        attn_mask (torch.Tensor): Boolean or additive mask; only 'torch' (without `total_len`) and 'vanilla' use it.
+        total_len: Per-sample valid lengths (presumably a 1-D integer tensor). When given, each sample is trimmed
+            to its length, attended on its own and zero-padded back to the input length. (default: None)
+        causal (bool): Causal attention; only the torch, flash_fixlen and vanilla paths use it. (default: False)
+        cu_seqlens_q, cu_seqlens_kv (torch.Tensor): dtype torch.int32. Cumulative sequence lengths for varlen kernels.
+        max_seqlen_q, max_seqlen_kv (int): Maximum sequence lengths of q and of k/v, for the varlen kernels.
+        batch_size (int): Batch size used to reshape the varlen kernel output to [b, s, a, d]. (default: 1)
+
+    Returns:
+        torch.Tensor: Output tensor after attention with shape [b, s, ad]
+    """
+    q, k, v = q_or_qkv_list if type(q_or_qkv_list) == list else (q_or_qkv_list, k, v)
+    if type(q_or_qkv_list) == list:
+        q_or_qkv_list.clear()  # drop the list's references to q, k, v
+    split_attn = total_len is not None  # per-sample attention is used whenever total_len is given
+    if split_attn and mode == "sageattn":
+        mode = "sageattn_fixlen"
+    elif split_attn and mode == "flash":
+        mode = "flash_fixlen"
+    # print(f"Attention mode: {mode}, split_attn: {split_attn}")
+    pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
+
+    # trim the sequence length to the actual length instead of attn_mask
+    if split_attn:
+        trimmed_len = q.shape[1] - total_len  # per-sample amount of padding removed, restored after attention
+        q = [q[i : i + 1, : total_len[i]] for i in range(len(q))]
+        k = [k[i : i + 1, : total_len[i]] for i in range(len(k))]
+        v = [v[i : i + 1, : total_len[i]] for i in range(len(v))]
+        q = [pre_attn_layout(q_i) for q_i in q]
+        k = [pre_attn_layout(k_i) for k_i in k]
+        v = [pre_attn_layout(v_i) for v_i in v]
+        # print(
+        #     f"Trimming the sequence length to {total_len},trimmed_len: {trimmed_len}, q.shape: {[q_i.shape for q_i in q]}, mode: {mode}"
+        # )
+    else:
+        q = pre_attn_layout(q)
+        k = pre_attn_layout(k)
+        v = pre_attn_layout(v)
+
+    if mode == "torch":
+        if split_attn:
+            x = []
+            for i in range(len(q)):
+                x_i = F.scaled_dot_product_attention(q[i], k[i], v[i], dropout_p=drop_rate, is_causal=causal)
+                q[i], k[i], v[i] = None, None, None  # release this sample's inputs as soon as they are consumed
+                x.append(x_i)
+            del q, k, v
+        else:
+            if attn_mask is not None and attn_mask.dtype != torch.bool:
+                attn_mask = attn_mask.to(q.dtype)
+            x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal)
+            del q, k, v
+            del attn_mask
+
+    elif mode == "xformers":
+        # B, M, H, K: M is the sequence length, H is the number of heads, K is the dimension of the heads -> it is same as input dimension
+        # currently only support batch_size = 1
+        assert split_attn, "Xformers only supports splitting"
+        x = []
+        for i in range(len(q)):
+            x_i = xops.memory_efficient_attention(q[i], k[i], v[i], p=drop_rate)  # , causal=causal)
+            q[i], k[i], v[i] = None, None, None
+            x.append(x_i)
+        del q, k, v
+
+    elif mode == "flash":
+        x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
+        del q, k, v
+        # x with shape [(bxs), a, d]
+        x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1])  # reshape x to [b, s, a, d]
+    elif mode == "flash_fixlen":
+        x = []
+        for i in range(len(q)):
+            # q: (batch_size, seqlen, nheads, headdim), k: (batch_size, seqlen, nheads_k, headdim), v: (batch_size, seqlen, nheads_k, headdim)
+            x_i = flash_attn_func(q[i], k[i], v[i], dropout_p=drop_rate, causal=causal)
+            q[i], k[i], v[i] = None, None, None
+            x.append(x_i)
+        del q, k, v
+    elif mode == "sageattn":
+        x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
+        del q, k, v
+        # x with shape [(bxs), a, d]
+        x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1])  # reshape x to [b, s, a, d]
+    elif mode == "sageattn_fixlen":
+        x = []
+        for i in range(len(q)):
+            # HND seems to cause an error
+            x_i = sageattn(q[i], k[i], v[i])  # (batch_size, seq_len, head_num, head_dim)
+            q[i], k[i], v[i] = None, None, None
+            x.append(x_i)
+        del q, k, v
+    elif mode == "vanilla":
+        assert not split_attn, "Vanilla attention does not support trimming"
+        scale_factor = 1 / math.sqrt(q.size(-1))
+
+        b, a, s, _ = q.shape
+        s1 = k.size(2)
+        attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
+        if causal:
+            # Only applied to self attention
+            assert attn_mask is None, "Causal mask and attn_mask cannot be used together"
+            temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(diagonal=0)
+            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+            attn_bias.to(q.dtype)  # NOTE(review): the result is discarded, so this line has no effect
+
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.bool:
+                attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+            else:
+                attn_bias += attn_mask
+
+        # TODO: Maybe force q and k to be float32 to avoid numerical overflow
+        attn = (q @ k.transpose(-2, -1)) * scale_factor
+        attn += attn_bias
+        attn = attn.softmax(dim=-1)
+        attn = torch.dropout(attn, p=drop_rate, train=True)  # train=True: dropout is applied whenever drop_rate > 0
+        x = attn @ v
+    else:
+        raise NotImplementedError(f"Unsupported attention mode: {mode}")
+
+    if split_attn:
+        x = [post_attn_layout(x_i) for x_i in x]
+        for i in range(len(x)):
+            x[i] = F.pad(x[i], (0, 0, 0, 0, 0, trimmed_len[i]))  # zero-pad the end of dim -3 by the trimmed amount
+        x = torch.cat(x, dim=0)
+    else:
+        x = post_attn_layout(x)
+
+    b, s, a, d = x.shape
+    out = x.reshape(b, s, -1)
+    return out
+
+
+def parallel_attention(hybrid_seq_parallel_attn, q, k, v, img_q_len, img_kv_len, cu_seqlens_q, cu_seqlens_kv):
+    attn1 = hybrid_seq_parallel_attn(
+        None,
+        q[:, :img_q_len, :, :],
+        k[:, :img_kv_len, :, :],
+        v[:, :img_kv_len, :, :],
+        dropout_p=0.0,
+        causal=False,
+        joint_tensor_query=q[:, img_q_len : cu_seqlens_q[1]],
+        joint_tensor_key=k[:, img_kv_len : cu_seqlens_kv[1]],
+        joint_tensor_value=v[:, img_kv_len : cu_seqlens_kv[1]],
+        joint_strategy="rear",
+    )
+    if flash_attn.__version__ >= "2.7.0":
+        attn2, *_ = _flash_attn_forward(
+            q[:, cu_seqlens_q[1] :],
+            k[:, cu_seqlens_kv[1] :],
+            v[:, cu_seqlens_kv[1] :],
+            dropout_p=0.0,
+            softmax_scale=q.shape[-1] ** (-0.5),
+            causal=False,
+            window_size_left=-1,
+            window_size_right=-1,
+            softcap=0.0,
+            alibi_slopes=None,
+            return_softmax=False,
+        )
+    else:
+        attn2, *_ = _flash_attn_forward(
+            q[:, cu_seqlens_q[1] :],
+            k[:, cu_seqlens_kv[1] :],
+            v[:, cu_seqlens_kv[1] :],
+            dropout_p=0.0,
+            softmax_scale=q.shape[-1] ** (-0.5),
+            causal=False,
+            window_size=(-1, -1),
+            softcap=0.0,
+            alibi_slopes=None,
+            return_softmax=False,
+        )
+    attn = torch.cat([attn1, attn2], dim=1)
+    b, s, a, d = attn.shape
+    attn = attn.reshape(b, s, -1)
+
+    return attn
diff --git a/hunyuan_model/autoencoder_kl_causal_3d.py b/hunyuan_model/autoencoder_kl_causal_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7e70737325a50e1ee1fbbee96b4a0aafbdcd241
--- /dev/null
+++ b/hunyuan_model/autoencoder_kl_causal_3d.py
@@ -0,0 +1,609 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modified from diffusers==0.29.2
+#
+# ==============================================================================
+from typing import Dict, Optional, Tuple, Union
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+
+# try:
+#     # This diffusers is modified and packed in the mirror.
+#     from diffusers.loaders import FromOriginalVAEMixin
+# except ImportError:
+#     # Use this to be compatible with the original diffusers.
+#     from diffusers.loaders.single_file_model import FromOriginalModelMixin as FromOriginalVAEMixin
+from diffusers.utils.accelerate_utils import apply_forward_hook
+from diffusers.models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from diffusers.models.modeling_outputs import AutoencoderKLOutput
+from diffusers.models.modeling_utils import ModelMixin
+from .vae import DecoderCausal3D, BaseOutput, DecoderOutput, DiagonalGaussianDistribution, EncoderCausal3D
+
+
+@dataclass
+class DecoderOutput2(BaseOutput):  # returned by `AutoencoderKLCausal3D.forward` when `return_dict` is True
+    sample: torch.FloatTensor  # decoded output
+    posterior: Optional[DiagonalGaussianDistribution] = None  # encoder posterior; stays None unless the caller asks for it
+
+
+class AutoencoderKLCausal3D(ModelMixin, ConfigMixin):
+    r"""
+    A VAE model with KL loss for encoding images/videos into latents and decoding latent representations into images/videos.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+    for all models (such as downloading or saving).
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlockCausal3D",),
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlockCausal3D",),
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 1,
+        act_fn: str = "silu",
+        latent_channels: int = 4,
+        norm_num_groups: int = 32,
+        sample_size: int = 32,
+        sample_tsize: int = 64,
+        scaling_factor: float = 0.18215,  # not read in this constructor
+        force_upcast: bool = True,  # not read in this constructor
+        spatial_compression_ratio: int = 8,
+        time_compression_ratio: int = 4,
+        mid_block_add_attention: bool = True,
+    ):
+        super().__init__()
+        # Build the causal 3D encoder/decoder pair, the two 1x1x1 convolutions around the latent space, and the tiling defaults.
+        self.time_compression_ratio = time_compression_ratio
+
+        self.encoder = EncoderCausal3D(
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            down_block_types=down_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            double_z=True,  # presumably doubles the encoder output channels (quant_conv below takes 2 * latent_channels) — TODO confirm
+            time_compression_ratio=time_compression_ratio,
+            spatial_compression_ratio=spatial_compression_ratio,
+            mid_block_add_attention=mid_block_add_attention,
+        )
+
+        self.decoder = DecoderCausal3D(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            norm_num_groups=norm_num_groups,
+            act_fn=act_fn,
+            time_compression_ratio=time_compression_ratio,
+            spatial_compression_ratio=spatial_compression_ratio,
+            mid_block_add_attention=mid_block_add_attention,
+        )
+
+        self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)  # applied to the encoder output
+        self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)  # applied to latents before the decoder
+
+        self.use_slicing = False
+        self.use_spatial_tiling = False
+        self.use_temporal_tiling = False
+
+        # only relevant if vae tiling is enabled
+        self.tile_sample_min_tsize = sample_tsize  # temporal tile length on the input side
+        self.tile_latent_min_tsize = sample_tsize // time_compression_ratio  # temporal tile length on the latent side
+
+        self.tile_sample_min_size = self.config.sample_size  # NOTE(review): stays a list/tuple if the config holds one, yet `encode` compares it with `>` against ints — confirm
+        sample_size = self.config.sample_size[0] if isinstance(self.config.sample_size, (list, tuple)) else self.config.sample_size
+        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))  # spatial tile size on the latent side
+        self.tile_overlap_factor = 0.25  # fraction of a tile that overlaps its neighbour in the tiled encode/decode paths
+
+    def _set_gradient_checkpointing(self, module, value=False):  # sets the flag on the encoder/decoder sub-networks only
+        if isinstance(module, (EncoderCausal3D, DecoderCausal3D)):  # any other module type is left untouched
+            module.gradient_checkpointing = value
+
+    def enable_temporal_tiling(self, use_tiling: bool = True):  # pass False to switch temporal tiling off again
+        self.use_temporal_tiling = use_tiling  # read by `encode` / `_decode` before they split along dimension 2
+
+    def disable_temporal_tiling(self):  # convenience inverse of `enable_temporal_tiling`
+        self.enable_temporal_tiling(False)
+
+    def enable_spatial_tiling(self, use_tiling: bool = True):  # pass False to switch spatial tiling off again
+        self.use_spatial_tiling = use_tiling  # read by `encode` / `_decode` and by the temporal tiling methods
+
+    def disable_spatial_tiling(self):  # convenience inverse of `enable_spatial_tiling`
+        self.enable_spatial_tiling(False)
+
+    def enable_tiling(self, use_tiling: bool = True):
+        r"""
+        Switch spatial and temporal tiling on together (or off, when `use_tiling` is False). With tiling
+        active, `encode` and `_decode` hand inputs larger than the tile thresholds to the `*_tiled_*`
+        methods, which process the tensor tile by tile instead of in a single pass.
+        """
+        self.enable_spatial_tiling(use_tiling)
+        self.enable_temporal_tiling(use_tiling)
+
+    def disable_tiling(self):
+        r"""
+        Turn off both spatial and temporal tiling, so that `encode` and `_decode` stop dispatching to
+        the tiled code paths and process their input in one step.
+        """
+        self.disable_spatial_tiling()
+        self.disable_temporal_tiling()
+
+    def enable_slicing(self):
+        r"""
+        Enable slicing. `encode` and `decode` then process a batch holding more than one sample one
+        sample at a time and concatenate the results, which is useful to save some memory.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self):
+        r"""
+        Disable slicing. If `enable_slicing` was previously called, `encode` and `decode` go back to
+        processing the whole batch in one step.
+        """
+        self.use_slicing = False
+
+    def set_chunk_size_for_causal_conv_3d(self, chunk_size: int):
+        """Assign `chunk_size` to every module of this model that already has a `chunk_size` attribute.
+
+        `self.modules()` yields this model and all of its submodules, so no explicit recursion is needed.
+        """
+        for submodule in (m for m in self.modules() if hasattr(m, "chunk_size")):
+            submodule.chunk_size = chunk_size
+
+    @property
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model,
+            keyed by the dotted path of the owning module followed by `.processor`.
+        """
+        # collected recursively over the module tree
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):  # every module exposing `get_processor` contributes one entry
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)  # NOTE(review): confirm the installed diffusers still accepts `return_deprecated_lora`
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+            _remove_lora: Forwarded unchanged to every module's `set_processor`.
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor, _remove_lora=_remove_lora)  # NOTE(review): confirm the installed diffusers still accepts `_remove_lora`
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)  # `pop` removes the entry from the caller's dict
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Replace all attention processors with the default one of their family; raises `ValueError` when they are neither all added-KV nor all cross-attention processors.
+        """
+        current = list(self.attn_processors.values())
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in current):
+            default_processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in current):
+            default_processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+        self.set_attn_processor(default_processor, _remove_lora=True)
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images/videos into a posterior distribution over latents.
+
+        Temporal tiling is tried first, then spatial tiling; otherwise the encoder runs directly on `x`
+        (one sample at a time when slicing is enabled and the batch holds more than one sample).
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images/videos; must have 5 dimensions.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+                A [`~models.autoencoder_kl.AutoencoderKLOutput`] whose `latent_dist` is the posterior if
+                `return_dict` is True, otherwise the one-element tuple `(posterior,)`.
+        """
+        assert len(x.shape) == 5, "The input tensor should have 5 dimensions."
+
+        if self.use_temporal_tiling and x.shape[2] > self.tile_sample_min_tsize:
+            return self.temporal_tiled_encode(x, return_dict=return_dict)
+        if self.use_spatial_tiling and max(x.shape[-1], x.shape[-2]) > self.tile_sample_min_size:
+            return self.spatial_tiled_encode(x, return_dict=return_dict)
+
+        if self.use_slicing and x.shape[0] > 1:
+            per_sample = [self.encoder(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(per_sample)
+        else:
+            h = self.encoder(x)
+
+        # the quant_conv output is what parameterises the posterior
+        posterior = DiagonalGaussianDistribution(self.quant_conv(h))
+        if return_dict:
+            return AutoencoderKLOutput(latent_dist=posterior)
+        return (posterior,)
+
+    def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+        """Decode 5-D latents, delegating to temporal tiling first, then spatial tiling, when enabled and needed."""
+        assert len(z.shape) == 5, "The input tensor should have 5 dimensions."
+
+        # Temporal tiling takes precedence; `temporal_tiled_decode` applies spatial tiling to each of its
+        # tiles itself when that is enabled too.
+        if self.use_temporal_tiling and z.shape[2] > self.tile_latent_min_tsize:
+            return self.temporal_tiled_decode(z, return_dict=return_dict)
+        if self.use_spatial_tiling and max(z.shape[-1], z.shape[-2]) > self.tile_latent_min_size:
+            return self.spatial_tiled_decode(z, return_dict=return_dict)
+
+        # untiled path: post_quant_conv, then the decoder, in a single pass
+        dec = self.decoder(self.post_quant_conv(z))
+        if return_dict:
+            return DecoderOutput(sample=dec)
+        return (dec,)
+
+    @apply_forward_hook
+    def decode(self, z: torch.FloatTensor, return_dict: bool = True, generator=None) -> Union[DecoderOutput, torch.FloatTensor]:
+        """
+        Decode a batch of latents into images/videos.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+            generator (*optional*):
+                Currently ignored by this method.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                A [`~models.vae.DecoderOutput`] if return_dict is True, otherwise the tuple `(decoded,)`.
+        """
+        if self.use_slicing and z.shape[0] > 1:
+            # one sample per `_decode` call; the results are re-joined along the batch dimension
+            per_sample = [self._decode(z_slice).sample for z_slice in z.split(1)]
+            decoded = torch.cat(per_sample)
+        else:
+            decoded = self._decode(z).sample
+
+        if return_dict:
+            return DecoderOutput(sample=decoded)
+        return (decoded,)
+
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:  # cross-fade along dim -2
+        blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)  # never blend more rows than either tensor has
+        for y in range(blend_extent):  # row y of `b` mixes with the matching row among the last `blend_extent` rows of `a`; the weight on `b` is y / blend_extent
+            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent)
+        return b  # `b` was modified in place
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:  # cross-fade along dim -1
+        blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)  # never blend more columns than either tensor has
+        for x in range(blend_extent):  # column x of `b` mixes with the matching column among the last `blend_extent` columns of `a`; the weight on `b` is x / blend_extent
+            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent)
+        return b  # `b` was modified in place
+
+    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:  # cross-fade along dim -3
+        blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)  # never blend more frames than either tensor has
+        for x in range(blend_extent):  # frame x of `b` mixes with the matching frame among the last `blend_extent` frames of `a`; the weight on `b` is x / blend_extent
+            b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (x / blend_extent)
+        return b  # `b` was modified in place
+
+    def spatial_tiled_encode(
+        self, x: torch.FloatTensor, return_dict: bool = True, return_moments: bool = False
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution], torch.Tensor]:
+        r"""Encode a batch of images/videos using a tiled encoder.
+
+        The input is split into overlapping spatial tiles; every tile is encoded on its own (always by the same
+        `self.encoder`) and the overlapping borders are blended before the tiles are stitched back together.
+        The result can therefore differ from a non-tiled encode of the same input.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images/videos.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+            return_moments (`bool`, *optional*, defaults to `False`):
+                If True, return the stitched `quant_conv` output tensor directly (takes precedence over `return_dict`).
+
+        Returns:
+            [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
+                If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
+                `tuple` is returned.
+        """
+        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))  # stride between tile origins on the input
+        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)  # width of the blended band on the encoded tiles
+        row_limit = self.tile_latent_min_size - blend_extent  # rows/columns of each encoded tile kept in the output
+
+        # Split video into tiles and encode them separately.
+        rows = []
+        for i in range(0, x.shape[-2], overlap_size):
+            row = []
+            for j in range(0, x.shape[-1], overlap_size):
+                tile = x[:, :, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]  # slicing clips at the borders
+                tile = self.encoder(tile)
+                tile = self.quant_conv(tile)
+                row.append(tile)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)  # blending writes into `tile` in place
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=-1))
+
+        moments = torch.cat(result_rows, dim=-2)
+        if return_moments:
+            return moments
+
+        posterior = DiagonalGaussianDistribution(moments)
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def spatial_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+        r"""
+        Decode a batch of images/videos using a tiled decoder.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))  # stride between tile origins on the latents
+        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)  # width of the blended band on the decoded tiles
+        row_limit = self.tile_sample_min_size - blend_extent  # rows/columns of each decoded tile kept in the output
+
+        # Split z into overlapping tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, z.shape[-2], overlap_size):
+            row = []
+            for j in range(0, z.shape[-1], overlap_size):
+                tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]  # slicing clips at the borders
+                tile = self.post_quant_conv(tile)
+                decoded = self.decoder(tile)
+                row.append(decoded)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)  # blending writes into `tile` in place
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=-1))
+
+        dec = torch.cat(result_rows, dim=-2)
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    def temporal_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
+        """Encode `x` in overlapping tiles along dimension 2 and blend the encoded tiles back together along that dimension."""
+        B, C, T, H, W = x.shape  # only T is used below
+        overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))  # stride between tile starts on the input
+        blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)  # number of encoded frames that get blended
+        t_limit = self.tile_latent_min_tsize - blend_extent  # encoded frames of each tile kept in the output
+
+        # Split the video into tiles and encode them separately.
+        row = []
+        for i in range(0, T, overlap_size):
+            tile = x[:, :, i : i + self.tile_sample_min_tsize + 1, :, :]  # one frame more than tile_sample_min_tsize
+            if self.use_spatial_tiling and (
+                tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size
+            ):
+                tile = self.spatial_tiled_encode(tile, return_moments=True)  # raw moments, so both branches yield the same kind of tensor
+            else:
+                tile = self.encoder(tile)
+                tile = self.quant_conv(tile)
+            if i > 0:
+                tile = tile[:, :, 1:, :, :]  # every tile but the first drops its first encoded frame — presumably the extra leading frame of the causal encoder, TODO confirm
+            row.append(tile)
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_extent)  # blending writes into `tile` in place
+                result_row.append(tile[:, :, :t_limit, :, :])
+            else:
+                result_row.append(tile[:, :, : t_limit + 1, :, :])  # the first tile keeps one frame more
+
+        moments = torch.cat(result_row, dim=2)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def temporal_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+        # Split z into overlapping tiles along dimension 2, decode them separately, then blend and concatenate the results.
+
+        B, C, T, H, W = z.shape  # only T is used below
+        overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))  # stride between tile starts on the latents
+        blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)  # number of decoded frames that get blended
+        t_limit = self.tile_sample_min_tsize - blend_extent  # decoded frames of each tile kept in the output
+
+        row = []
+        for i in range(0, T, overlap_size):
+            tile = z[:, :, i : i + self.tile_latent_min_tsize + 1, :, :]  # one frame more than tile_latent_min_tsize
+            if self.use_spatial_tiling and (
+                tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size
+            ):
+                decoded = self.spatial_tiled_decode(tile, return_dict=True).sample
+            else:
+                tile = self.post_quant_conv(tile)
+                decoded = self.decoder(tile)
+            if i > 0:
+                decoded = decoded[:, :, 1:, :, :]  # every tile but the first drops its first decoded frame
+            row.append(decoded)
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_extent)  # blending writes into `tile` in place
+                result_row.append(tile[:, :, :t_limit, :, :])
+            else:
+                result_row.append(tile[:, :, : t_limit + 1, :, :])  # the first tile keeps one frame more
+
+        dec = torch.cat(result_row, dim=2)
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        return_posterior: bool = False,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[DecoderOutput2, torch.FloatTensor]:
+        r"""
+        Encode `sample`, take a latent from the resulting posterior and decode that latent again.
+
+        Args:
+            sample (`torch.FloatTensor`): Input sample.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Whether to sample from the posterior (using `generator`) instead of taking its mode.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput2`] instead of a plain tuple.
+            return_posterior (`bool`, *optional*, defaults to `False`):
+                Whether to return the posterior alongside the decoded sample.
+            generator (`torch.Generator`, *optional*): Passed to `posterior.sample` when sampling.
+
+        Returns:
+            [`DecoderOutput2`] if `return_dict` is True, otherwise `(dec,)` or `(dec, posterior)`.
+        """
+        posterior = self.encode(sample).latent_dist
+        z = posterior.sample(generator=generator) if sample_posterior else posterior.mode()
+        dec = self.decode(z).sample
+
+        if not return_dict:
+            return (dec, posterior) if return_posterior else (dec,)
+        if return_posterior:
+            return DecoderOutput2(sample=dec, posterior=posterior)
+        return DecoderOutput2(sample=dec)
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+        The processors active beforehand are kept in `self.original_attn_processors`.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        if any("Added" in str(proc.__class__.__name__) for proc in self.attn_processors.values()):
+            raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        # stored only after the check passed, so a rejected call leaves `original_attn_processors` as None
+        self.original_attn_processors = self.attn_processors
+
+        for attention in (m for m in self.modules() if isinstance(m, Attention)):
+            attention.fuse_projections(fuse=True)
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled; does nothing when `fuse_qkv_projections` was never called.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        original_processors = getattr(self, "original_attn_processors", None)  # the attribute only exists once `fuse_qkv_projections` has run
+        if original_processors is not None:
+            self.set_attn_processor(original_processors)
diff --git a/hunyuan_model/embed_layers.py b/hunyuan_model/embed_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..e31ba9cc58d1aa05e0f17b919762f69bd693b5c0
--- /dev/null
+++ b/hunyuan_model/embed_layers.py
@@ -0,0 +1,132 @@
+import collections
+import math
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+
+from .helpers import to_2tuple
+
+class PatchEmbed(nn.Module):
+    """Patch embedding built on a strided `nn.Conv3d`.
+
+    The convolution uses `patch_size` as both kernel size and stride, so every patch of the input is
+    projected to `embed_dim` channels exactly once; an optional norm layer is applied afterwards.
+
+    Based on the impl in https://github.com/google-research/vision_transformer
+
+    Hacked together by / Copyright 2020 Ross Wightman
+
+    The `_assert` call was removed from the forward function to be compatible with
+    multi-resolution inputs.
+    """
+
+    def __init__(
+        self,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        norm_layer=None,
+        flatten=True,
+        bias=True,
+        dtype=None,
+        device=None,
+    ):
+        super().__init__()
+        self.patch_size = to_2tuple(patch_size)
+        self.flatten = flatten
+
+        self.proj = nn.Conv3d(
+            in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=bias, dtype=dtype, device=device
+        )
+        # Xavier-initialise the kernel viewed as a 2-D (out_channels, everything else) matrix
+        nn.init.xavier_uniform_(self.proj.weight.view(self.proj.weight.size(0), -1))
+        if bias:
+            nn.init.zeros_(self.proj.bias)
+
+        self.norm = nn.Identity() if not norm_layer else norm_layer(embed_dim)
+
+    def forward(self, x):
+        x = self.proj(x)
+        if self.flatten:
+            x = x.flatten(2).transpose(1, 2)  # (B, C, ...) -> (B, N, C)
+        return self.norm(x)
+
+
+class TextProjection(nn.Module):
+    """
+    Two-layer MLP (linear -> activation -> linear) that projects text embeddings to `hidden_size`.
+
+    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
+    """
+
+    def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
+        super().__init__()
+        linear_kwargs = {"bias": True, "dtype": dtype, "device": device}
+        self.linear_1 = nn.Linear(in_features=in_channels, out_features=hidden_size, **linear_kwargs)
+        self.act_1 = act_layer()
+        self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, **linear_kwargs)
+
+    def forward(self, caption):
+        # linear_1 -> act_1 -> linear_2
+        projected = self.linear_1(caption)
+        projected = self.act_1(projected)
+        return self.linear_2(projected)
+
+
+def timestep_embedding(t, dim, max_period=10000):
+    """
+    Build sinusoidal embeddings of the timesteps `t`: cosines in the first half, sines in the second.
+
+    Args:
+        t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
+        dim (int): the dimension of the output; when `dim` is odd, a zero column is appended.
+        max_period (int): controls the minimum frequency of the embeddings.
+
+    Returns:
+        embedding (torch.Tensor): An (N, D) float32 Tensor of positional embeddings.
+
+    .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+    """
+    half = dim // 2
+    exponents = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+    args = t[:, None].float() * torch.exp(exponents).to(device=t.device)[None]
+    parts = [torch.cos(args), torch.sin(args)]
+    if dim % 2:
+        parts.append(torch.zeros_like(args[:, :1]))  # pad an odd `dim` with a zero column
+    return torch.cat(parts, dim=-1)
+
+
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps: sinusoidal features from `timestep_embedding` followed by a two-layer MLP
+    mapping `frequency_embedding_size` -> `hidden_size` -> `out_size` (`out_size` defaults to `hidden_size`).
+    """
+
+    def __init__(
+        self,
+        hidden_size,
+        act_layer,
+        frequency_embedding_size=256,
+        max_period=10000,
+        out_size=None,
+        dtype=None,
+        device=None,
+    ):
+        super().__init__()
+        self.frequency_embedding_size = frequency_embedding_size
+        self.max_period = max_period
+        out_size = hidden_size if out_size is None else out_size
+
+        first = nn.Linear(frequency_embedding_size, hidden_size, bias=True, dtype=dtype, device=device)
+        activation = act_layer()
+        second = nn.Linear(hidden_size, out_size, bias=True, dtype=dtype, device=device)
+        self.mlp = nn.Sequential(first, activation, second)
+        # only the weights are re-initialised here; the biases keep what nn.Linear gave them
+        for linear in (first, second):
+            nn.init.normal_(linear.weight, std=0.02)
+
+    def forward(self, t):
+        """Return the MLP embedding of `t`; the sinusoidal features are cast to the MLP weight dtype first."""
+        t_freq = timestep_embedding(t, self.frequency_embedding_size, self.max_period)
+        t_freq = t_freq.type(self.mlp[0].weight.dtype)
+        return self.mlp(t_freq)
diff --git a/hunyuan_model/fp8_optimization.py b/hunyuan_model/fp8_optimization.py
new file mode 100644
index 0000000000000000000000000000000000000000..90b978baca8cd9a3401b8b66a6575c0c3c29c991
--- /dev/null
+++ b/hunyuan_model/fp8_optimization.py
@@ -0,0 +1,39 @@
+#based on ComfyUI's and MinusZoneAI's fp8_linear optimization
+#further borrowed from HunyuanVideoWrapper for Musubi Tuner
+import torch
+import torch.nn as nn
+
+def fp8_linear_forward(cls, original_dtype, input):  # cls: the patched nn.Linear (see convert_fp8_linear); original_dtype: dtype of the output / fallback path
+    weight_dtype = cls.weight.dtype
+    if weight_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
+        if len(input.shape) == 3:
+            target_dtype = torch.float8_e5m2 if weight_dtype == torch.float8_e4m3fn else torch.float8_e4m3fn  # activations use the other fp8 format than the weights
+            inn = input.reshape(-1, input.shape[2]).to(target_dtype)  # merge the two leading dims into one so the matmul operand is 2-D
+            w = cls.weight.t()
+            # Unit scales for both operands: no rescaling is applied around the fp8 matmul.
+            scale = torch.ones((1), device=input.device, dtype=torch.float32)
+            bias = cls.bias.to(original_dtype) if cls.bias is not None else None
+            # The bias keyword is only passed when the layer actually has a bias.
+            if bias is not None:
+                o = torch._scaled_mm(inn, w, out_dtype=original_dtype, bias=bias, scale_a=scale, scale_b=scale)
+            else:
+                o = torch._scaled_mm(inn, w, out_dtype=original_dtype, scale_a=scale, scale_b=scale)
+            # NOTE(review): presumably some PyTorch versions return a tuple from _scaled_mm; only its first element is used - confirm.
+            if isinstance(o, tuple):
+                o = o[0]
+            # Restore a 3-D result: (-1, input.shape[1], number of weight rows).
+            return o.reshape((-1, input.shape[1], cls.weight.shape[0]))
+        else:
+            return cls.original_forward(input.to(original_dtype))  # non-3-D input: saved forward, with the input cast to original_dtype
+    else:
+        return cls.original_forward(input)  # weights are not fp8: saved forward on the untouched input
+
+def convert_fp8_linear(module, original_dtype, params_to_keep=()):
+    """Patch every nn.Linear under `module` (except those whose name contains a `params_to_keep` keyword) to call `fp8_linear_forward`."""
+    module.fp8_matmul_enabled = True
+    for name, child in module.named_modules():
+        if not isinstance(child, nn.Linear) or any(keyword in name for keyword in params_to_keep):
+            continue
+        if not hasattr(child, "original_forward"):  # already patched: saving the wrapper again would make original_forward recurse forever
+            child.original_forward = child.forward
+        child.forward = lambda input, m=child: fp8_linear_forward(m, original_dtype, input)  # m=child binds the layer per iteration
diff --git a/hunyuan_model/helpers.py b/hunyuan_model/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..72ab8cb1feba4ce7782f1ea841fd42c71be7b0d1
--- /dev/null
+++ b/hunyuan_model/helpers.py
@@ -0,0 +1,40 @@
+import collections.abc
+
+from itertools import repeat
+
+
+def _ntuple(n):
+    """Build a converter that repeats a scalar (or the sole item of a 1-element iterable) n times; other iterables become tuples as-is."""
+    def parse(x):
+        if isinstance(x, str) or not isinstance(x, collections.abc.Iterable):
+            return (x,) * n
+        items = tuple(x)
+        return items * n if len(items) == 1 else items
+
+    return parse
+
+
+to_1tuple = _ntuple(1)  # converters built by _ntuple for lengths 1 to 4
+to_2tuple = _ntuple(2)  # e.g. to_2tuple(True) -> (True, True); to_2tuple((a, b)) -> (a, b)
+to_3tuple = _ntuple(3)
+to_4tuple = _ntuple(4)
+
+
+def as_tuple(x):
+    """Return `x` as a tuple: non-str iterables are converted, None/int/float/str are wrapped, anything else raises ValueError."""
+    if not isinstance(x, str) and isinstance(x, collections.abc.Iterable):
+        return tuple(x)
+    if x is not None and not isinstance(x, (int, float, str)):
+        raise ValueError(f"Unknown type {type(x)}")
+    return (x,)
+
+
+def as_list_of_2tuple(x):
+    """Group the values of `as_tuple(x)` into consecutive (a, b) pairs; a single value `v` yields [(v, v)]."""
+    values = as_tuple(x)
+    if len(values) == 1:
+        values = values * 2
+    assert len(values) % 2 == 0, f"Expect even length, got {len(values)}."
+    # Pair each even-indexed item with the odd-indexed item that follows it.
+    pairs = list(zip(values[0::2], values[1::2]))
+    return pairs
diff --git a/hunyuan_model/mlp_layers.py b/hunyuan_model/mlp_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcc9547a6a0ba80ab19a472a9ea7aef525f46613
--- /dev/null
+++ b/hunyuan_model/mlp_layers.py
@@ -0,0 +1,118 @@
+# Modified from timm library:
+# https://github.com/huggingface/pytorch-image-models/blob/648aaa41233ba83eb38faf5ba9d415d574823241/timm/layers/mlp.py#L13
+
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from .modulate_layers import modulate
+from .helpers import to_2tuple
+
+
+class MLP(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks: fc1 -> act -> drop1 -> norm -> fc2 -> drop2."""
+
+    def __init__(
+        self,
+        in_channels,  # input feature size of fc1
+        hidden_channels=None,  # width between fc1 and fc2; falls back to in_channels
+        out_features=None,  # output feature size of fc2; falls back to in_channels
+        act_layer=nn.GELU,  # zero-argument callable building the activation module
+        norm_layer=None,  # optional callable(hidden_channels, device=..., dtype=...); nn.Identity is used when None
+        bias=True,  # single flag or pair: bias for (fc1, fc2)
+        drop=0.0,  # single probability or pair: dropout for (drop1, drop2)
+        use_conv=False,  # build fc1/fc2 as nn.Conv2d with kernel_size=1 instead of nn.Linear
+        device=None,  # device / dtype are passed to fc1, fc2 and norm_layer
+        dtype=None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        out_features = out_features or in_channels
+        hidden_channels = hidden_channels or in_channels
+        bias = to_2tuple(bias)  # index 0 is used for fc1, index 1 for fc2
+        drop_probs = to_2tuple(drop)  # index 0 is used for drop1, index 1 for drop2
+        linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
+
+        self.fc1 = linear_layer(
+            in_channels, hidden_channels, bias=bias[0], **factory_kwargs
+        )
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.norm = (
+            norm_layer(hidden_channels, **factory_kwargs)
+            if norm_layer is not None
+            else nn.Identity()
+        )
+        self.fc2 = linear_layer(
+            hidden_channels, out_features, bias=bias[1], **factory_kwargs
+        )
+        self.drop2 = nn.Dropout(drop_probs[1])
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop1(x)
+        x = self.norm(x)  # note: the norm runs after the activation and first dropout
+        x = self.fc2(x)
+        x = self.drop2(x)
+        return x
+
+
+# 
+class MLPEmbedder(nn.Module):
+    """Linear -> SiLU -> Linear embedder; copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py"""
+    def __init__(self, in_dim: int, hidden_dim: int, device=None, dtype=None):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True, **factory_kwargs)
+        self.silu = nn.SiLU()
+        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True, **factory_kwargs)  # output width equals hidden_dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.out_layer(self.silu(self.in_layer(x)))
+
+
+class FinalLayer(nn.Module):
+    """The final layer of DiT: LayerNorm -> shift/scale modulation -> linear projection to patch values.
+
+    Args:
+        hidden_size: width of the incoming token features.
+        patch_size: an int (projects to patch_size**2 * out_channels) or a 3-item sequence (projects to p0 * p1 * p2 * out_channels).
+        out_channels: number of channels produced per patch element.
+        act_layer: zero-argument callable building the activation placed before the modulation Linear.
+        device, dtype: passed to every layer created here.
+    """
+
+    def __init__(
+        self, hidden_size, patch_size, out_channels, act_layer, device=None, dtype=None
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+
+        # Just use LayerNorm for the final layer
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
+        if isinstance(patch_size, int):
+            patch_volume = patch_size * patch_size
+        else:
+            patch_volume = patch_size[0] * patch_size[1] * patch_size[2]
+        # A single construction site so that device/dtype apply to both the int and the sequence patch_size forms.
+        self.linear = nn.Linear(hidden_size, patch_volume * out_channels, bias=True, **factory_kwargs)
+        nn.init.zeros_(self.linear.weight)
+        nn.init.zeros_(self.linear.bias)
+
+        # Here we don't distinguish between the modulate types. Just use the simple one.
+        self.adaLN_modulation = nn.Sequential(
+            act_layer(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
+        )
+        # Zero-initialize the modulation
+        nn.init.zeros_(self.adaLN_modulation[1].weight)
+        nn.init.zeros_(self.adaLN_modulation[1].bias)
+
+    def forward(self, x, c):
+        """Modulate the normalized `x` with the shift/scale halves derived from `c` (split along dim 1), then project."""
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        x = modulate(self.norm_final(x), shift=shift, scale=scale)
+        x = self.linear(x)
+        return x
diff --git a/hunyuan_model/models.py b/hunyuan_model/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..20921f4beb93f35d241020a4f14474e29bcf485a
--- /dev/null
+++ b/hunyuan_model/models.py
@@ -0,0 +1,1044 @@
+import os
+from typing import Any, List, Tuple, Optional, Union, Dict
+import accelerate
+from einops import rearrange
+
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+
+from .activation_layers import get_activation_layer
+from .norm_layers import get_norm_layer
+from .embed_layers import TimestepEmbedder, PatchEmbed, TextProjection
+from .attention import attention, parallel_attention, get_cu_seqlens
+from .posemb_layers import apply_rotary_emb
+from .mlp_layers import MLP, MLPEmbedder, FinalLayer
+from .modulate_layers import ModulateDiT, modulate, apply_gate
+from .token_refiner import SingleTokenRefiner
+from modules.custom_offloading_utils import ModelOffloader, synchronize_device, clean_memory_on_device
+from hunyuan_model.posemb_layers import get_nd_rotary_pos_embed
+
+from utils.safetensors_utils import MemoryEfficientSafeOpen
+
+
+class MMDoubleStreamBlock(nn.Module):
+    """
+    A multimodal dit block with separate modulation for
+    text and image/video, see more details (SD3): https://arxiv.org/abs/2403.03206
+                                     (Flux.1): https://github.com/black-forest-labs/flux
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,  # token feature width shared by the image and text streams
+        heads_num: int,  # number of attention heads; head_dim = hidden_size // heads_num
+        mlp_width_ratio: float,  # MLP hidden width = int(hidden_size * mlp_width_ratio)
+        mlp_act_type: str = "gelu_tanh",  # name resolved through get_activation_layer
+        qk_norm: bool = True,  # when False the q/k norms are nn.Identity
+        qk_norm_type: str = "rms",  # name resolved through get_norm_layer
+        qkv_bias: bool = False,  # bias flag for the qkv and the attention-output projections
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        attn_mode: str = "flash",  # passed as `mode` to attention()
+        split_attn: bool = False,  # stored on the instance; not read inside this class
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.attn_mode = attn_mode
+        self.split_attn = split_attn
+
+        self.deterministic = False  # toggled by enable_/disable_deterministic; not read inside this class
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+
+        self.img_mod = ModulateDiT(
+            hidden_size,
+            factor=6,
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
+
+        self.img_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.img_attn_q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
+        )
+        self.img_attn_k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
+        )
+        self.img_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
+
+        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
+        self.img_mlp = MLP(
+            hidden_size,
+            mlp_hidden_dim,
+            act_layer=get_activation_layer(mlp_act_type),
+            bias=True,
+            **factory_kwargs,
+        )
+
+        self.txt_mod = ModulateDiT(
+            hidden_size,
+            factor=6,
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
+
+        self.txt_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
+        self.txt_attn_q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
+        )
+        self.txt_attn_k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
+        )
+        self.txt_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
+
+        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
+        self.txt_mlp = MLP(
+            hidden_size,
+            mlp_hidden_dim,
+            act_layer=get_activation_layer(mlp_act_type),
+            bias=True,
+            **factory_kwargs,
+        )
+        self.hybrid_seq_parallel_attn = None  # when set to a truthy object, _forward uses parallel_attention instead of attention
+
+        self.gradient_checkpointing = False
+
+    def enable_deterministic(self):
+        self.deterministic = True
+
+    def disable_deterministic(self):
+        self.deterministic = False
+
+    def enable_gradient_checkpointing(self):
+        self.gradient_checkpointing = True
+
+    def disable_gradient_checkpointing(self):
+        self.gradient_checkpointing = False
+
+    def _forward(
+        self,
+        img: torch.Tensor,
+        txt: torch.Tensor,
+        vec: torch.Tensor,
+        attn_mask: Optional[torch.Tensor] = None,
+        total_len: Optional[torch.Tensor] = None,
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        freqs_cis: tuple = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Run one double-stream step: each stream is modulated and projected separately, attention runs on the concatenation.
+
+        Args:
+            img, txt: image/video and text tokens, each indexed as (batch, length, hidden) by the rearrange patterns below.
+            vec: conditioning passed to `img_mod` / `txt_mod`, each producing six shift/scale/gate chunks.
+            attn_mask, total_len, max_seqlen_q, max_seqlen_kv: forwarded unchanged to `attention`.
+            cu_seqlens_q, cu_seqlens_kv: forwarded to the attention call; cu_seqlens_q must have 2 * batch + 1 entries (asserted).
+            freqs_cis: inputs for `apply_rotary_emb`, applied to the image q/k only; RoPE is skipped when None.
+
+        Returns:
+            The updated `(img, txt)` pair.
+        """
+        (img_mod1_shift, img_mod1_scale, img_mod1_gate, img_mod2_shift, img_mod2_scale, img_mod2_gate) = self.img_mod(vec).chunk(
+            6, dim=-1
+        )
+        (txt_mod1_shift, txt_mod1_scale, txt_mod1_gate, txt_mod2_shift, txt_mod2_scale, txt_mod2_gate) = self.txt_mod(vec).chunk(
+            6, dim=-1
+        )
+
+        # Prepare image for attention.
+        img_modulated = self.img_norm1(img)
+        img_modulated = modulate(img_modulated, shift=img_mod1_shift, scale=img_mod1_scale)
+        img_qkv = self.img_attn_qkv(img_modulated)
+        img_modulated = None  # intermediates are dropped as soon as they are consumed (same pattern throughout this method)
+        img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
+        img_qkv = None
+        # Apply QK-Norm if needed
+        img_q = self.img_attn_q_norm(img_q).to(img_v)  # cast back to the dtype/device of v after the norm
+        img_k = self.img_attn_k_norm(img_k).to(img_v)
+
+        # Apply RoPE if needed.
+        if freqs_cis is not None:
+            img_q_shape = img_q.shape
+            img_k_shape = img_k.shape
+            img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
+            assert (
+                img_q.shape == img_q_shape and img_k.shape == img_k_shape
+            ), f"img_kk: {img_q.shape}, img_q: {img_q_shape}, img_kk: {img_k.shape}, img_k: {img_k_shape}"
+
+        # Prepare txt for attention.
+        txt_modulated = self.txt_norm1(txt)
+        txt_modulated = modulate(txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale)
+        txt_qkv = self.txt_attn_qkv(txt_modulated)
+        txt_modulated = None
+        txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
+        txt_qkv = None
+        # Apply QK-Norm if needed.
+        txt_q = self.txt_attn_q_norm(txt_q).to(txt_v)
+        txt_k = self.txt_attn_k_norm(txt_k).to(txt_v)
+
+        # Run actual attention.
+        img_q_len = img_q.shape[1]
+        img_kv_len = img_k.shape[1]
+        batch_size = img_k.shape[0]
+        q = torch.cat((img_q, txt_q), dim=1)  # joint sequence: image tokens first, then text tokens
+        img_q = txt_q = None
+        k = torch.cat((img_k, txt_k), dim=1)
+        img_k = txt_k = None
+        v = torch.cat((img_v, txt_v), dim=1)
+        img_v = txt_v = None
+
+        assert (
+            cu_seqlens_q.shape[0] == 2 * img.shape[0] + 1
+        ), f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, img.shape[0]:{img.shape[0]}"
+
+        # attention computation start
+        if not self.hybrid_seq_parallel_attn:
+            l = [q, k, v]  # q/k/v are handed over in a list and the local names cleared right after
+            q = k = v = None
+            attn = attention(
+                l,
+                mode=self.attn_mode,
+                attn_mask=attn_mask,
+                total_len=total_len,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_kv=max_seqlen_kv,
+                batch_size=batch_size,
+            )
+        else:
+            attn = parallel_attention(
+                self.hybrid_seq_parallel_attn,
+                q,
+                k,
+                v,
+                img_q_len=img_q_len,
+                img_kv_len=img_kv_len,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+            )
+
+        # attention computation end
+
+        img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1] :]  # split back at the image length along dim 1
+        attn = None
+
+        # Calculate the img blocks.
+        img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate)
+        img_attn = None
+        img = img + apply_gate(
+            self.img_mlp(modulate(self.img_norm2(img), shift=img_mod2_shift, scale=img_mod2_scale)),
+            gate=img_mod2_gate,
+        )
+
+        # Calculate the txt blocks.
+        txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate)
+        txt_attn = None
+        txt = txt + apply_gate(
+            self.txt_mlp(modulate(self.txt_norm2(txt), shift=txt_mod2_shift, scale=txt_mod2_scale)),
+            gate=txt_mod2_gate,
+        )
+
+        return img, txt
+
+    def forward(self, *args, **kwargs):
+        """Call `_forward` with the same arguments, through `checkpoint` when training with gradient checkpointing enabled."""
+        if self.training and self.gradient_checkpointing:
+            return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
+        else:
+            return self._forward(*args, **kwargs)
+
+
+class MMSingleStreamBlock(nn.Module):
+    """
+    A DiT block with parallel linear layers as described in
+    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
+    Also refer to (SD3): https://arxiv.org/abs/2403.03206
+                  (Flux.1): https://github.com/black-forest-labs/flux
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,  # token feature width
+        heads_num: int,  # number of attention heads; head_dim = hidden_size // heads_num
+        mlp_width_ratio: float = 4.0,  # MLP hidden width = int(hidden_size * mlp_width_ratio)
+        mlp_act_type: str = "gelu_tanh",  # name resolved through get_activation_layer
+        qk_norm: bool = True,  # when False the q/k norms are nn.Identity
+        qk_norm_type: str = "rms",  # name resolved through get_norm_layer
+        qk_scale: float = None,  # stored as self.scale (default head_dim ** -0.5); not read inside this class
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        attn_mode: str = "flash",  # passed as `mode` to attention()
+        split_attn: bool = False,  # stored on the instance; not read inside this class
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.attn_mode = attn_mode
+        self.split_attn = split_attn
+
+        self.deterministic = False
+        self.hidden_size = hidden_size
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+        self.mlp_hidden_dim = mlp_hidden_dim
+        self.scale = qk_scale or head_dim**-0.5
+
+        # qkv and mlp_in
+        self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim, **factory_kwargs)
+        # proj and mlp_out
+        self.linear2 = nn.Linear(hidden_size + mlp_hidden_dim, hidden_size, **factory_kwargs)
+
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.q_norm = qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
+        self.k_norm = qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
+
+        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
+
+        self.mlp_act = get_activation_layer(mlp_act_type)()
+        self.modulation = ModulateDiT(hidden_size, factor=3, act_layer=get_activation_layer("silu"), **factory_kwargs)
+        self.hybrid_seq_parallel_attn = None  # when set to a truthy object, _forward uses parallel_attention instead of attention
+
+        self.gradient_checkpointing = False
+
+    def enable_deterministic(self):
+        self.deterministic = True
+
+    def disable_deterministic(self):
+        self.deterministic = False
+
+    def enable_gradient_checkpointing(self):
+        self.gradient_checkpointing = True
+
+    def disable_gradient_checkpointing(self):
+        self.gradient_checkpointing = False
+
+    def _forward(
+        self,
+        x: torch.Tensor,
+        vec: torch.Tensor,
+        txt_len: int,
+        attn_mask: Optional[torch.Tensor] = None,
+        total_len: Optional[torch.Tensor] = None,
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Apply the block to the joint sequence `x`.
+
+        Args:
+            x: joint token sequence; the last `txt_len` positions along dim 1 are text tokens, the ones before them image tokens.
+            vec: conditioning passed to the modulation layer to obtain the shift / scale / gate chunks.
+            txt_len: number of trailing text tokens in `x`; RoPE is applied to the image tokens only.
+            attn_mask, total_len, max_seqlen_q, max_seqlen_kv: forwarded unchanged to `attention`.
+            cu_seqlens_q, cu_seqlens_kv: forwarded to the attention call; cu_seqlens_q must have 2 * batch + 1 entries (asserted).
+            freqs_cis: inputs for `apply_rotary_emb`; RoPE is skipped when None.
+
+        Returns:
+            `x` plus the gated output of the fused attention + MLP branch.
+        """
+        mod_shift, mod_scale, mod_gate = self.modulation(vec).chunk(3, dim=-1)
+        x_mod = modulate(self.pre_norm(x), shift=mod_shift, scale=mod_scale)
+        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        x_mod = None
+
+        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
+        qkv = None
+
+        # Apply QK-Norm if needed.
+        q = self.q_norm(q).to(v)
+        k = self.k_norm(k).to(v)
+
+        # Image tokens come first, text tokens last. Slice with the explicit image length:
+        # `:-txt_len` would select nothing when txt_len == 0. The length is also needed below.
+        img_len = q.shape[1] - txt_len
+
+        # Apply RoPE if needed.
+        if freqs_cis is not None:
+            img_q, txt_q = q[:, :img_len, :, :], q[:, img_len:, :, :]
+            img_k, txt_k = k[:, :img_len, :, :], k[:, img_len:, :, :]
+            q = k = None
+            img_q_shape = img_q.shape
+            img_k_shape = img_k.shape
+            img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
+            assert (
+                img_q.shape == img_q_shape and img_k_shape == img_k.shape
+            ), f"img_qq: {img_q.shape}, img_q: {img_q_shape}, img_kk: {img_k.shape}, img_k: {img_k_shape}"
+            q = torch.cat((img_q, txt_q), dim=1)
+            k = torch.cat((img_k, txt_k), dim=1)
+            del img_q, txt_q, img_k, txt_k
+
+        # Compute attention.
+        assert cu_seqlens_q.shape[0] == 2 * x.shape[0] + 1, f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, x.shape[0]:{x.shape[0]}"
+
+        # attention computation start
+        if not self.hybrid_seq_parallel_attn:
+            l = [q, k, v]
+            q = k = v = None
+            attn = attention(
+                l,
+                mode=self.attn_mode,
+                attn_mask=attn_mask,
+                total_len=total_len,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_kv=max_seqlen_kv,
+                batch_size=x.shape[0],
+            )
+        else:
+            attn = parallel_attention(
+                self.hybrid_seq_parallel_attn,
+                q,
+                k,
+                v,
+                img_q_len=img_len,
+                img_kv_len=img_len,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+            )
+        # attention computation end
+
+        # Compute activation in mlp stream, cat again and run second linear layer.
+        mlp = self.mlp_act(mlp)
+        attn_mlp = torch.cat((attn, mlp), 2)
+        attn = None
+        mlp = None
+        output = self.linear2(attn_mlp)
+        attn_mlp = None
+        return x + apply_gate(output, gate=mod_gate)
+
+    def forward(self, *args, **kwargs):
+        """Call `_forward` with the same arguments, through `checkpoint` when training with gradient checkpointing enabled."""
+        if self.training and self.gradient_checkpointing:
+            return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
+        else:
+            return self._forward(*args, **kwargs)
+
+
+class HYVideoDiffusionTransformer(nn.Module):  # ModelMixin, ConfigMixin):
+    """
+    HunyuanVideo Transformer backbone
+
+    Inherited from ModelMixin and ConfigMixin for compatibility with diffusers' sampler StableDiffusionPipeline.
+
+    Reference:
+    [1] Flux.1: https://github.com/black-forest-labs/flux
+    [2] MMDiT: http://arxiv.org/abs/2403.03206
+
+    Parameters
+    ----------
+    args: argparse.Namespace
+        The arguments parsed by argparse.
+    patch_size: list
+        The size of the patch.
+    in_channels: int
+        The number of input channels.
+    out_channels: int
+        The number of output channels.
+    hidden_size: int
+        The hidden size of the transformer backbone.
+    heads_num: int
+        The number of attention heads.
+    mlp_width_ratio: float
+        The ratio of the hidden size of the MLP in the transformer block.
+    mlp_act_type: str
+        The activation function of the MLP in the transformer block.
+    depth_double_blocks: int
+        The number of transformer blocks in the double blocks.
+    depth_single_blocks: int
+        The number of transformer blocks in the single blocks.
+    rope_dim_list: list
+        The dimension of the rotary embedding for t, h, w.
+    qkv_bias: bool
+        Whether to use bias in the qkv linear layer.
+    qk_norm: bool
+        Whether to use qk norm.
+    qk_norm_type: str
+        The type of qk norm.
+    guidance_embed: bool
+        Whether to use guidance embedding for distillation.
+    text_projection: str
+        The type of the text projection, default is single_refiner.
+    use_attention_mask: bool
+        Whether to use attention mask for text encoder.
+    dtype: torch.dtype
+        The dtype of the model.
+    device: torch.device
+        The device of the model.
+    attn_mode: str
+        The mode of the attention, default is flash.
+    split_attn: bool
+        Whether to use split attention (make attention as batch size 1).
+    """
+
+    # @register_to_config
+    def __init__(
+        self,
+        text_states_dim: int,
+        text_states_dim_2: int,
+        patch_size: list = [1, 2, 2],
+        in_channels: int = 4,  # Should be VAE.config.latent_channels.
+        out_channels: int = None,
+        hidden_size: int = 3072,
+        heads_num: int = 24,
+        mlp_width_ratio: float = 4.0,
+        mlp_act_type: str = "gelu_tanh",
+        mm_double_blocks_depth: int = 20,
+        mm_single_blocks_depth: int = 40,
+        rope_dim_list: List[int] = [16, 56, 56],
+        qkv_bias: bool = True,
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        guidance_embed: bool = False,  # For modulation.
+        text_projection: str = "single_refiner",
+        use_attention_mask: bool = True,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        attn_mode: str = "flash",
+        split_attn: bool = False,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+
+        self.patch_size = patch_size
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.unpatchify_channels = self.out_channels
+        self.guidance_embed = guidance_embed
+        self.rope_dim_list = rope_dim_list
+
+        # Text projection. Default to linear projection.
+        # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
+        self.use_attention_mask = use_attention_mask
+        self.text_projection = text_projection
+
+        self.text_states_dim = text_states_dim
+        self.text_states_dim_2 = text_states_dim_2
+
+        if hidden_size % heads_num != 0:
+            raise ValueError(f"Hidden size {hidden_size} must be divisible by heads_num {heads_num}")
+        pe_dim = hidden_size // heads_num
+        if sum(rope_dim_list) != pe_dim:
+            raise ValueError(f"Got {rope_dim_list} but expected positional dim {pe_dim}")
+        self.hidden_size = hidden_size
+        self.heads_num = heads_num
+
+        self.attn_mode = attn_mode
+        self.split_attn = split_attn
+        print(f"Using {self.attn_mode} attention mode, split_attn: {self.split_attn}")
+
+        # image projection
+        self.img_in = PatchEmbed(self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs)
+
+        # text projection
+        if self.text_projection == "linear":
+            self.txt_in = TextProjection(
+                self.text_states_dim,
+                self.hidden_size,
+                get_activation_layer("silu"),
+                **factory_kwargs,
+            )
+        elif self.text_projection == "single_refiner":
+            self.txt_in = SingleTokenRefiner(self.text_states_dim, hidden_size, heads_num, depth=2, **factory_kwargs)
+        else:
+            raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")
+
+        # time modulation
+        self.time_in = TimestepEmbedder(self.hidden_size, get_activation_layer("silu"), **factory_kwargs)
+
+        # text modulation
+        self.vector_in = MLPEmbedder(self.text_states_dim_2, self.hidden_size, **factory_kwargs)
+
+        # guidance modulation
+        self.guidance_in = (
+            TimestepEmbedder(self.hidden_size, get_activation_layer("silu"), **factory_kwargs) if guidance_embed else None
+        )
+
+        # double blocks
+        self.double_blocks = nn.ModuleList(
+            [
+                MMDoubleStreamBlock(
+                    self.hidden_size,
+                    self.heads_num,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_act_type=mlp_act_type,
+                    qk_norm=qk_norm,
+                    qk_norm_type=qk_norm_type,
+                    qkv_bias=qkv_bias,
+                    attn_mode=attn_mode,
+                    split_attn=split_attn,
+                    **factory_kwargs,
+                )
+                for _ in range(mm_double_blocks_depth)
+            ]
+        )
+
+        # single blocks
+        self.single_blocks = nn.ModuleList(
+            [
+                MMSingleStreamBlock(
+                    self.hidden_size,
+                    self.heads_num,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_act_type=mlp_act_type,
+                    qk_norm=qk_norm,
+                    qk_norm_type=qk_norm_type,
+                    attn_mode=attn_mode,
+                    split_attn=split_attn,
+                    **factory_kwargs,
+                )
+                for _ in range(mm_single_blocks_depth)
+            ]
+        )
+
+        self.final_layer = FinalLayer(
+            self.hidden_size,
+            self.patch_size,
+            self.out_channels,
+            get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+
+        self.gradient_checkpointing = False
+        self.blocks_to_swap = None
+        self.offloader_double = None
+        self.offloader_single = None
+        self._enable_img_in_txt_in_offloading = False
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    @property
+    def dtype(self):
+        return next(self.parameters()).dtype
+
+    def enable_gradient_checkpointing(self):
+        self.gradient_checkpointing = True
+
+        self.txt_in.enable_gradient_checkpointing()
+
+        for block in self.double_blocks + self.single_blocks:
+            block.enable_gradient_checkpointing()
+
+        print(f"HYVideoDiffusionTransformer: Gradient checkpointing enabled.")
+
+    def disable_gradient_checkpointing(self):
+        """Turn off gradient checkpointing for ``txt_in`` and for every double/single block."""
+        self.gradient_checkpointing = False
+        self.txt_in.disable_gradient_checkpointing()
+
+        # Same visiting order as when enabling: double blocks, then single blocks.
+        every_block = [*self.double_blocks, *self.single_blocks]
+        for blk in every_block:
+            blk.disable_gradient_checkpointing()
+
+        print(f"HYVideoDiffusionTransformer: Gradient checkpointing disabled.")
+
+    def enable_img_in_txt_in_offloading(self):
+        """
+        Enable offloading of the input embedders.
+
+        When the flag is set, ``forward`` moves ``img_in`` and ``txt_in`` to the
+        input's device right before they are used and back to the CPU afterwards.
+        """
+        self._enable_img_in_txt_in_offloading = True
+
+    def enable_block_swap(self, num_blocks: int, device: torch.device, supports_backward: bool):
+        """
+        Set up block swapping (offloading) for the double- and single-stream blocks.
+
+        ``num_blocks`` is split into ``num_blocks // 2`` double blocks and
+        ``(num_blocks - num_blocks // 2) * 2 + 1`` single blocks, and one
+        ``ModelOffloader`` is created for each block list.
+
+        Args:
+            num_blocks: swap budget from which both per-list swap counts are derived.
+            device: device handed to both ``ModelOffloader`` instances.
+            supports_backward: forwarded unchanged to both ``ModelOffloader`` instances.
+
+        Raises:
+            AssertionError: if either derived count exceeds the length of its block list minus one.
+        """
+        self.blocks_to_swap = num_blocks
+        self.num_double_blocks = len(self.double_blocks)
+        self.num_single_blocks = len(self.single_blocks)
+
+        n_double_swap = num_blocks // 2
+        n_single_swap = (num_blocks - n_double_swap) * 2 + 1
+        max_double_swap = self.num_double_blocks - 1
+        max_single_swap = self.num_single_blocks - 1
+
+        # At least one block of each kind has to stay out of the swap set.
+        within_limits = n_double_swap <= max_double_swap and n_single_swap <= max_single_swap
+        assert within_limits, (
+            f"Cannot swap more than {max_double_swap} double blocks and {max_single_swap} single blocks. "
+            f"Requested {n_double_swap} double blocks and {n_single_swap} single blocks."
+        )
+
+        self.offloader_double = ModelOffloader(
+            "double", self.double_blocks, self.num_double_blocks, n_double_swap, supports_backward, device
+        )
+        self.offloader_single = ModelOffloader(
+            "single", self.single_blocks, self.num_single_blocks, n_single_swap, supports_backward, device
+        )
+        print(
+            f"HYVideoDiffusionTransformer: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {n_double_swap}, single blocks: {n_single_swap}."
+        )
+
+    def switch_block_swap_for_inference(self):
+        """Put both offloaders into forward-only mode; does nothing when block swap is not enabled."""
+        if not self.blocks_to_swap:
+            return
+
+        for offloader in (self.offloader_double, self.offloader_single):
+            offloader.set_forward_only(True)
+        self.prepare_block_swap_before_forward()
+        print(f"HYVideoDiffusionTransformer: Block swap set to forward only.")
+
+    def switch_block_swap_for_training(self):
+        """Put both offloaders back into forward+backward mode; does nothing when block swap is not enabled."""
+        if not self.blocks_to_swap:
+            return
+
+        for offloader in (self.offloader_double, self.offloader_single):
+            offloader.set_forward_only(False)
+        self.prepare_block_swap_before_forward()
+        print(f"HYVideoDiffusionTransformer: Block swap set to forward and backward.")
+
+    def move_to_device_except_swap_blocks(self, device: torch.device):
+        """
+        Move the model to ``device`` while leaving the swappable blocks where they are.
+
+        When block swap is enabled, ``double_blocks`` and ``single_blocks`` are
+        detached from the module for the duration of ``self.to(device)`` so they
+        are not moved (this reduces temporary memory usage; the model is assumed
+        to be on the CPU). Without block swap the whole model is moved.
+
+        Args:
+            device: target device passed to ``self.to``.
+        """
+        if not self.blocks_to_swap:
+            self.to(device)
+            return
+
+        save_double_blocks = self.double_blocks
+        save_single_blocks = self.single_blocks
+        self.double_blocks = None
+        self.single_blocks = None
+        try:
+            self.to(device)
+        finally:
+            # Re-attach the blocks even if the move fails (e.g. out of memory);
+            # otherwise the model would be left without its transformer blocks.
+            self.double_blocks = save_double_blocks
+            self.single_blocks = save_single_blocks
+
+    def prepare_block_swap_before_forward(self):
+        """Let both offloaders prepare the block devices for a forward pass; does nothing without block swap."""
+        if self.blocks_to_swap in (None, 0):
+            return
+
+        offloader_and_blocks = (
+            (self.offloader_double, self.double_blocks),
+            (self.offloader_single, self.single_blocks),
+        )
+        for offloader, blocks in offloader_and_blocks:
+            offloader.prepare_block_devices_before_forward(blocks)
+
+    def enable_deterministic(self):
+        """Call ``enable_deterministic()`` on every double block, then on every single block."""
+        for block_list in (self.double_blocks, self.single_blocks):
+            for blk in block_list:
+                blk.enable_deterministic()
+
+    def disable_deterministic(self):
+        """Call ``disable_deterministic()`` on every double block, then on every single block."""
+        for block_list in (self.double_blocks, self.single_blocks):
+            for blk in block_list:
+                blk.disable_deterministic()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,  # Should be in range(0, 1000).
+        text_states: torch.Tensor = None,
+        text_mask: torch.Tensor = None,  # Used below for sequence lengths and attention masking.
+        text_states_2: Optional[torch.Tensor] = None,  # Text embedding for modulation.
+        freqs_cos: Optional[torch.Tensor] = None,
+        freqs_sin: Optional[torch.Tensor] = None,
+        guidance: torch.Tensor = None,  # Guidance for modulation, should be cfg_scale x 1000.
+        return_dict: bool = True,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Run the transformer: embed inputs, apply double then single blocks, and unpatchify.
+
+        Args:
+            x: 5-D input tensor; its last three dimensions are divided by ``self.patch_size``
+                to obtain the token grid passed to ``unpatchify``.
+            t: timestep tensor fed to ``time_in`` (and to ``txt_in`` for the "single_refiner" projection).
+            text_states: text features embedded by ``txt_in``.
+            text_mask: text mask; passed to ``get_cu_seqlens`` and summed over dim 1 to get
+                per-sample text lengths. NOTE(review): it defaults to None but is used
+                unconditionally below - confirm callers always supply it.
+            text_states_2: second text embedding, projected by ``vector_in`` and added to the modulation vector.
+            freqs_cos: rotary embedding cosine table; when None no rotary tables are passed to the blocks.
+            freqs_sin: rotary embedding sine table, paired with ``freqs_cos``.
+            guidance: guidance tensor, required when ``self.guidance_embed`` is set.
+            return_dict: when True return ``{"x": output}``, otherwise return the output tensor.
+
+        Returns:
+            The output of ``unpatchify``, optionally wrapped in a dict under the key ``"x"``.
+
+        Raises:
+            ValueError: if ``self.guidance_embed`` is set and ``guidance`` is None.
+            NotImplementedError: if ``self.text_projection`` is neither "linear" nor "single_refiner".
+        """
+        out = {}
+        img = x
+        txt = text_states
+        _, _, ot, oh, ow = x.shape
+        # Token grid size along each patched axis.
+        tt, th, tw = (
+            ot // self.patch_size[0],
+            oh // self.patch_size[1],
+            ow // self.patch_size[2],
+        )
+
+        # Prepare modulation vectors.
+        vec = self.time_in(t)
+
+        # text modulation
+        vec = vec + self.vector_in(text_states_2)
+
+        # guidance modulation
+        if self.guidance_embed:
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+
+            # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
+            vec = vec + self.guidance_in(guidance)
+
+        # Embed image and text.
+        if self._enable_img_in_txt_in_offloading:
+            # Bring the embedders to the input's device only for the duration of the embedding step.
+            self.img_in.to(x.device, non_blocking=True)
+            self.txt_in.to(x.device, non_blocking=True)
+            synchronize_device(x.device)
+
+        img = self.img_in(img)
+        if self.text_projection == "linear":
+            txt = self.txt_in(txt)
+        elif self.text_projection == "single_refiner":
+            txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
+        else:
+            raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")
+
+        if self._enable_img_in_txt_in_offloading:
+            # Send the embedders back to the CPU and free device memory.
+            self.img_in.to(torch.device("cpu"), non_blocking=True)
+            self.txt_in.to(torch.device("cpu"), non_blocking=True)
+            synchronize_device(x.device)
+            clean_memory_on_device(x.device)
+
+        txt_seq_len = txt.shape[1]
+        img_seq_len = img.shape[1]
+
+        # Compute cu_seqlens and max_seqlen for flash attention
+        cu_seqlens_q = get_cu_seqlens(text_mask, img_seq_len)
+        cu_seqlens_kv = cu_seqlens_q
+        max_seqlen_q = img_seq_len + txt_seq_len
+        max_seqlen_kv = max_seqlen_q
+
+        attn_mask = total_len = None
+        if self.split_attn or self.attn_mode == "torch":
+            # calculate text length and total length
+            text_len = text_mask.sum(dim=1)  #  (bs, )
+            total_len = img_seq_len + text_len  # (bs, )
+        if self.attn_mode == "torch" and not self.split_attn:
+            # initialize attention mask: bool tensor for sdpa, (b, 1, n, n)
+            bs = img.shape[0]
+            attn_mask = torch.zeros((bs, 1, max_seqlen_q, max_seqlen_q), dtype=torch.bool, device=text_mask.device)
+
+            # set attention mask with total_len
+            for i in range(bs):
+                attn_mask[i, :, : total_len[i], : total_len[i]] = True
+            total_len = None  # means we don't use split_attn
+
+        freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
+        # --------------------- Pass through DiT blocks ------------------------
+        for block_idx, block in enumerate(self.double_blocks):
+            double_block_args = [
+                img,
+                txt,
+                vec,
+                attn_mask,
+                total_len,
+                cu_seqlens_q,
+                cu_seqlens_kv,
+                max_seqlen_q,
+                max_seqlen_kv,
+                freqs_cis,
+            ]
+
+            if self.blocks_to_swap:
+                # Block until the offloader reports this block as ready.
+                self.offloader_double.wait_for_block(block_idx)
+
+            img, txt = block(*double_block_args)
+
+            if self.blocks_to_swap:
+                # Hand the finished block back to the offloader.
+                self.offloader_double.submit_move_blocks_forward(self.double_blocks, block_idx)
+
+        # Merge txt and img to pass through single stream blocks.
+        x = torch.cat((img, txt), 1)
+        if self.blocks_to_swap:
+            # delete img, txt to reduce memory usage
+            del img, txt
+            clean_memory_on_device(x.device)
+
+        if len(self.single_blocks) > 0:
+            for block_idx, block in enumerate(self.single_blocks):
+                single_block_args = [
+                    x,
+                    vec,
+                    txt_seq_len,
+                    attn_mask,
+                    total_len,
+                    cu_seqlens_q,
+                    cu_seqlens_kv,
+                    max_seqlen_q,
+                    max_seqlen_kv,
+                    freqs_cis,
+                ]
+                if self.blocks_to_swap:
+                    self.offloader_single.wait_for_block(block_idx)
+
+                x = block(*single_block_args)
+
+                if self.blocks_to_swap:
+                    self.offloader_single.submit_move_blocks_forward(self.single_blocks, block_idx)
+
+        # Image tokens come first in the concatenated sequence; drop the text tokens.
+        img = x[:, :img_seq_len, ...]
+        x = None
+
+        # ---------------------------- Final layer ------------------------------
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+
+        img = self.unpatchify(img, tt, th, tw)
+        if return_dict:
+            out["x"] = img
+            return out
+        return img
+
+    def unpatchify(self, x, t, h, w):
+        """
+        Fold a sequence of flattened patches back into a 5-D tensor.
+
+        Args:
+            x: patch tokens of shape (N, t * h * w, C * pt * ph * pw), where C is
+                ``self.unpatchify_channels`` and (pt, ph, pw) is ``self.patch_size``.
+            t, h, w: number of patches along each of the three patched axes.
+
+        Returns:
+            Tensor of shape (N, C, t * pt, h * ph, w * pw).
+        """
+        channels = self.unpatchify_channels
+        pt, ph, pw = self.patch_size
+        batch = x.shape[0]
+        assert t * h * w == x.shape[1]
+
+        # (N, t, h, w, C, pt, ph, pw) -> (N, C, t, pt, h, ph, w, pw)
+        patches = x.reshape(batch, t, h, w, channels, pt, ph, pw)
+        patches = patches.permute(0, 4, 1, 5, 2, 6, 3, 7)
+        return patches.reshape(batch, channels, t * pt, h * ph, w * pw)
+
+    def params_count(self):
+        """
+        Count parameters of the attention/MLP sub-modules and of the whole model.
+
+        Returns:
+            dict with the keys "double" (qkv, proj and mlp of both streams in every
+            double block), "single" (``linear1`` and ``linear2`` of every single
+            block), "total" (all parameters of the model) and "attn+mlp"
+            ("double" + "single").
+        """
+
+        def _numel(module):
+            return sum(p.numel() for p in module.parameters())
+
+        double_parts = (
+            "img_attn_qkv",
+            "img_attn_proj",
+            "img_mlp",
+            "txt_attn_qkv",
+            "txt_attn_proj",
+            "txt_mlp",
+        )
+        double_total = sum(_numel(getattr(blk, part)) for blk in self.double_blocks for part in double_parts)
+        single_total = sum(_numel(blk.linear1) + _numel(blk.linear2) for blk in self.single_blocks)
+
+        return {
+            "double": double_total,
+            "single": single_total,
+            "total": _numel(self),
+            "attn+mlp": double_total + single_total,
+        }
+
+
+#################################################################################
+#                             HunyuanVideo Configs                              #
+#################################################################################
+
+def _hyvideo_t2_base_config():
+    """Return a fresh dict of the hyper-parameters shared by every HYVideo-T/2 preset."""
+    return {
+        "mm_double_blocks_depth": 20,
+        "mm_single_blocks_depth": 40,
+        # Sums to hidden_size // heads_num (3072 // 24 == 128).
+        "rope_dim_list": [16, 56, 56],
+        "hidden_size": 3072,
+        "heads_num": 24,
+        "mlp_width_ratio": 4,
+    }
+
+
+# Architecture presets keyed by model name; each preset gets its own dict and list objects.
+HUNYUAN_VIDEO_CONFIG = {
+    "HYVideo-T/2": _hyvideo_t2_base_config(),
+    # Same architecture, with the guidance embedding switched on.
+    "HYVideo-T/2-cfgdistill": {**_hyvideo_t2_base_config(), "guidance_embed": True},
+}
+
+
+def load_dit_model(text_states_dim, text_states_dim_2, in_channels, out_channels, factor_kwargs):
+    """Build a HunyuanVideo diffusion transformer.
+
+    NOTE: the "HYVideo-T/2-cfgdistill" preset is always used; no other preset can be selected here.
+
+    Args:
+        text_states_dim (int): text state dimension
+        text_states_dim_2 (int): text state dimension 2
+        in_channels (int): input channels number
+        out_channels (int): output channels number
+        factor_kwargs (dict): extra keyword arguments forwarded to the model constructor
+
+    Returns:
+        model (nn.Module): The hunyuan video model
+    """
+    preset = HUNYUAN_VIDEO_CONFIG["HYVideo-T/2-cfgdistill"]
+    return HYVideoDiffusionTransformer(
+        text_states_dim=text_states_dim,
+        text_states_dim_2=text_states_dim_2,
+        in_channels=in_channels,
+        out_channels=out_channels,
+        **preset,
+        **factor_kwargs,
+    )
+
+
+def load_state_dict(model, model_path):
+    """Load the weights stored under the "module" key of a ``torch.save`` checkpoint into ``model``.
+
+    Args:
+        model: module whose ``load_state_dict`` is called with ``strict=True, assign=True``.
+        model_path: path of the checkpoint file, read with ``weights_only=True``.
+
+    Returns:
+        The same ``model`` object.
+
+    Raises:
+        KeyError: if the checkpoint has no "module" entry.
+    """
+    checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage, weights_only=True)
+
+    load_key = "module"
+    if load_key not in checkpoint:
+        raise KeyError(
+            f"Missing key: `{load_key}` in the checkpoint: {model_path}. The keys in the checkpoint "
+            f"are: {list(checkpoint.keys())}."
+        )
+
+    model.load_state_dict(checkpoint[load_key], strict=True, assign=True)
+    return model
+
+
+def load_transformer(dit_path, attn_mode, split_attn, device, dtype, in_channels=16) -> HYVideoDiffusionTransformer:
+    """Build the transformer and load its weights from ``dit_path``.
+
+    Args:
+        dit_path: checkpoint path; a ".safetensors" extension selects the safetensors
+            reader, anything else goes through ``load_state_dict``.
+        attn_mode: forwarded to the model constructor.
+        split_attn: forwarded to the model constructor.
+        device: forwarded to the model constructor; safetensors tensors are also moved to it.
+        dtype: forwarded to the model constructor; safetensors tensors are also cast to it.
+        in_channels: number of input channels of the model.
+
+    Returns:
+        The transformer with the loaded weights.
+    """
+    latent_channels = 16
+    factor_kwargs = {"device": device, "dtype": dtype, "attn_mode": attn_mode, "split_attn": split_attn}
+
+    # =========================== Build main model ===========================
+    with accelerate.init_empty_weights():
+        transformer = load_dit_model(
+            text_states_dim=4096,
+            text_states_dim_2=768,
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            factor_kwargs=factor_kwargs,
+        )
+
+    _, extension = os.path.splitext(dit_path)
+    if extension != ".safetensors":
+        return load_state_dict(transformer, dit_path)
+
+    # loading safetensors: may be already fp8
+    # TODO support comfy model (keys starting with "model.model." would need converting)
+    with MemoryEfficientSafeOpen(dit_path) as f:
+        state_dict = {key: f.get_tensor(key).to(device=device, dtype=dtype) for key in f.keys()}
+    transformer.load_state_dict(state_dict, strict=True, assign=True)
+    return transformer
+
+
+def get_rotary_pos_embed_by_shape(model, latents_size):
+    """Build the rotary position-embedding tables for a latent grid.
+
+    Args:
+        model: object providing ``patch_size`` (an int, or a list/tuple with one entry
+            per latent axis), ``hidden_size``, ``heads_num`` and ``rope_dim_list``.
+        latents_size: latent size along each axis; every entry must be divisible by
+            the matching patch size.
+
+    Returns:
+        The ``(freqs_cos, freqs_sin)`` pair produced by ``get_nd_rotary_pos_embed``.
+
+    Raises:
+        TypeError: if ``model.patch_size`` is neither an int nor a list/tuple.
+        AssertionError: if a latent size is not divisible by its patch size, or if
+            ``rope_dim_list`` does not sum to the attention head dimension.
+    """
+    target_ndim = 3
+    ndim = 5 - 2
+
+    patch_size = model.patch_size
+    if isinstance(patch_size, int):
+        per_axis_patch = [patch_size] * len(latents_size)
+    elif isinstance(patch_size, (list, tuple)):
+        per_axis_patch = list(patch_size)
+    else:
+        # Previously this case fell through and failed later on an unbound local.
+        raise TypeError(f"patch_size must be an int or a list/tuple of ints, but got {type(patch_size).__name__}.")
+
+    assert all(s % per_axis_patch[idx] == 0 for idx, s in enumerate(latents_size)), (
+        f"Latent size(last {ndim} dimensions) should be divisible by patch size({model.patch_size}), "
+        f"but got {latents_size}."
+    )
+    rope_sizes = [s // per_axis_patch[idx] for idx, s in enumerate(latents_size)]
+
+    if len(rope_sizes) != target_ndim:
+        rope_sizes = [1] * (target_ndim - len(rope_sizes)) + rope_sizes  # time axis
+    head_dim = model.hidden_size // model.heads_num
+    rope_dim_list = model.rope_dim_list
+    if rope_dim_list is None:
+        # Default: split the head dimension evenly across the axes.
+        rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
+    assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal to head_dim of attention layer"
+
+    rope_theta = 256
+    freqs_cos, freqs_sin = get_nd_rotary_pos_embed(
+        rope_dim_list, rope_sizes, theta=rope_theta, use_real=True, theta_rescale_factor=1
+    )
+    return freqs_cos, freqs_sin
+
+
+def get_rotary_pos_embed(vae_name, model, video_length, height, width):
+    """Derive the latent grid size from the VAE name and build rotary embeddings for it.
+
+    A VAE name containing "884" compresses the frame count as ``(n - 1) // 4 + 1``,
+    one containing "888" as ``(n - 1) // 8 + 1``; otherwise the frame count is used
+    unchanged. Height and width are always divided by 8.
+    """
+    if "884" in vae_name:
+        latent_frames = (video_length - 1) // 4 + 1
+    elif "888" in vae_name:
+        latent_frames = (video_length - 1) // 8 + 1
+    else:
+        latent_frames = video_length
+
+    return get_rotary_pos_embed_by_shape(model, [latent_frames, height // 8, width // 8])
diff --git a/hunyuan_model/modulate_layers.py b/hunyuan_model/modulate_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..93a57c6d2fdc0fca9bf44aeee6996bf1d8a05901
--- /dev/null
+++ b/hunyuan_model/modulate_layers.py
@@ -0,0 +1,76 @@
+from typing import Callable
+
+import torch
+import torch.nn as nn
+
+
+class ModulateDiT(nn.Module):
+    """Modulation layer for DiT: an activation followed by a zero-initialized linear projection."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        factor: int,
+        act_layer: Callable,
+        dtype=None,
+        device=None,
+    ):
+        """
+        Args:
+            hidden_size: input feature size.
+            factor: the projection outputs ``factor * hidden_size`` features.
+            act_layer: zero-argument callable that builds the activation module.
+            dtype: dtype of the linear layer's parameters.
+            device: device of the linear layer's parameters.
+        """
+        super().__init__()
+        self.act = act_layer()
+        self.linear = nn.Linear(hidden_size, factor * hidden_size, bias=True, dtype=dtype, device=device)
+        # Zero-initialize the modulation
+        for param in (self.linear.weight, self.linear.bias):
+            nn.init.zeros_(param)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        activated = self.act(x)
+        return self.linear(activated)
+
+
+def modulate(x, shift=None, scale=None):
+    """Apply an optional scale and an optional shift to ``x``.
+
+    ``scale`` and ``shift`` get a new axis inserted at position 1 before being
+    combined with ``x``; the result is ``x * (1 + scale) + shift`` with whichever
+    terms are provided.
+
+    Args:
+        x (torch.Tensor): input tensor.
+        shift (torch.Tensor, optional): shift tensor. Defaults to None.
+        scale (torch.Tensor, optional): scale tensor. Defaults to None.
+
+    Returns:
+        torch.Tensor: the modulated tensor, or ``x`` itself when neither is given.
+    """
+    if scale is not None:
+        x = x * (1 + scale.unsqueeze(1))
+    if shift is not None:
+        x = x + shift.unsqueeze(1)
+    return x
+
+
+def apply_gate(x, gate=None, tanh=False):
+    """Multiply ``x`` by a gate that gets a new axis inserted at position 1.
+
+    Args:
+        x (torch.Tensor): input tensor.
+        gate (torch.Tensor, optional): gate tensor. Defaults to None.
+        tanh (bool, optional): whether to pass the gate through tanh first. Defaults to False.
+
+    Returns:
+        torch.Tensor: the gated tensor, or ``x`` itself when no gate is given.
+    """
+    if gate is None:
+        return x
+
+    expanded_gate = gate.unsqueeze(1)
+    return x * (expanded_gate.tanh() if tanh else expanded_gate)
+
+
+def ckpt_wrapper(module):
+    """Return a plain function that forwards its positional inputs to ``module``."""
+
+    def ckpt_forward(*inputs):
+        return module(*inputs)
+
+    return ckpt_forward
diff --git a/hunyuan_model/norm_layers.py b/hunyuan_model/norm_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..a53d167436b6971d3aabf5cfe51c0b9d6dfc022f
--- /dev/null
+++ b/hunyuan_model/norm_layers.py
@@ -0,0 +1,79 @@
+import torch
+import torch.nn as nn
+
+
+class RMSNorm(nn.Module):
+    """Root-mean-square normalization over the last dimension, with an optional learnable scale."""
+
+    def __init__(
+        self,
+        dim: int,
+        elementwise_affine=True,
+        eps: float = 1e-6,
+        device=None,
+        dtype=None,
+    ):
+        """
+        Initialize the RMSNorm normalization layer.
+
+        Args:
+            dim (int): The dimension of the input tensor.
+            elementwise_affine (bool, optional): Whether to create the learnable ``weight``. Default is True.
+            eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+            device: Device of the ``weight`` parameter.
+            dtype: Data type of the ``weight`` parameter.
+
+        Attributes:
+            eps (float): A small value added to the denominator for numerical stability.
+            weight (nn.Parameter): Learnable scaling parameter, initialized to ones; only
+                present when ``elementwise_affine`` is True.
+        """
+        super().__init__()
+        self.eps = eps
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.ones(dim, device=device, dtype=dtype))
+
+    def _norm(self, x):
+        """
+        Divide ``x`` by the root mean square of its last dimension.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            torch.Tensor: The normalized tensor.
+        """
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        return x * torch.rsqrt(mean_square + self.eps)
+
+    def forward(self, x):
+        """
+        Normalize ``x`` in float32, cast back to the input dtype and apply the scale if present.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            torch.Tensor: The output tensor after applying RMSNorm.
+        """
+        normed = self._norm(x.float()).type_as(x)
+        if not hasattr(self, "weight"):
+            return normed
+        # Cast the weight to the activation dtype (support fp8 weights).
+        return normed * self.weight.to(normed.dtype)
+
+
+def get_norm_layer(norm_layer):
+    """
+    Map a normalization name to its layer class.
+
+    Args:
+        norm_layer (str): "layer" for ``nn.LayerNorm`` or "rms" for ``RMSNorm``.
+
+    Returns:
+        norm_layer (nn.Module): The normalization layer class (not an instance).
+
+    Raises:
+        NotImplementedError: for any other name.
+    """
+    if norm_layer == "rms":
+        return RMSNorm
+    if norm_layer == "layer":
+        return nn.LayerNorm
+    raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
diff --git a/hunyuan_model/pipeline_hunyuan_video.py b/hunyuan_model/pipeline_hunyuan_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1293161e13a47ae7dcedfef2c55e3baefc655f4
--- /dev/null
+++ b/hunyuan_model/pipeline_hunyuan_video.py
@@ -0,0 +1,1100 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modified from diffusers==0.29.2
+#
+# ==============================================================================
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union, Tuple
+import torch
+import torch.distributed as dist
+import numpy as np
+from dataclasses import dataclass
+from packaging import version
+
+from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.utils import BaseOutput
+
+from ...constants import PRECISION_TO_TYPE
+from ...vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
+from ...text_encoder import TextEncoder
+from ...modules import HYVideoDiffusionTransformer
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """"""
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+
+    The guided prediction is rescaled to the per-sample standard deviation of the text prediction and then
+    blended with the unscaled guided prediction using `guidance_rescale` as the mixing weight.
+    """
+
+    def _std_over_non_batch_dims(tensor):
+        return tensor.std(dim=list(range(1, tensor.ndim)), keepdim=True)
+
+    std_text = _std_over_non_batch_dims(noise_pred_text)
+    std_cfg = _std_over_non_batch_dims(noise_cfg)
+
+    # rescale the results from guidance (fixes overexposure)
+    rescaled = noise_cfg * (std_text / std_cfg)
+
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
+
+
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Configure the scheduler through `set_timesteps` and return the resulting timestep schedule.
+
+    Custom `timesteps` or custom `sigmas` take precedence over `num_inference_steps`; any extra kwargs are
+    forwarded to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+
+    Raises:
+        ValueError: if both `timesteps` and `sigmas` are given, or if the scheduler's `set_timesteps` does not
+            accept the custom argument that was passed.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError(
+            "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
+        )
+
+    # (keyword of set_timesteps, custom values, noun used in the error message)
+    if timesteps is not None:
+        custom = ("timesteps", timesteps, "timestep")
+    elif sigmas is not None:
+        custom = ("sigmas", sigmas, "sigmas")
+    else:
+        custom = None
+
+    if custom is None:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        return scheduler.timesteps, num_inference_steps
+
+    arg_name, values, noun = custom
+    if arg_name not in inspect.signature(scheduler.set_timesteps).parameters:
+        raise ValueError(
+            f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+            f" {noun} schedules. Please check whether you are using the correct scheduler."
+        )
+
+    scheduler.set_timesteps(**{arg_name: values}, device=device, **kwargs)
+    schedule = scheduler.timesteps
+    return schedule, len(schedule)
+
+
+@dataclass
+class HunyuanVideoPipelineOutput(BaseOutput):
+    """Output container with a single ``videos`` field."""
+
+    # Videos as a torch tensor or a NumPy array.
+    # NOTE(review): presumably what the pipeline's call method returns - confirm, it is outside this view.
+    videos: Union[torch.Tensor, np.ndarray]
+
+
+class HunyuanVideoPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-video generation using HunyuanVideo.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`TextEncoder`]):
+            Frozen text-encoder.
+        text_encoder_2 ([`TextEncoder`]):
+            Frozen text-encoder_2.
+        transformer ([`HYVideoDiffusionTransformer`]):
+            A `HYVideoDiffusionTransformer` to denoise the encoded video latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = ["text_encoder_2"]
+    _exclude_from_cpu_offload = ["transformer"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: TextEncoder,
+        transformer: HYVideoDiffusionTransformer,
+        scheduler: KarrasDiffusionSchedulers,
+        text_encoder_2: Optional[TextEncoder] = None,
+        progress_bar_config: Dict[str, Any] = None,
+        args=None,
+    ):
+        """
+        Register the pipeline components and normalize outdated scheduler settings.
+
+        Args:
+            vae: VAE registered as ``self.vae``; its ``config.block_out_channels`` sets ``vae_scale_factor``.
+            text_encoder: primary text encoder.
+            transformer: denoising transformer.
+            scheduler: scheduler; its config is patched in place when ``steps_offset != 1``
+                or ``clip_sample is True`` (a deprecation warning is emitted in both cases).
+            text_encoder_2: optional second text encoder.
+            progress_bar_config: entries merged into ``self._progress_bar_config``.
+            args: stored unchanged as ``self.args``.
+        """
+        super().__init__()
+
+        # ==========================================================================================
+        if progress_bar_config is None:
+            progress_bar_config = {}
+        # Create the progress-bar config only if the base class has not already set one.
+        if not hasattr(self, "_progress_bar_config"):
+            self._progress_bar_config = {}
+        self._progress_bar_config.update(progress_bar_config)
+
+        self.args = args
+        # ==========================================================================================
+
+        # Outdated scheduler config: warn, then force steps_offset to 1.
+        if (
+            hasattr(scheduler.config, "steps_offset")
+            and scheduler.config.steps_offset != 1
+        ):
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate(
+                "steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False
+            )
+            # The config is replaced as a whole with a patched frozen copy.
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        # Outdated scheduler config: warn, then force clip_sample to False.
+        if (
+            hasattr(scheduler.config, "clip_sample")
+            and scheduler.config.clip_sample is True
+        ):
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate(
+                "clip_sample not set", "1.0.0", deprecation_message, standard_warn=False
+            )
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            transformer=transformer,
+            scheduler=scheduler,
+            text_encoder_2=text_encoder_2,
+        )
+        # 2 ** (number of VAE block_out_channels entries - 1).
+        # NOTE(review): presumably the VAE's spatial downsampling factor - confirm.
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+    def encode_prompt(
+        self,
+        prompt,
+        device,
+        num_videos_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+        text_encoder: Optional[TextEncoder] = None,
+        data_type: Optional[str] = "image",
+    ):
+        r"""
+        Encodes the prompt (and, for classifier free guidance, the negative prompt) into text encoder hidden
+        states, duplicated `num_videos_per_prompt` times along the batch dimension.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_videos_per_prompt (`int`):
+                number of videos that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            attention_mask (`torch.Tensor`, *optional*):
+                Attention mask that accompanies a pre-generated `prompt_embeds`. It is returned unchanged in that
+                case. When the embeddings are computed here, it is replaced by the mask reported by the text encoder.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_attention_mask (`torch.Tensor`, *optional*):
+                Attention mask that accompanies a pre-generated `negative_prompt_embeds`. It is returned unchanged
+                in that case. When the negative embeddings are computed here, it is replaced by the mask reported by
+                the text encoder.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings. Only the positive
+                prompt honours this argument; the negative prompt always uses the encoder's `hidden_state` output.
+            text_encoder (TextEncoder, *optional*):
+                Encoder wrapper used to tokenize (`text2tokens`) and encode (`encode`) the prompts. Defaults to
+                `self.text_encoder` when not given.
+            data_type (`str`, *optional*):
+                Forwarded as-is to `text_encoder.text2tokens` and `text_encoder.encode`. Defaults to `"image"`.
+
+        Returns:
+            `tuple`: `(prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask)`.
+            `negative_prompt_embeds` and `negative_attention_mask` are returned as passed in (possibly `None`)
+            when `do_classifier_free_guidance` is false.
+        """
+        if text_encoder is None:
+            text_encoder = self.text_encoder
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(text_encoder.model, lora_scale)
+            else:
+                scale_lora_layers(text_encoder.model, lora_scale)
+
+        # Batch size comes from the prompt when one is given, otherwise from the pre-computed embeddings.
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, text_encoder.tokenizer)
+
+            text_inputs = text_encoder.text2tokens(prompt, data_type=data_type)
+
+            if clip_skip is None:
+                prompt_outputs = text_encoder.encode(
+                    text_inputs, data_type=data_type, device=device
+                )
+                prompt_embeds = prompt_outputs.hidden_state
+            else:
+                prompt_outputs = text_encoder.encode(
+                    text_inputs,
+                    output_hidden_states=True,
+                    data_type=data_type,
+                    device=device,
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_outputs.hidden_states_list[-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = text_encoder.model.text_model.final_layer_norm(
+                    prompt_embeds
+                )
+
+            # The encoder's mask overrides any `attention_mask` argument on this path.
+            # NOTE(review): a caller-supplied `attention_mask` (pre-computed `prompt_embeds` path) is
+            # not repeated for `num_videos_per_prompt`; only the mask computed here is.
+            attention_mask = prompt_outputs.attention_mask
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(device)
+                bs_embed, seq_len = attention_mask.shape
+                attention_mask = attention_mask.repeat(1, num_videos_per_prompt)
+                attention_mask = attention_mask.view(
+                    bs_embed * num_videos_per_prompt, seq_len
+                )
+
+        # Pick the dtype the embeddings are cast to: text encoder first, then transformer, else keep as is.
+        if text_encoder is not None:
+            prompt_embeds_dtype = text_encoder.dtype
+        elif self.transformer is not None:
+            prompt_embeds_dtype = self.transformer.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        # 2-D embeddings are handled as (batch, features); anything else is unpacked as a 3-D tensor.
+        if prompt_embeds.ndim == 2:
+            bs_embed, _ = prompt_embeds.shape
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
+            prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, -1)
+        else:
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+            prompt_embeds = prompt_embeds.view(
+                bs_embed * num_videos_per_prompt, seq_len, -1
+            )
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                # No negative prompt given: use one empty string per batch item.
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(
+                    uncond_tokens, text_encoder.tokenizer
+                )
+
+            # max_length = prompt_embeds.shape[1]
+            uncond_input = text_encoder.text2tokens(uncond_tokens, data_type=data_type)
+
+            # `clip_skip` is not applied here: the negative prompt always uses `hidden_state`.
+            negative_prompt_outputs = text_encoder.encode(
+                uncond_input, data_type=data_type, device=device
+            )
+            negative_prompt_embeds = negative_prompt_outputs.hidden_state
+
+            negative_attention_mask = negative_prompt_outputs.attention_mask
+            if negative_attention_mask is not None:
+                negative_attention_mask = negative_attention_mask.to(device)
+                _, seq_len = negative_attention_mask.shape
+                negative_attention_mask = negative_attention_mask.repeat(
+                    1, num_videos_per_prompt
+                )
+                negative_attention_mask = negative_attention_mask.view(
+                    batch_size * num_videos_per_prompt, seq_len
+                )
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            # (`seq_len` is only used by the non-2-D branch below)
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(
+                dtype=prompt_embeds_dtype, device=device
+            )
+
+            if negative_prompt_embeds.ndim == 2:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(
+                    1, num_videos_per_prompt
+                )
+                negative_prompt_embeds = negative_prompt_embeds.view(
+                    batch_size * num_videos_per_prompt, -1
+                )
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(
+                    1, num_videos_per_prompt, 1
+                )
+                negative_prompt_embeds = negative_prompt_embeds.view(
+                    batch_size * num_videos_per_prompt, seq_len, -1
+                )
+
+        if text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                # NOTE(review): this also runs when `lora_scale` is None (nothing was scaled above);
+                # confirm `unscale_lora_layers` tolerates a `None` weight.
+                unscale_lora_layers(text_encoder.model, lora_scale)
+
+        return (
+            prompt_embeds,
+            negative_prompt_embeds,
+            attention_mask,
+            negative_attention_mask,
+        )
+
+    def decode_latents(self, latents, enable_tiling=True):
+        """
+        Decode latents with the VAE into a float32 CPU tensor with values in `[0, 1]`.
+
+        Deprecated: emits a deprecation notice on every call (removal announced for 1.0.0).
+
+        Args:
+            latents (`torch.Tensor`):
+                Latents to decode. They are multiplied by `1 / self.vae.config.scaling_factor` first.
+            enable_tiling (`bool`, *optional*, defaults to `True`):
+                When true, `self.vae.enable_tiling()` is called before decoding. Tiling is not switched
+                back off by this method.
+
+        Returns:
+            `torch.Tensor`: decoded output on CPU as float32. 4-D outputs are permuted with
+            `(0, 2, 3, 1)`; outputs of any other rank keep the layout returned by the VAE.
+        """
+        deprecate(
+            "decode_latents",
+            "1.0.0",
+            "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead",
+            standard_warn=False,
+        )
+
+        rescaled = 1 / self.vae.config.scaling_factor * latents
+        # Tiling only changes how the VAE decodes; the decode call itself is the same either way.
+        if enable_tiling:
+            self.vae.enable_tiling()
+        decoded = self.vae.decode(rescaled, return_dict=False)[0]
+
+        # Map from [-1, 1] to [0, 1].
+        decoded = (decoded / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        decoded = decoded.cpu()
+        if decoded.ndim == 4:
+            decoded = decoded.permute(0, 2, 3, 1)
+        return decoded.float()
+
+    def prepare_extra_func_kwargs(self, func, kwargs):
+        """
+        Keep only the entries of `kwargs` whose key is a parameter name of `func`.
+
+        Not all schedulers share the same signatures (for example, `eta` (η) from the DDIM paper,
+        https://arxiv.org/abs/2010.02502, is only accepted by some `step` implementations), so optional
+        arguments are filtered against the callee's signature before being forwarded.
+
+        Args:
+            func (`Callable`):
+                The callable whose signature is inspected.
+            kwargs (`dict`):
+                Candidate keyword arguments.
+
+        Returns:
+            `dict`: a new dict with the accepted entries, in the iteration order of `kwargs`. Keys are
+            matched against declared parameter names only, so a `**kwargs` catch-all in `func` does not
+            make arbitrary keys acceptable.
+        """
+        # Nothing to filter: return early without inspecting `func` at all.
+        if not kwargs:
+            return {}
+
+        # The signature is the same for every key, so inspect it once instead of once per key.
+        accepted = inspect.signature(func).parameters
+        return {name: value for name, value in kwargs.items() if name in accepted}
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        video_length,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        vae_ver="88-4c-sd",
+    ):
+        """
+        Validate the arguments of a generation call and raise `ValueError` on the first problem found.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                Text prompt(s). Exactly one of `prompt` and `prompt_embeds` must be given.
+            height (`int`), width (`int`):
+                Output size in pixels; both must be divisible by 8.
+            video_length (`int`, *optional*):
+                Number of frames. When `vae_ver` contains `"884"` it must be of the form `4 * n + 1`, when it
+                contains `"888"` of the form `8 * n + 1`. Any other `vae_ver` imposes no constraint.
+            callback_steps (`int`, *optional*):
+                Must be a positive integer when given.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                Must not be combined with `negative_prompt_embeds`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-computed prompt embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-computed negative embeddings; must match the shape of `prompt_embeds` when both are given.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                Every entry must be listed in `self._callback_tensor_inputs`.
+            vae_ver (`str`, *optional*, defaults to `"88-4c-sd"`):
+                VAE version string used to select the `video_length` rule.
+
+        Raises:
+            ValueError: if any of the constraints above is violated.
+        """
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+
+        if video_length is not None:
+            # Pick the frame-count step implied by the VAE version, if any.
+            if "884" in vae_ver:
+                temporal_factor = 4
+            elif "888" in vae_ver:
+                temporal_factor = 8
+            else:
+                temporal_factor = None
+
+            # Valid lengths are 1, k + 1, 2k + 1, ... — the message states that rule exactly
+            # (the previous wording, "a multiple of k", did not match the check).
+            if (
+                temporal_factor is not None
+                and (video_length - 1) % temporal_factor != 0
+            ):
+                raise ValueError(
+                    f"`video_length` has to be of the form {temporal_factor} * n + 1"
+                    f" (e.g. 1, {temporal_factor + 1}, {2 * temporal_factor + 1}) but is {video_length}."
+                )
+
+        if callback_steps is not None and (
+            not isinstance(callback_steps, int) or callback_steps <= 0
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None:
+            unknown_inputs = [
+                k
+                for k in callback_on_step_end_tensor_inputs
+                if k not in self._callback_tensor_inputs
+            ]
+            if unknown_inputs:
+                raise ValueError(
+                    f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {unknown_inputs}"
+                )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and not isinstance(prompt, (str, list)):
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        video_length,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        """
+        Return the initial latents for the denoising loop.
+
+        When `latents` is `None`, noise of shape
+        `(batch_size, num_channels_latents, video_length, height // vae_scale_factor, width // vae_scale_factor)`
+        is drawn with `randn_tensor`; otherwise the given tensor is moved to `device` (its dtype and shape
+        are left untouched). The result is multiplied by `scheduler.init_noise_sigma` when the scheduler
+        defines that attribute.
+
+        Raises:
+            ValueError: if `generator` is a list whose length differs from `batch_size`.
+        """
+        latent_height = int(height) // self.vae_scale_factor
+        latent_width = int(width) // self.vae_scale_factor
+        noise_shape = (
+            batch_size,
+            num_channels_latents,
+            video_length,
+            latent_height,
+            latent_width,
+        )
+
+        generator_count_mismatch = (
+            isinstance(generator, list) and len(generator) != batch_size
+        )
+        if generator_count_mismatch:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is not None:
+            latents = latents.to(device)
+        else:
+            latents = randn_tensor(
+                noise_shape, generator=generator, device=device, dtype=dtype
+            )
+
+        # Check existence to make it compatible with FlowMatchEulerDiscreteScheduler
+        if not hasattr(self.scheduler, "init_noise_sigma"):
+            return latents
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        return latents * self.scheduler.init_noise_sigma
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self,
+        w: torch.Tensor,
+        embedding_dim: int = 512,
+        dtype: torch.dtype = torch.float32,
+    ) -> torch.Tensor:
+        """
+        Build sinusoidal (sin/cos) embeddings of guidance-scale values.
+
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                1-D tensor of guidance scales; the embeddings are later used to enrich timestep embeddings.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Dimension of the embeddings to generate. Odd values are zero-padded by one column.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+        """
+        assert len(w.shape) == 1
+        scaled_w = w * 1000.0
+
+        num_freqs = embedding_dim // 2
+        # Frequencies are spaced evenly in log-space between 1 and 1/10000.
+        log_step = torch.log(torch.tensor(10000.0)) / (num_freqs - 1)
+        freqs = torch.exp(torch.arange(num_freqs, dtype=dtype) * -log_step)
+
+        # Outer product: one row per guidance value, one column per frequency.
+        angles = scaled_w.to(dtype)[:, None] * freqs[None, :]
+        embedding = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
+
+        if embedding_dim % 2 == 1:  # zero pad
+            embedding = torch.nn.functional.pad(embedding, (0, 1))
+
+        assert embedding.shape == (w.shape[0], embedding_dim)
+        return embedding
+
+    @property
+    def guidance_scale(self):
+        """Value of `self._guidance_scale`, which `__call__` sets from its `guidance_scale` argument."""
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        """Value of `self._guidance_rescale`, which `__call__` sets from its `guidance_rescale` argument."""
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        """Value of `self._clip_skip`, which `__call__` sets from its `clip_skip` argument."""
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        """Whether classifier free guidance is active, i.e. the stored guidance scale is greater than 1."""
+        # return self._guidance_scale > 1 and self.transformer.config.time_cond_proj_dim is None
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        """Value of `self._cross_attention_kwargs`, which `__call__` sets from its argument of the same name."""
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        """Value of `self._num_timesteps`, which `__call__` sets to the number of scheduler timesteps."""
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        """Value of `self._interrupt`; `__call__` resets it to `False` at the start of a run."""
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int,
+        width: int,
+        video_length: int,
+        data_type: str = "video",
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[
+                Callable[[int, int, Dict], None],
+                PipelineCallback,
+                MultiPipelineCallbacks,
+            ]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
+        vae_ver: str = "88-4c-sd",
+        enable_tiling: bool = False,
+        n_tokens: Optional[int] = None,
+        embedded_guidance_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            height (`int`):
+                The height in pixels of the generated image.
+            width (`int`):
+                The width in pixels of the generated image.
+            video_length (`int`):
+                The number of frames in the generated video.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+                
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~HunyuanVideoPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`HunyuanVideoPipelineOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is a list with the generated images and the
+                second element is a list of `bool`s indicating whether the corresponding generated image contains
+                "not-safe-for-work" (nsfw) content.
+        """
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 0. Default height and width to unet
+        # height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        # width = width or self.transformer.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            video_length,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs,
+            vae_ver=vae_ver,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = torch.device(f"cuda:{dist.get_rank()}") if dist.is_initialized() else self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None)
+            if self.cross_attention_kwargs is not None
+            else None
+        )
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_mask,
+            negative_prompt_mask,
+        ) = self.encode_prompt(
+            prompt,
+            device,
+            num_videos_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            attention_mask=attention_mask,
+            negative_prompt_embeds=negative_prompt_embeds,
+            negative_attention_mask=negative_attention_mask,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+            data_type=data_type,
+        )
+        if self.text_encoder_2 is not None:
+            (
+                prompt_embeds_2,
+                negative_prompt_embeds_2,
+                prompt_mask_2,
+                negative_prompt_mask_2,
+            ) = self.encode_prompt(
+                prompt,
+                device,
+                num_videos_per_prompt,
+                self.do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=None,
+                attention_mask=None,
+                negative_prompt_embeds=None,
+                negative_attention_mask=None,
+                lora_scale=lora_scale,
+                clip_skip=self.clip_skip,
+                text_encoder=self.text_encoder_2,
+                data_type=data_type,
+            )
+        else:
+            prompt_embeds_2 = None
+            negative_prompt_embeds_2 = None
+            prompt_mask_2 = None
+            negative_prompt_mask_2 = None
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            if prompt_mask is not None:
+                prompt_mask = torch.cat([negative_prompt_mask, prompt_mask])
+            if prompt_embeds_2 is not None:
+                prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
+            if prompt_mask_2 is not None:
+                prompt_mask_2 = torch.cat([negative_prompt_mask_2, prompt_mask_2])
+
+
+        # 4. Prepare timesteps
+        extra_set_timesteps_kwargs = self.prepare_extra_func_kwargs(
+            self.scheduler.set_timesteps, {"n_tokens": n_tokens}
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            sigmas,
+            **extra_set_timesteps_kwargs,
+        )
+
+        if "884" in vae_ver:
+            video_length = (video_length - 1) // 4 + 1
+        elif "888" in vae_ver:
+            video_length = (video_length - 1) // 8 + 1
+        else:
+            video_length = video_length
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            video_length,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_func_kwargs(
+            self.scheduler.step,
+            {"generator": generator, "eta": eta},
+        )
+
+        target_dtype = PRECISION_TO_TYPE[self.args.precision]
+        autocast_enabled = (
+            target_dtype != torch.float32
+        ) and not self.args.disable_autocast
+        vae_dtype = PRECISION_TO_TYPE[self.args.vae_precision]
+        vae_autocast_enabled = (
+            vae_dtype != torch.float32
+        ) and not self.args.disable_autocast
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+
+        # if is_progress_bar:
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = (
+                    torch.cat([latents] * 2)
+                    if self.do_classifier_free_guidance
+                    else latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
+
+                t_expand = t.repeat(latent_model_input.shape[0])
+                guidance_expand = (
+                    torch.tensor(
+                        [embedded_guidance_scale] * latent_model_input.shape[0],
+                        dtype=torch.float32,
+                        device=device,
+                    ).to(target_dtype)
+                    * 1000.0
+                    if embedded_guidance_scale is not None
+                    else None
+                )
+
+                # predict the noise residual
+                with torch.autocast(
+                    device_type="cuda", dtype=target_dtype, enabled=autocast_enabled
+                ):
+                    noise_pred = self.transformer(  # For an input image (129, 192, 336) (1, 256, 256)
+                        latent_model_input,  # [2, 16, 33, 24, 42]
+                        t_expand,  # [2]
+                        text_states=prompt_embeds,  # [2, 256, 4096]
+                        text_mask=prompt_mask,  # [2, 256]
+                        text_states_2=prompt_embeds_2,  # [2, 768]
+                        freqs_cos=freqs_cis[0],  # [seqlen, head_dim]
+                        freqs_sin=freqs_cis[1],  # [seqlen, head_dim]
+                        guidance=guidance_expand,
+                        return_dict=True,
+                    )[
+                        "x"
+                    ]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(
+                        noise_pred,
+                        noise_pred_text,
+                        guidance_rescale=self.guidance_rescale,
+                    )
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs, return_dict=False
+                )[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop(
+                        "negative_prompt_embeds", negative_prompt_embeds
+                    )
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
+                    if progress_bar is not None:
+                        progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
+        if not output_type == "latent":
+            expand_temporal_dim = False
+            if len(latents.shape) == 4:
+                if isinstance(self.vae, AutoencoderKLCausal3D):
+                    latents = latents.unsqueeze(2)
+                    expand_temporal_dim = True
+            elif len(latents.shape) == 5:
+                pass
+            else:
+                raise ValueError(
+                    f"Only support latents with shape (b, c, h, w) or (b, c, f, h, w), but got {latents.shape}."
+                )
+
+            if (
+                hasattr(self.vae.config, "shift_factor")
+                and self.vae.config.shift_factor
+            ):
+                latents = (
+                    latents / self.vae.config.scaling_factor
+                    + self.vae.config.shift_factor
+                )
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            with torch.autocast(
+                device_type="cuda", dtype=vae_dtype, enabled=vae_autocast_enabled
+            ):
+                if enable_tiling:
+                    self.vae.enable_tiling()
+                    image = self.vae.decode(
+                        latents, return_dict=False, generator=generator
+                    )[0]
+                else:
+                    image = self.vae.decode(
+                        latents, return_dict=False, generator=generator
+                    )[0]
+
+            if expand_temporal_dim or image.shape[2] == 1:
+                image = image.squeeze(2)
+
+        else:
+            image = latents
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().float()
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return image
+
+        return HunyuanVideoPipelineOutput(videos=image)
diff --git a/hunyuan_model/posemb_layers.py b/hunyuan_model/posemb_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfce82c690540d17a55a51b7997ee7ceb0bdbf44
--- /dev/null
+++ b/hunyuan_model/posemb_layers.py
@@ -0,0 +1,310 @@
+import torch
+from typing import Union, Tuple, List
+
+
+def _to_tuple(x, dim=2):
+    """Return `x` as a length-`dim` sequence: an int is repeated `dim` times, a sequence must already have `dim` items."""
+    if isinstance(x, int):
+        return (x,) * dim
+    if len(x) != dim:
+        raise ValueError(f"Expected length {dim} or int, but got {x}")
+    return x
+
+
+def get_meshgrid_nd(start, *args, dim=2):
+    """
+    Build an n-D coordinate grid, mirroring ``np.linspace(start, stop, num, endpoint=False)`` per axis.
+
+    The positional arguments are interpreted by count:
+        * ``get_meshgrid_nd(num)``: ``start`` is the grid size; each axis spans [0, num) with step 1.
+        * ``get_meshgrid_nd(start, stop)``: each axis spans [start, stop) with step 1.
+        * ``get_meshgrid_nd(start, stop, num)``: each axis holds ``num`` evenly spaced points in [start, stop).
+
+    Args:
+        start (int or tuple): See above. Every start/stop/num value is either an int (shared by all axes)
+            or a length-``dim`` sequence giving one value per axis, in axis order.
+        *args: Optional ``stop`` and ``num``, see above.
+        dim (int): Dimension of the meshgrid. Defaults to 2.
+
+    Returns:
+        grid (torch.Tensor): float32 tensor of shape [dim, *num]; ``grid[i]`` holds the coordinate along axis i.
+
+    Raises:
+        ValueError: If more than two extra positional arguments are given, or a sequence has the wrong length.
+    """
+    n_extra = len(args)
+    if n_extra > 2:
+        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
+
+    if n_extra == 0:
+        # A single argument is the grid size; the grid starts at the origin.
+        counts = _to_tuple(start, dim=dim)
+        lows, highs = (0,) * dim, counts
+    else:
+        lows = _to_tuple(start, dim=dim)  # Left-Top       eg: 12,0
+        highs = _to_tuple(args[0], dim=dim)  # Right-Bottom   eg: 20,32
+        if n_extra == 2:
+            counts = _to_tuple(args[1], dim=dim)  # Target Size    eg: 32,124
+        else:
+            # No explicit count: unit step between start and stop.
+            counts = [hi - lo for lo, hi in zip(lows, highs)]
+
+    # linspace over n + 1 points with the last one dropped == np.linspace(..., endpoint=False).
+    axes = [torch.linspace(lo, hi, n + 1, dtype=torch.float32)[:n] for lo, hi, n in zip(lows, highs, counts)]
+    mesh = torch.meshgrid(*axes, indexing="ij")  # dim x [W, H, D]
+
+    return torch.stack(mesh, dim=0)  # [dim, W, H, D]
+
+
+#################################################################################
+#                   Rotary Positional Embedding Functions                       #
+#################################################################################
+# https://github.com/meta-llama/llama/blob/be327c427cc5e89cc1d3ab3d3fec4484df771245/llama/model.py#L80
+
+
+def reshape_for_broadcast(
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
+    x: torch.Tensor,
+    head_first=False,
+):
+    """
+    View the rotary frequency tensor(s) with singleton dimensions so they broadcast against 'x'.
+
+    The frequencies are 2-D, indexed by (sequence position, feature). They are viewed with the rank of 'x',
+    keeping the sequence and feature extents of 'x' and using size 1 for every other dimension, so that
+    element-wise operations with 'x' broadcast over the remaining (batch / head) dimensions.
+    E.g. with head_first=False, 'x' of shape [B, S, H, D] and frequencies of shape [S, D] give views of
+    shape [1, S, 1, D].
+
+    Notes:
+        When using FlashMHAModified, head_first should be False.
+        When using Attention, head_first should be True.
+
+    Args:
+        freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped. Either one
+            tensor (values in complex space) or a (cos, sin) tuple in real space. For a tuple only the
+            shape of the first element is checked; both elements are viewed with the same target shape.
+        x (torch.Tensor): Target tensor for broadcasting compatibility. Must have at least 2 dimensions.
+        head_first (bool): head dimension first (except batch dim) or not. If True the sequence axis of 'x'
+            is taken to be dim -2, otherwise dim 1. The feature axis is always the last dim.
+
+    Returns:
+        torch.Tensor or Tuple[torch.Tensor, torch.Tensor]: The reshaped frequency tensor, or the reshaped
+        (cos, sin) pair when a tuple was passed in.
+
+    Raises:
+        AssertionError: If the frequency tensor doesn't match the expected shape.
+        AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
+    """
+    ndim = x.ndim
+    assert ndim > 1
+
+    # cos/sin pairs and complex tensors share one layout, so validate a single representative tensor.
+    is_pair = isinstance(freqs_cis, tuple)
+    ref = freqs_cis[0] if is_pair else freqs_cis
+
+    # Axes of 'x' whose extent is kept in the broadcast view.
+    if head_first:
+        seq_axis = ndim - 2
+    else:
+        seq_axis = 1
+    feat_axis = ndim - 1
+
+    assert ref.shape == (
+        x.shape[seq_axis],
+        x.shape[feat_axis],
+    ), f"freqs_cis shape {ref.shape} does not match x shape {x.shape}"
+
+    # Every other dimension collapses to 1 so that it broadcasts.
+    shape = [
+        d if i == seq_axis or i == feat_axis else 1
+        for i, d in enumerate(x.shape)
+    ]
+
+    if is_pair:
+        # freqs_cis: (cos, sin) in real space
+        cos, sin = freqs_cis[0], freqs_cis[1]
+        return cos.view(*shape), sin.view(*shape)
+
+    # freqs_cis: values in complex space
+    return freqs_cis.view(*shape)
+
+
+def rotate_half(x):
+    """Map each adjacent pair (a, b) of the last dim to (-b, a); the result is float32 with the same shape as `x`."""
+    # Split the last dim into (D//2, 2) pairs; flatten(-2) instead of a fixed index keeps this rank-agnostic.
+    x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [..., D//2]
+    return torch.stack([-x_imag, x_real], dim=-1).flatten(-2)
+
+
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+    head_first: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Rotate the query and key tensors by precomputed rotary position frequencies.
+
+    Two encodings of the frequencies are accepted:
+        * a ``(cos, sin)`` tuple of real tensors, applied as ``x * cos + rotate_half(x) * sin``;
+        * a single complex tensor, applied by viewing adjacent feature pairs of ``x`` as complex
+          numbers and multiplying.
+    In both cases the inputs are upcast with ``.float()`` and each result is cast back with ``type_as``.
+
+    Args:
+        xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
+        xk (torch.Tensor): Key tensor to apply rotary embeddings.   [B, S, H, D]
+        freqs_cis (torch.Tensor or tuple): Precomputed frequencies; must pass the shape check of
+            ``reshape_for_broadcast`` against ``xq``. The same frequencies are reused for ``xk``.
+        head_first (bool): head dimension first (except batch dim) or not.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+    """
+    if isinstance(freqs_cis, tuple):
+        cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first)  # [S, D]
+        cos = cos.to(xq.device)
+        sin = sin.to(xq.device)
+
+        def _rotate_real(t):
+            # real * cos - imag * sin
+            # imag * cos + real * sin
+            t32 = t.float()
+            return (t32 * cos + rotate_half(t32) * sin).type_as(t)
+
+        return _rotate_real(xq), _rotate_real(xk)
+
+    def _as_complex(t):
+        # view_as_complex will pack [..., D/2, 2](real) to [..., D/2](complex)
+        return torch.view_as_complex(t.float().reshape(*t.shape[:-1], -1, 2))  # [B, S, H, D//2]
+
+    xq_c = _as_complex(xq)
+    rot = reshape_for_broadcast(freqs_cis, xq_c, head_first).to(xq.device)  # [S, D//2] --> [1, S, 1, D//2]
+    # (real, imag) * (cos, sin) = (real * cos - imag * sin, imag * cos + real * sin)
+    # view_as_real will expand [..., D/2](complex) to [..., D/2, 2](real)
+    xq_out = torch.view_as_real(xq_c * rot).flatten(3).type_as(xq)
+    xk_out = torch.view_as_real(_as_complex(xk) * rot).flatten(3).type_as(xk)
+    return xq_out, xk_out
+
+
+def get_nd_rotary_pos_embed(
+    rope_dim_list,
+    start,
+    *args,
+    theta=10000.0,
+    use_real=False,
+    theta_rescale_factor: Union[float, List[float]] = 1.0,
+    interpolation_factor: Union[float, List[float]] = 1.0,
+):
+    """
+    This is a n-d version of precompute_freqs_cis, which is a RoPE for tokens with n-d structure.
+
+    A position meshgrid is built with ``get_meshgrid_nd``; every grid axis is then encoded independently by
+    ``get_1d_rotary_pos_embed`` and the per-axis results are concatenated along the feature dimension.
+
+    Args:
+        rope_dim_list (list of int): Dimension of each rope. len(rope_dim_list) should equal to n.
+            sum(rope_dim_list) should equal to head_dim of attention layer.
+        start (int | tuple of int | list of int): If len(args) == 0, start is num; If len(args) == 1, start is start,
+            args[0] is stop, step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num.
+        *args: See above.
+        theta (float): Scaling factor for frequency computation. Defaults to 10000.0.
+        use_real (bool): If True, return real part and imaginary part separately. Otherwise, return complex numbers.
+            Some libraries such as TensorRT does not support complex64 data type. So it is useful to provide a real
+            part and an imaginary part separately.
+        theta_rescale_factor (float or list of float): Rescale factor for theta. A scalar or a one-element list is
+            shared by all axes; any other value must already hold one entry per axis. Defaults to 1.0.
+        interpolation_factor (float or list of float): Multiplier applied to the positions of each axis, with the
+            same scalar / one-element-list / per-axis convention as theta_rescale_factor. Defaults to 1.0.
+
+    Returns:
+        If use_real is True, a (cos, sin) tuple of tensors of shape [WHD, sum(rope_dim_list)];
+        otherwise a single tensor holding the per-axis embeddings concatenated along dim 1.
+    """
+    n_axes = len(rope_dim_list)
+    grid = get_meshgrid_nd(start, *args, dim=n_axes)  # [3, W, H, D] / [2, W, H]
+
+    def _per_axis(factor):
+        # A scalar, or a one-element list, applies to every axis; anything else is passed through.
+        if isinstance(factor, (int, float)):
+            return [factor] * n_axes
+        if isinstance(factor, list) and len(factor) == 1:
+            return [factor[0]] * n_axes
+        return factor
+
+    theta_rescale_factor = _per_axis(theta_rescale_factor)
+    assert len(theta_rescale_factor) == n_axes, "len(theta_rescale_factor) should equal to len(rope_dim_list)"
+    interpolation_factor = _per_axis(interpolation_factor)
+    assert len(interpolation_factor) == n_axes, "len(interpolation_factor) should equal to len(rope_dim_list)"
+
+    # use 1/ndim of dimensions to encode grid_axis
+    embs = [
+        get_1d_rotary_pos_embed(
+            axis_dim,
+            grid[axis].reshape(-1),
+            theta,
+            use_real=use_real,
+            theta_rescale_factor=theta_rescale_factor[axis],
+            interpolation_factor=interpolation_factor[axis],
+        )  # real: 2 x [WHD, rope_dim_list[axis]]; complex: [WHD, rope_dim_list[axis] // 2]
+        for axis, axis_dim in enumerate(rope_dim_list)
+    ]
+
+    if not use_real:
+        return torch.cat(embs, dim=1)  # complex, [WHD, sum(d // 2 for d in rope_dim_list)]
+    cos = torch.cat([emb[0] for emb in embs], dim=1)  # [WHD, sum(rope_dim_list)]
+    sin = torch.cat([emb[1] for emb in embs], dim=1)  # [WHD, sum(rope_dim_list)]
+    return cos, sin
+
+
+def get_1d_rotary_pos_embed(
+    dim: int,
+    pos: Union[torch.FloatTensor, int],
+    theta: float = 10000.0,
+    use_real: bool = False,
+    theta_rescale_factor: float = 1.0,
+    interpolation_factor: float = 1.0,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    """
+    Precompute the frequency tensor for complex exponential (cis) with given dimensions.
+    (Note: `cis` means `cos + i * sin`, where i is the imaginary unit.)
+
+    For position p and frequency index k (0 <= k < dim // 2) the rotation angle is
+    ``p * interpolation_factor / theta ** (2k / dim)``.
+
+    Args:
+        dim (int): Dimension of the frequency tensor.
+        pos (int or torch.FloatTensor): Position indices for the frequency tensor. [S] or scalar.
+            An int n is expanded to the float positions 0, 1, ..., n - 1.
+        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
+        use_real (bool, optional): If True, return real part and imaginary part separately.
+                                   Otherwise, return complex numbers.
+        theta_rescale_factor (float, optional): Rescale factor for theta. Defaults to 1.0. When it differs
+            from 1.0, theta is multiplied by ``theta_rescale_factor ** (dim / (dim - 2))``.
+        interpolation_factor (float, optional): Multiplier applied to the positions. Defaults to 1.0.
+
+    Returns:
+        freqs_cis: Precomputed frequency tensor with complex exponential. [S, D/2]
+        freqs_cos, freqs_sin: Precomputed frequency tensor with real and imaginary parts separately. [S, D]
+            Returned instead of freqs_cis when use_real is True; each angle is repeated twice along dim 1.
+
+    Raises:
+        ZeroDivisionError: If dim == 2 while theta_rescale_factor != 1.0 (the exponent divides by dim - 2).
+    """
+    positions = torch.arange(pos).float() if isinstance(pos, int) else pos
+
+    # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
+    # has some connection to NTK literature
+    if theta_rescale_factor != 1.0:
+        theta *= theta_rescale_factor ** (dim / (dim - 2))
+
+    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
+    inv_freq = 1.0 / (theta**exponents)  # [D/2]
+    angles = torch.outer(positions * interpolation_factor, inv_freq)  # [S, D/2]
+
+    if not use_real:
+        return torch.polar(torch.ones_like(angles), angles)  # complex, [S, D/2]
+    freqs_cos = angles.cos().repeat_interleave(2, dim=1)  # [S, D]
+    freqs_sin = angles.sin().repeat_interleave(2, dim=1)  # [S, D]
+    return freqs_cos, freqs_sin
diff --git a/hunyuan_model/text_encoder.py b/hunyuan_model/text_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b424c880c07d4344548b2e8bb01c397ad4d448a
--- /dev/null
+++ b/hunyuan_model/text_encoder.py
@@ -0,0 +1,710 @@
+from dataclasses import dataclass
+import json
+import os
+from typing import Optional, Tuple, Union
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+from transformers import (
+    CLIPTextModel,
+    CLIPTokenizer,
+    AutoTokenizer,
+    AutoModel,
+    CLIPConfig,
+    LlamaForCausalLM,
+    LlamaConfig,
+)
+from transformers.utils import ModelOutput
+from transformers.models.llama import LlamaModel
+from safetensors.torch import load_file
+from accelerate import init_empty_weights
+
+import logging
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)  # NOTE(review): configures the root logger as an import-time side effect
+
+
+CLIP_L_HUGGINGFACE_MODEL_ID = "openai/clip-vit-large-patch14"  # tokenizer fallback used by load_clip_l_tokenizer
+LLAVA_HUGGINGFACE_MODEL_ID = "xtuner/llava-llama-3-8b-v1_1-transformers"  # tokenizer fallback used by load_llm_tokenizer
+
+CLIP_CONFIG = {  # kwargs for CLIPConfig, used by load_clip_l when loading from a single weights file
+    "_name_or_path": "clip-vit-large-patch14/",
+    "architectures": ["CLIPModel"],
+    "initializer_factor": 1.0,
+    "logit_scale_init_value": 2.6592,
+    "model_type": "clip",
+    "projection_dim": 768,
+    # Former nested section "text_config" (appears flattened from a JSON config): a key repeated below replaces the one above.
+    "_name_or_path": "",  # replaces "clip-vit-large-patch14/" set above
+    "add_cross_attention": False,
+    "architectures": None,  # replaces ["CLIPModel"] set above
+    "attention_dropout": 0.0,
+    "bad_words_ids": None,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": None,
+    "decoder_start_token_id": None,
+    "diversity_penalty": 0.0,
+    "do_sample": False,
+    "dropout": 0.0,
+    "early_stopping": False,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "finetuning_task": None,
+    "forced_bos_token_id": None,
+    "forced_eos_token_id": None,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 768,
+    "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": False,
+    "is_encoder_decoder": False,
+    "label2id": {"LABEL_0": 0, "LABEL_1": 1},
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 77,
+    "min_length": 0,
+    "model_type": "clip_text_model",  # replaces "clip" set above
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": False,
+    "output_hidden_states": False,
+    "output_scores": False,
+    "pad_token_id": 1,
+    "prefix": None,
+    "problem_type": None,
+    "projection_dim": 768,
+    "pruned_heads": {},
+    "remove_invalid_values": False,
+    "repetition_penalty": 1.0,
+    "return_dict": True,
+    "return_dict_in_generate": False,
+    "sep_token_id": None,
+    "task_specific_params": None,
+    "temperature": 1.0,
+    "tie_encoder_decoder": False,
+    "tie_word_embeddings": True,
+    "tokenizer_class": None,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": None,
+    "torchscript": False,
+    "transformers_version": "4.16.0.dev0",
+    "use_bfloat16": False,
+    "vocab_size": 49408,
+    # End of the former "text_config" section.
+    # Former nested section "text_config_dict": repeats keys already set above with the same values.
+    "hidden_size": 768,
+    "intermediate_size": 3072,
+    "num_attention_heads": 12,
+    "num_hidden_layers": 12,
+    "projection_dim": 768,
+    # End of the former "text_config_dict" section.
+    # Entries that are deliberately left commented out (not part of the dict):
+    #   "torch_dtype": "float32", "transformers_version": null
+}
+
+LLAMA_CONFIG = {  # kwargs for LlamaConfig, used by load_llm when loading from a single weights file
+    "architectures": ["LlamaForCausalLM"],
+    "attention_bias": False,
+    "attention_dropout": 0.0,
+    "bos_token_id": 128000,
+    "eos_token_id": 128001,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 14336,
+    "max_position_embeddings": 8192,
+    "mlp_bias": False,
+    "model_type": "llama",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 32,
+    "num_key_value_heads": 8,
+    "pretraining_tp": 1,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": None,
+    "rope_theta": 500000.0,
+    "tie_word_embeddings": False,
+    "torch_dtype": "float16",  # NOTE(review): load_llm also passes its own torch_dtype to _from_config — confirm which one wins
+    "transformers_version": "4.46.3",
+    "use_cache": True,
+    "vocab_size": 128320,
+}
+
+# When using decoder-only models, the prompt is wrapped in a chat-style template whose system message instructs
+# the text encoder on how to generate the text; "{}" marks where the user prompt goes (presumably via str.format).
+# NOTE(review): adjacent literals are joined without separators ("...video.2. The..."); do not reformat blindly, crop_start below likely depends on the exact text.
+PROMPT_TEMPLATE_ENCODE = (
+    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
+    "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
+    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+)
+PROMPT_TEMPLATE_ENCODE_VIDEO = (
+    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
+    "1. The main content and theme of the video."
+    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
+    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
+    "4. background environment, light, style and atmosphere."
+    "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
+    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+)
+
+NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
+
+PROMPT_TEMPLATE = {  # template name -> template string plus its crop_start value
+    "dit-llm-encode": {
+        "template": PROMPT_TEMPLATE_ENCODE,
+        "crop_start": 36,  # presumably the number of leading template tokens dropped from the encoder output — TODO confirm
+    },
+    "dit-llm-encode-video": {
+        "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
+        "crop_start": 95,  # presumably as above, for the longer video system message — TODO confirm
+    },
+}
+
+
+def use_default(value, default):
+    return default if value is None else value  # only None triggers the fallback; other falsy values are kept
+
+
+def load_clip_l(text_encoder_path: str, dtype: Optional[Union[str, torch.dtype]] = None):
+    """Load the CLIP-L text encoder from a model directory or from a single safetensors file.
+
+    A directory is handed to `CLIPTextModel.from_pretrained` (configs are in the directory). Any other
+    path is read with `load_file` into a model built from `CLIP_CONFIG`. `dtype` is passed as `torch_dtype`.
+    """
+    if os.path.isdir(text_encoder_path):
+        return CLIPTextModel.from_pretrained(text_encoder_path, torch_dtype=dtype)
+
+    # Single file: build the model under init_empty_weights(), then assign the loaded tensors (assign=True).
+    config = CLIPConfig(**CLIP_CONFIG)
+    with init_empty_weights():
+        text_encoder = CLIPTextModel._from_config(config, torch_dtype=dtype)
+
+    weights = load_file(text_encoder_path)
+    text_encoder.load_state_dict(weights, strict=True, assign=True)
+    return text_encoder
+
+
+def load_clip_l_tokenizer(tokenizer_path: str):
+    """Return the CLIP-L tokenizer (max_length=77) from a local directory, else from the Hugging Face model id."""
+    source = tokenizer_path
+    if not os.path.isdir(tokenizer_path):
+        # load from Hugging Face
+        logger.info(f"Loading tokenizer from Hugging Face: {CLIP_L_HUGGINGFACE_MODEL_ID}")
+        source = CLIP_L_HUGGINGFACE_MODEL_ID
+
+    return CLIPTokenizer.from_pretrained(source, max_length=77)
+
+
+def load_llm(text_encoder_path: str, dtype: Optional[Union[str, torch.dtype]] = None):
+    """Load the LLM text encoder from a model directory or from a single safetensors file.
+
+    A directory goes through `AutoModel.from_pretrained`; any other path is loaded into a `LlamaForCausalLM`
+    built from `LLAMA_CONFIG`. `dtype` is passed as `torch_dtype` in both cases.
+    """
+    if os.path.isdir(text_encoder_path):
+        # load from directory, configs are in the directory
+        return AutoModel.from_pretrained(text_encoder_path, low_cpu_mem_usage=True, torch_dtype=dtype)
+
+    # load from file, we create the model with the appropriate config
+    config = LlamaConfig(**LLAMA_CONFIG)
+    with init_empty_weights():
+        text_encoder = LlamaForCausalLM._from_config(config, torch_dtype=dtype)
+
+    weights = load_file(text_encoder_path)
+    weights.pop("tokenizer", None)  # support weights from ComfyUI
+    text_encoder.load_state_dict(weights, strict=True, assign=True)
+    return text_encoder
+
+
+def load_llm_tokenizer(tokenizer_path: str, padding_side="right"):
+    """Return the LLM tokenizer from a local directory, else from the Hugging Face model id.
+
+    `padding_side` is forwarded to `AutoTokenizer.from_pretrained` for both sources.
+    """
+    if not os.path.isdir(tokenizer_path):
+        logger.info(f"Loading tokenizer from Hugging Face: {LLAVA_HUGGINGFACE_MODEL_ID}")
+        tokenizer_path = LLAVA_HUGGINGFACE_MODEL_ID
+    return AutoTokenizer.from_pretrained(tokenizer_path, padding_side=padding_side)
+
+
+def load_text_encoder(
+    text_encoder_type: str,
+    text_encoder_path: str,
+    text_encoder_dtype: Optional[Union[str, torch.dtype]] = None,
+):
+    """Load a frozen, eval-mode text encoder and expose its final norm as `final_layer_norm`.
+
+    `text_encoder_type` is "clipL" or "llm" (anything else raises ValueError). `text_encoder_dtype`, when given,
+    is passed to the loader and the model is cast to it afterwards. Returns `(text_encoder, text_encoder_path)`.
+    """
+    logger.info(f"Loading text encoder model ({text_encoder_type}) from: {text_encoder_path}")
+    # reduce peak memory usage by specifying the dtype of the model
+    dtype = text_encoder_dtype
+    if text_encoder_type == "clipL":
+        text_encoder = load_clip_l(text_encoder_path, dtype=dtype)
+        text_encoder.final_layer_norm = text_encoder.text_model.final_layer_norm
+    elif text_encoder_type == "llm":
+        text_encoder = load_llm(text_encoder_path, dtype=dtype)
+        # `.norm` sits on the model itself when loaded by from_pretrained, under `.model` when built by _from_config.
+        text_encoder.final_layer_norm = text_encoder.norm if hasattr(text_encoder, "norm") else text_encoder.model.norm
+    else:
+        raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
+
+    if dtype is not None:
+        text_encoder = text_encoder.to(dtype=dtype)
+    text_encoder.requires_grad_(False)
+    text_encoder.eval()  # from_pretrained already does this; the single-file (_from_config) path does not
+    logger.info(f"Text encoder to dtype: {text_encoder.dtype}")
+    return text_encoder, text_encoder_path
+
+
+def load_tokenizer(tokenizer_type, tokenizer_path=None, padding_side="right"):
+    """Load the "clipL" or "llm" tokenizer and return `(tokenizer, tokenizer_path)`; `padding_side` only reaches the LLM loader."""
+    logger.info(f"Loading tokenizer ({tokenizer_type}) from: {tokenizer_path}")
+
+    if tokenizer_type not in ("clipL", "llm"):
+        raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")
+
+    if tokenizer_type == "llm":
+        return load_llm_tokenizer(tokenizer_path, padding_side=padding_side), tokenizer_path
+
+    return load_clip_l_tokenizer(tokenizer_path), tokenizer_path
+
+
+@dataclass
+class TextEncoderModelOutput(ModelOutput):
+    """
+    Output container of the text encoder: a hidden state plus an optional mask, per-layer states and decoded texts.
+
+    Args:
+        hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states returned by the encoder (presumably the last layer by default — TODO confirm).
+        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``.
+        hidden_states_list (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        text_outputs (`list`, *optional*, returned when `return_texts=True` is passed):
+            List of decoded texts.
+    """
+
+    hidden_state: torch.FloatTensor = None  # defaults to None despite the non-Optional annotation
+    attention_mask: Optional[torch.LongTensor] = None
+    hidden_states_list: Optional[Tuple[torch.FloatTensor, ...]] = None
+    text_outputs: Optional[list] = None
+
+
+class TextEncoder(nn.Module):
+    def __init__(  # validates the templates, picks the output key, then loads model and tokenizer
+        self,
+        text_encoder_type: str,
+        max_length: int,
+        text_encoder_dtype: Optional[Union[str, torch.dtype]] = None,
+        text_encoder_path: Optional[str] = None,
+        tokenizer_type: Optional[str] = None,  # falls back to text_encoder_type
+        tokenizer_path: Optional[str] = None,  # falls back to text_encoder_path
+        output_key: Optional[str] = None,  # falls back to a per-family default chosen below
+        use_attention_mask: bool = True,
+        input_max_length: Optional[int] = None,  # falls back to max_length
+        prompt_template: Optional[dict] = None,
+        prompt_template_video: Optional[dict] = None,
+        hidden_state_skip_layer: Optional[int] = None,
+        apply_final_norm: bool = False,
+        reproduce: bool = False,
+    ):
+        super().__init__()
+        self.text_encoder_type = text_encoder_type
+        self.max_length = max_length
+        # NOTE: no `precision` attribute is stored; the loaded dtype is kept in `self.dtype` below.
+        self.model_path = text_encoder_path
+        self.tokenizer_type = tokenizer_type if tokenizer_type is not None else text_encoder_type
+        self.tokenizer_path = tokenizer_path if tokenizer_path is not None else text_encoder_path
+        self.use_attention_mask = use_attention_mask
+        if prompt_template_video is not None:
+            assert use_attention_mask is True, "Attention mask is True required when training videos."
+        self.input_max_length = input_max_length if input_max_length is not None else max_length
+        self.prompt_template = prompt_template
+        self.prompt_template_video = prompt_template_video
+        self.hidden_state_skip_layer = hidden_state_skip_layer
+        self.apply_final_norm = apply_final_norm
+        self.reproduce = reproduce
+
+        self.use_template = self.prompt_template is not None  # driven by the image template only
+        if self.use_template:
+            assert (
+                isinstance(self.prompt_template, dict) and "template" in self.prompt_template
+            ), f"`prompt_template` must be a dictionary with a key 'template', got {self.prompt_template}"
+            assert "{}" in str(self.prompt_template["template"]), (
+                "`prompt_template['template']` must contain a placeholder `{}` for the input text, "
+                f"got {self.prompt_template['template']}"
+            )
+
+        self.use_video_template = self.prompt_template_video is not None
+        if self.use_video_template:
+            if self.prompt_template_video is not None:  # always true here: same condition as use_video_template
+                assert (
+                    isinstance(self.prompt_template_video, dict) and "template" in self.prompt_template_video
+                ), f"`prompt_template_video` must be a dictionary with a key 'template', got {self.prompt_template_video}"
+            assert "{}" in str(self.prompt_template_video["template"]), (
+                "`prompt_template_video['template']` must contain a placeholder `{}` for the input text, "
+                f"got {self.prompt_template_video['template']}"
+            )
+
+        if "t5" in text_encoder_type:  # substring match on the type name; an explicit output_key always wins
+            self.output_key = output_key or "last_hidden_state"
+        elif "clip" in text_encoder_type:
+            self.output_key = output_key or "pooler_output"
+        elif "llm" in text_encoder_type or "glm" in text_encoder_type:
+            self.output_key = output_key or "last_hidden_state"
+        else:
+            raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
+
+        self.model, self.model_path = load_text_encoder(
+            text_encoder_type=self.text_encoder_type, text_encoder_path=self.model_path, text_encoder_dtype=text_encoder_dtype
+        )
+        self.dtype = self.model.dtype  # plain attribute captured once here; later casts of the module do not update it
+
+        self.tokenizer, self.tokenizer_path = load_tokenizer(
+            tokenizer_type=self.tokenizer_type, tokenizer_path=self.tokenizer_path, padding_side="right"
+        )
+
+    def __repr__(self):  # `precision` is never assigned in __init__, so report the dtype captured at load time
+        return f"{self.text_encoder_type} ({self.dtype} - {self.model_path})"
+
+    @property
+    def device(self):  # read-only: the device reported by the wrapped model
+        return self.model.device
+
+    @staticmethod
+    def apply_text_to_template(text, template, prevent_empty_text=True):
+        """
+        Insert ``text`` into ``template`` via ``str.format``.
+
+        Args:
+            text (str): Input text.
+            template (str): Template string; any other type raises ``TypeError``.
+            prevent_empty_text (bool): Currently unused; the value is ignored by this
+                implementation. Defaults to True.
+        """
+        if isinstance(template, str):
+            # Will send string to tokenizer. Used for llm
+            return template.format(text)
+        else:
+            raise TypeError(f"Unsupported template type: {type(template)}")
+
+    def text2tokens(self, text, data_type="image"):
+        """
+        Tokenize the input text (wrapped in the image/video prompt template when one is set).
+
+        Args:
+            text (str or list): Input text; ``data_type`` ("image" or "video") selects the template.
+        """
+        tokenize_input_type = "str"  # becomes "list" only if templating yields chat-style lists
+        if self.use_template:
+            if data_type == "image":
+                prompt_template = self.prompt_template["template"]
+            elif data_type == "video":
+                prompt_template = self.prompt_template_video["template"]  # NOTE(review): fails if no video template was given
+            else:
+                raise ValueError(f"Unsupported data type: {data_type}")
+            if isinstance(text, (list, tuple)):
+                text = [self.apply_text_to_template(one_text, prompt_template) for one_text in text]
+                if isinstance(text[0], list):  # NOTE(review): apply_text_to_template only returns str, so this looks unreachable
+                    tokenize_input_type = "list"
+            elif isinstance(text, str):
+                text = self.apply_text_to_template(text, prompt_template)
+                if isinstance(text, list):
+                    tokenize_input_type = "list"
+            else:
+                raise TypeError(f"Unsupported text type: {type(text)}")
+
+        kwargs = dict(  # shared by both tokenizer entry points: fixed-length, truncated, PyTorch tensors
+            truncation=True,
+            max_length=self.max_length,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        if tokenize_input_type == "str":
+            return self.tokenizer(
+                text,
+                return_length=False,
+                return_overflowing_tokens=False,
+                return_attention_mask=True,
+                **kwargs,
+            )
+        elif tokenize_input_type == "list":
+            return self.tokenizer.apply_chat_template(
+                text,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                **kwargs,
+            )
+        else:
+            raise ValueError(f"Unsupported tokenize_input_type: {tokenize_input_type}")
+
+    def encode(
+        self,
+        batch_encoding,
+        use_attention_mask=None,
+        output_hidden_states=False,
+        do_sample=None,
+        hidden_state_skip_layer=None,
+        return_texts=False,
+        data_type="image",
+        device=None,
+    ):
+        """
+        Run the wrapped model on a tokenized batch and return a `TextEncoderModelOutput`.
+        Args:
+            batch_encoding (dict): Batch encoding from tokenizer.
+            use_attention_mask (bool): Whether to use attention mask. If None (default), use self.use_attention_mask.
+            output_hidden_states (bool): If True, also return every hidden state in `hidden_states_list`.
+                Hidden states are requested from the model whenever a skip layer is in effect. Defaults to False.
+            do_sample (bool): Passed through `use_default` with `not self.reproduce`; the result is not used afterwards.
+            hidden_state_skip_layer (int): Index from the end into the hidden states; 0 means the last layer.
+                If None, use self.hidden_state_skip_layer; if that is also None, self.output_key will be used.
+            return_texts (bool): Accepted but not used by this method. Defaults to False.
+            data_type (str): "image" or "video"; selects which prompt template supplies `crop_start`.
+            device: Device the inputs are moved to. Defaults to the model's device.
+        """
+        device = self.model.device if device is None else device
+        use_attention_mask = use_default(use_attention_mask, self.use_attention_mask)
+        hidden_state_skip_layer = use_default(hidden_state_skip_layer, self.hidden_state_skip_layer)
+        do_sample = use_default(do_sample, not self.reproduce)
+        attention_mask = batch_encoding["attention_mask"].to(device) if use_attention_mask else None
+        outputs = self.model(
+            input_ids=batch_encoding["input_ids"].to(device),
+            attention_mask=attention_mask,
+            output_hidden_states=output_hidden_states or hidden_state_skip_layer is not None,
+        )
+        if hidden_state_skip_layer is not None:
+            last_hidden_state = outputs.hidden_states[-(hidden_state_skip_layer + 1)]
+            # Real last hidden state already has layer norm applied. So here we only apply it
+            # for intermediate layers.
+            if hidden_state_skip_layer > 0 and self.apply_final_norm:
+                last_hidden_state = self.model.final_layer_norm(last_hidden_state)
+        else:
+            last_hidden_state = outputs[self.output_key]
+
+        # Remove hidden states of instruction tokens, only keep prompt tokens.
+        if self.use_template:
+            if data_type == "image":
+                crop_start = self.prompt_template.get("crop_start", -1)
+            elif data_type == "video":
+                crop_start = self.prompt_template_video.get("crop_start", -1)
+            else:
+                raise ValueError(f"Unsupported data type: {data_type}")
+            if crop_start > 0:  # the -1 default (template without crop_start) leaves the outputs uncropped
+                last_hidden_state = last_hidden_state[:, crop_start:]
+                attention_mask = attention_mask[:, crop_start:] if use_attention_mask else None
+
+        if output_hidden_states:
+            return TextEncoderModelOutput(last_hidden_state, attention_mask, outputs.hidden_states)
+        return TextEncoderModelOutput(last_hidden_state, attention_mask)
+
+    def forward(  # tokenize `text` with text2tokens(), then run encode() on the result
+        self,
+        text,
+        use_attention_mask=None,
+        output_hidden_states=False,
+        do_sample=False,  # differs from encode(), whose own default is None
+        hidden_state_skip_layer=None,
+        return_texts=False,
+    ):
+        batch_encoding = self.text2tokens(text)  # data_type is not forwarded, so the "image" default applies
+        return self.encode(
+            batch_encoding,
+            use_attention_mask=use_attention_mask,
+            output_hidden_states=output_hidden_states,
+            do_sample=do_sample,
+            hidden_state_skip_layer=hidden_state_skip_layer,
+            return_texts=return_texts,
+        )
+
+
+# region HunyuanVideo architecture
+
+
+def load_text_encoder_1(
+    text_encoder_dir: str, device: torch.device, fp8_llm: bool, dtype: Optional[Union[str, torch.dtype]] = None
+) -> TextEncoder:
+    text_encoder_dtype = dtype or torch.float16  # float16 when no dtype is given
+    text_encoder_type = "llm"
+    text_len = 256
+    hidden_state_skip_layer = 2  # encode() will read hidden_states[-3]
+    apply_final_norm = False
+    reproduce = False
+
+    prompt_template = "dit-llm-encode"
+    prompt_template = PROMPT_TEMPLATE[prompt_template]  # name -> template dict
+    prompt_template_video = "dit-llm-encode-video"
+    prompt_template_video = PROMPT_TEMPLATE[prompt_template_video]  # name -> template dict
+
+    crop_start = prompt_template_video["crop_start"]  # KeyError if missing (no .get("crop_start", 0) fallback)
+    max_length = text_len + crop_start  # text_len plus the video template's crop_start
+
+    text_encoder_1 = TextEncoder(
+        text_encoder_type=text_encoder_type,
+        max_length=max_length,
+        text_encoder_dtype=text_encoder_dtype,
+        text_encoder_path=text_encoder_dir,
+        tokenizer_type=text_encoder_type,
+        prompt_template=prompt_template,
+        prompt_template_video=prompt_template_video,
+        hidden_state_skip_layer=hidden_state_skip_layer,
+        apply_final_norm=apply_final_norm,
+        reproduce=reproduce,
+    )
+    text_encoder_1.eval()
+
+    if fp8_llm:
+        org_dtype = text_encoder_1.dtype  # remembered before the fp8 cast below
+        logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
+        text_encoder_1.to(device=device, dtype=torch.float8_e4m3fn)
+
+        # prepare LLM for fp8
+        def prepare_fp8(llama_model: LlamaModel, target_dtype):
+            def forward_hook(module):  # builds a replacement forward for `module` (not a registered torch hook)
+                def forward(hidden_states):
+                    input_dtype = hidden_states.dtype
+                    hidden_states = hidden_states.to(torch.float32)  # do the normalization math in float32
+                    variance = hidden_states.pow(2).mean(-1, keepdim=True)
+                    hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
+                    return module.weight.to(input_dtype) * hidden_states.to(input_dtype)  # weight cast to the input dtype
+
+                return forward
+
+            for module in llama_model.modules():
+                if module.__class__.__name__ in ["Embedding"]:
+                    # print("set", module.__class__.__name__, "to", target_dtype)
+                    module.to(target_dtype)  # embeddings are cast to target_dtype instead of staying fp8
+                if module.__class__.__name__ in ["LlamaRMSNorm"]:
+                    # print("set", module.__class__.__name__, "hooks")
+                    module.forward = forward_hook(module)  # override the norm's forward on this instance
+
+        prepare_fp8(text_encoder_1.model, org_dtype)
+    else:
+        text_encoder_1.to(device=device)
+
+    return text_encoder_1
+
+
+def load_text_encoder_2(
+    text_encoder_dir: str, device: torch.device, dtype: Optional[Union[str, torch.dtype]] = None
+) -> TextEncoder:
+    text_encoder_dtype = dtype or torch.float16  # float16 when no dtype is given
+    reproduce = False
+
+    text_encoder_2_type = "clipL"
+    text_len_2 = 77  # passed as max_length, i.e. the tokenizer pads/truncates to 77 tokens
+
+    text_encoder_2 = TextEncoder(  # no prompt templates are passed, so inputs are tokenized as-is
+        text_encoder_type=text_encoder_2_type,
+        max_length=text_len_2,
+        text_encoder_dtype=text_encoder_dtype,
+        text_encoder_path=text_encoder_dir,
+        tokenizer_type=text_encoder_2_type,
+        reproduce=reproduce,
+    )
+    text_encoder_2.eval()
+
+    text_encoder_2.to(device=device)
+
+    return text_encoder_2
+
+
+# endregion
+
+
+if __name__ == "__main__":  # manual check: load two checkpoints of one encoder type and compare their outputs
+    import argparse
+    from utils.model_utils import str_to_dtype
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("type", type=str, choices=["clipL", "llm"], help="Text Encoder type")
+    parser.add_argument("path1", type=str, help="Text Encoder directory or file 1")
+    parser.add_argument("path2", type=str, help="Text Encoder directory or file 2")
+    parser.add_argument("--dtype", type=str, default=None, help="Data type for Text Encoder")
+    args = parser.parse_args()
+
+    dtype = str_to_dtype(args.dtype) if args.dtype is not None else torch.float16
+
+    """
+    if args.type == "clipL":
+        text_encoder_1st = load_clip_l(args.path1, dtype=dtype)
+        tokenizer_1st = load_clip_l_tokenizer(args.path1)
+        text_encoder_2nd = load_clip_l(args.path2, dtype=dtype)
+        tokenizer_2nd = load_clip_l_tokenizer(args.path2)
+    elif args.type == "llm":
+        text_encoder_1st = load_llm(args.path1, dtype=dtype)
+        tokenizer_1st = load_llm_tokenizer(args.path1)
+        text_encoder_2nd = load_llm(args.path2, dtype=dtype)
+        tokenizer_2nd = load_llm_tokenizer(args.path2)
+
+    print(f"1st Text Encoder dtype: {text_encoder_1st.dtype}")
+    print(f"2nd Text Encoder dtype: {text_encoder_2nd.dtype}")
+
+    text_encoder_1st.to(device=device)
+    text_encoder_2nd.to(device=device)
+
+    test_text = "A cat sitting on a table"
+    token_ids_1st = tokenizer_1st(test_text, return_tensors="pt")["input_ids"]
+    token_ids_2nd = tokenizer_2nd(test_text, return_tensors="pt")["input_ids"]
+    assert torch.allclose(token_ids_1st, token_ids_2nd)
+    print(f"Token IDs are the same: {token_ids_1st}")
+
+    with torch.no_grad():
+        text_encoder_1st_output = text_encoder_1st(token_ids_1st.to(device), output_hidden_states=True)
+        text_encoder_2nd_output = text_encoder_2nd(token_ids_2nd.to(device), output_hidden_states=True)
+    print(f"1st Text Encoder output keys: {text_encoder_1st_output.keys()}")
+    print(f"2nd Text Encoder output keys: {text_encoder_2nd_output.keys()}")
+    for key in text_encoder_1st_output:
+        print(f"Checking output: {key}")
+        assert key in text_encoder_2nd_output, f"Key {key} not in 2nd Text Encoder output"
+        assert torch.allclose(text_encoder_1st_output[key], text_encoder_2nd_output[key])
+        print(f"Outputs are the same: {key}")
+    print("All outputs are the same.")
+    """
+
+    if args.type == "clipL":  # argparse `choices` guarantees one of the two branches runs
+        text_encoder_1st = load_text_encoder_2(args.path1, device, dtype)
+        text_encoder_2nd = load_text_encoder_2(args.path2, device, dtype)
+    elif args.type == "llm":
+        text_encoder_1st = load_text_encoder_1(args.path1, device, False, dtype)
+        text_encoder_2nd = load_text_encoder_1(args.path2, device, False, dtype)
+    print(f"1st Text Encoder dtype: {text_encoder_1st.dtype}")
+    print(f"2nd Text Encoder dtype: {text_encoder_2nd.dtype}")
+
+    prompt = "A cat sitting on a table"
+    data_type = "video"  # video only, image is not supported
+    text_inputs_1st = text_encoder_1st.text2tokens(prompt, data_type=data_type)
+    text_inputs_2nd = text_encoder_2nd.text2tokens(prompt, data_type=data_type)
+    print(text_inputs_1st)
+    assert torch.allclose(text_inputs_1st["input_ids"], text_inputs_2nd["input_ids"])
+
+    with torch.no_grad():
+        prompt_outputs_1st = text_encoder_1st.encode(text_inputs_1st, data_type=data_type)
+        prompt_outputs_2nd = text_encoder_2nd.encode(text_inputs_2nd, data_type=data_type)  # each encoder gets its own tokens
+
+    # prompt_outputs.hidden_state, prompt_outputs.attention_mask
+    assert torch.allclose(prompt_outputs_1st.hidden_state, prompt_outputs_2nd.hidden_state)
+    print("Hidden states are the same.")
+    assert torch.allclose(prompt_outputs_1st.attention_mask, prompt_outputs_2nd.attention_mask)
+    print("Attention masks are the same.")
+    print("All outputs are the same.")
diff --git a/hunyuan_model/token_refiner.py b/hunyuan_model/token_refiner.py
new file mode 100644
index 0000000000000000000000000000000000000000..378bbab7d5b5483f552bc37699650506dc6f790c
--- /dev/null
+++ b/hunyuan_model/token_refiner.py
@@ -0,0 +1,245 @@
+from typing import Optional
+
+from einops import rearrange
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+
+from .activation_layers import get_activation_layer
+from .attention import attention
+from .norm_layers import get_norm_layer
+from .embed_layers import TimestepEmbedder, TextProjection
+from .mlp_layers import MLP
+from .modulate_layers import modulate, apply_gate
+
+
+class IndividualTokenRefinerBlock(nn.Module):  # gated self-attention + MLP block conditioned on a vector `c`
+    def __init__(
+        self,
+        hidden_size,
+        heads_num,
+        mlp_width_ratio: float = 4.0,
+        mlp_drop_rate: float = 0.0,
+        act_type: str = "silu",
+        qk_norm: bool = False,
+        qk_norm_type: str = "layer",
+        qkv_bias: bool = True,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}  # forwarded to every submodule built below
+        super().__init__()
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+
+        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+        self.self_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.self_attn_q_norm = (  # per-head norm on q when qk_norm is set, identity otherwise
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
+        )
+        self.self_attn_k_norm = (  # same choice for k
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
+        )
+        self.self_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
+
+        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+        act_layer = get_activation_layer(act_type)
+        self.mlp = MLP(
+            in_channels=hidden_size,
+            hidden_channels=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=mlp_drop_rate,
+            **factory_kwargs,
+        )
+
+        self.adaLN_modulation = nn.Sequential(  # maps c to 2 * hidden_size values, split into two gates in _forward
+            act_layer(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
+        )
+        # Zero-initialize the modulation, so adaLN_modulation initially outputs zeros
+        nn.init.zeros_(self.adaLN_modulation[1].weight)
+        nn.init.zeros_(self.adaLN_modulation[1].bias)
+
+        self.gradient_checkpointing = False  # toggled by enable/disable_gradient_checkpointing()
+
+    def enable_gradient_checkpointing(self):
+        self.gradient_checkpointing = True
+
+    def disable_gradient_checkpointing(self):
+        self.gradient_checkpointing = False
+
+    def _forward(
+        self,
+        x: torch.Tensor,
+        c: torch.Tensor,  # timestep_aware_representations + context_aware_representations
+        attn_mask: Optional[torch.Tensor] = None,
+    ):
+        gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)  # assumes c is (batch, hidden_size) — TODO confirm
+
+        norm_x = self.norm1(x)
+        qkv = self.self_attn_qkv(norm_x)
+        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
+        # Apply QK-Norm if needed
+        q = self.self_attn_q_norm(q).to(v)  # .to(v): match v's dtype/device after the norm
+        k = self.self_attn_k_norm(k).to(v)
+
+        # Self-Attention
+        attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
+
+        x = x + apply_gate(self.self_attn_proj(attn), gate_msa)  # gated residual around attention
+
+        # FFN Layer
+        x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)  # gated residual around the MLP
+
+        return x
+
+    def forward(self, *args, **kwargs):  # same arguments as _forward
+        if self.training and self.gradient_checkpointing:  # checkpoint only in training mode
+            return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
+        else:
+            return self._forward(*args, **kwargs)
+
+
+class IndividualTokenRefiner(nn.Module):  # stack of `depth` IndividualTokenRefinerBlock layers sharing one attention mask
+    def __init__(
+        self,
+        hidden_size,
+        heads_num,
+        depth,
+        mlp_width_ratio: float = 4.0,
+        mlp_drop_rate: float = 0.0,
+        act_type: str = "silu",
+        qk_norm: bool = False,
+        qk_norm_type: str = "layer",
+        qkv_bias: bool = True,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}  # forwarded to every block
+        super().__init__()
+        self.blocks = nn.ModuleList(
+            [
+                IndividualTokenRefinerBlock(
+                    hidden_size=hidden_size,
+                    heads_num=heads_num,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_drop_rate=mlp_drop_rate,
+                    act_type=act_type,
+                    qk_norm=qk_norm,
+                    qk_norm_type=qk_norm_type,
+                    qkv_bias=qkv_bias,
+                    **factory_kwargs,
+                )
+                for _ in range(depth)
+            ]
+        )
+
+    def enable_gradient_checkpointing(self):  # applies the setting to every block
+        for block in self.blocks:
+            block.enable_gradient_checkpointing()
+
+    def disable_gradient_checkpointing(self):  # applies the setting to every block
+        for block in self.blocks:
+            block.disable_gradient_checkpointing()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        c: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ):
+        self_attn_mask = None  # stays None (no masking) when no mask is given
+        if mask is not None:
+            batch_size = mask.shape[0]
+            seq_len = mask.shape[1]
+            mask = mask.to(x.device)
+            # batch_size x 1 x seq_len x seq_len
+            self_attn_mask_1 = mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
+            # batch_size x 1 x seq_len x seq_len
+            self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
+            # batch_size x 1 x seq_len x seq_len, 1 for broadcasting of heads_num
+            self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()  # NOTE(review): `&` presumably needs a bool/integer mask
+            # avoids self-attention weight being NaN for padding tokens
+            self_attn_mask[:, :, :, 0] = True
+
+        for block in self.blocks:  # blocks run sequentially, each receiving the same c and mask
+            x = block(x, c, self_attn_mask)
+        return x
+
+
+class SingleTokenRefiner(nn.Module):
+    """
+    Refines LLM text embeddings: embeds them, builds a timestep + context conditioning vector, runs the refiner blocks.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        hidden_size,
+        heads_num,
+        depth,
+        mlp_width_ratio: float = 4.0,
+        mlp_drop_rate: float = 0.0,
+        act_type: str = "silu",
+        qk_norm: bool = False,
+        qk_norm_type: str = "layer",
+        qkv_bias: bool = True,
+        attn_mode: str = "torch",
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}  # forwarded to every submodule built below
+        super().__init__()
+        self.attn_mode = attn_mode
+        assert self.attn_mode == "torch", "Only support 'torch' mode for token refiner."
+
+        self.input_embedder = nn.Linear(in_channels, hidden_size, bias=True, **factory_kwargs)
+
+        act_layer = get_activation_layer(act_type)
+        # Build timestep embedding layer
+        self.t_embedder = TimestepEmbedder(hidden_size, act_layer, **factory_kwargs)
+        # Build context embedding layer
+        self.c_embedder = TextProjection(in_channels, hidden_size, act_layer, **factory_kwargs)
+
+        self.individual_token_refiner = IndividualTokenRefiner(
+            hidden_size=hidden_size,
+            heads_num=heads_num,
+            depth=depth,
+            mlp_width_ratio=mlp_width_ratio,
+            mlp_drop_rate=mlp_drop_rate,
+            act_type=act_type,
+            qk_norm=qk_norm,
+            qk_norm_type=qk_norm_type,
+            qkv_bias=qkv_bias,
+            **factory_kwargs,
+        )
+
+    def enable_gradient_checkpointing(self):  # delegates to the inner refiner stack
+        self.individual_token_refiner.enable_gradient_checkpointing()
+
+    def disable_gradient_checkpointing(self):  # delegates to the inner refiner stack
+        self.individual_token_refiner.disable_gradient_checkpointing()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.LongTensor,
+        mask: Optional[torch.LongTensor] = None,
+    ):
+        timestep_aware_representations = self.t_embedder(t)
+
+        if mask is None:
+            context_aware_representations = x.mean(dim=1)  # plain mean over dim 1 (the sequence axis, per the mask shape note)
+        else:
+            mask_float = mask.float().unsqueeze(-1)  # [b, s1, 1]
+            context_aware_representations = (x * mask_float).sum(dim=1) / mask_float.sum(dim=1)  # NOTE(review): all-zero mask row divides by 0
+        context_aware_representations = self.c_embedder(context_aware_representations)
+        c = timestep_aware_representations + context_aware_representations  # conditioning vector shared by all blocks
+
+        x = self.input_embedder(x)  # pooling above used the raw x; tokens are projected to hidden_size only here
+
+        x = self.individual_token_refiner(x, c, mask)
+
+        return x
diff --git a/hunyuan_model/vae.py b/hunyuan_model/vae.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ae718a5634e98e53a0c0dec85254228229a01c3
--- /dev/null
+++ b/hunyuan_model/vae.py
@@ -0,0 +1,446 @@
+from dataclasses import dataclass
+import json
+from typing import Optional, Tuple, Union
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from diffusers.utils import BaseOutput, is_torch_version
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.models.attention_processor import SpatialNorm
+from modules.unet_causal_3d_blocks import CausalConv3d, UNetMidBlockCausal3D, get_down_block3d, get_up_block3d
+
+import logging
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)  # NOTE(review): runs at import time, so importing this module configures root logging
+
+
+SCALING_FACTOR = 0.476986  # same value as "scaling_factor" in the fixed config inside load_vae
+VAE_VER = "884-16c-hy"  # We don't support other versions currently
+
+
+def load_vae(
+    vae_type: str = "884-16c-hy",
+    vae_dtype: Optional[Union[str, torch.dtype]] = None,
+    sample_size: tuple = None,
+    vae_path: str = None,
+    device=None,
+):
+    """the fucntion to load the 3D VAE model
+
+    Args:
+        vae_type (str): the type of the 3D VAE model. Defaults to "884-16c-hy".
+        vae_precision (str, optional): the precision to load vae. Defaults to None.
+        sample_size (tuple, optional): the tiling size. Defaults to None.
+        vae_path (str, optional): the path to vae. Defaults to None.
+        logger (_type_, optional): logger. Defaults to None.
+        device (_type_, optional): device to load vae. Defaults to None.
+    """
+    if vae_path is None:
+        vae_path = VAE_PATH[vae_type]
+
+    logger.info(f"Loading 3D VAE model ({vae_type}) from: {vae_path}")
+
+    # use fixed config for Hunyuan's VAE
+    CONFIG_JSON = """{
+    "_class_name": "AutoencoderKLCausal3D",
+    "_diffusers_version": "0.4.2",
+    "act_fn": "silu",
+    "block_out_channels": [
+      128,
+      256,
+      512,
+      512
+    ],
+    "down_block_types": [
+      "DownEncoderBlockCausal3D",
+      "DownEncoderBlockCausal3D",
+      "DownEncoderBlockCausal3D",
+      "DownEncoderBlockCausal3D"
+    ],
+    "in_channels": 3,
+    "latent_channels": 16,
+    "layers_per_block": 2,
+    "norm_num_groups": 32,
+    "out_channels": 3,
+    "sample_size": 256,
+    "sample_tsize": 64,
+    "up_block_types": [
+      "UpDecoderBlockCausal3D",
+      "UpDecoderBlockCausal3D",
+      "UpDecoderBlockCausal3D",
+      "UpDecoderBlockCausal3D"
+    ],
+    "scaling_factor": 0.476986,
+    "time_compression_ratio": 4,
+    "mid_block_add_attention": true
+  }"""
+
+    # config = AutoencoderKLCausal3D.load_config(vae_path)
+    config = json.loads(CONFIG_JSON)
+
+    # import here to avoid circular import
+    from .autoencoder_kl_causal_3d import AutoencoderKLCausal3D
+
+    if sample_size:
+        vae = AutoencoderKLCausal3D.from_config(config, sample_size=sample_size)
+    else:
+        vae = AutoencoderKLCausal3D.from_config(config)
+
+    # vae_ckpt = Path(vae_path) / "pytorch_model.pt"
+    # assert vae_ckpt.exists(), f"VAE checkpoint not found: {vae_ckpt}"
+
+    if vae_path.endswith(".safetensors"):
+        from safetensors.torch import load_file
+        ckpt = load_file(vae_path)
+    else:
+        ckpt = torch.load(vae_path, map_location=vae.device, weights_only=True)
+        if "state_dict" in ckpt:
+            ckpt = ckpt["state_dict"]
+    if any(k.startswith("vae.") for k in ckpt.keys()):
+        ckpt = {k.replace("vae.", ""): v for k, v in ckpt.items() if k.startswith("vae.")}
+    vae.load_state_dict(ckpt)
+
+    spatial_compression_ratio = vae.config.spatial_compression_ratio
+    time_compression_ratio = vae.config.time_compression_ratio
+
+    if vae_dtype is not None:
+        vae = vae.to(vae_dtype)
+
+    vae.requires_grad_(False)
+
+    logger.info(f"VAE to dtype: {vae.dtype}")
+
+    if device is not None:
+        vae = vae.to(device)
+
+    vae.eval()
+
+    return vae, vae_path, spatial_compression_ratio, time_compression_ratio
+
+
+@dataclass
+class DecoderOutput(BaseOutput):
+    r"""
+    Output of decoding method.
+
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            The decoded output sample from the last layer of the model.
+    """
+
+    sample: torch.FloatTensor  # NOTE(review): the docstring shape is 4D; the causal 3D decoder in this module asserts 5D input, so the output is presumably 5D too — confirm
+
+
+class EncoderCausal3D(nn.Module):
+    r"""
+    The `EncoderCausal3D` layer of a variational autoencoder that encodes its input into a latent representation: `conv_in` -> down blocks -> mid block -> GroupNorm / SiLU / `conv_out`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlockCausal3D",),
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        double_z: bool = True,  # when True, conv_out emits 2 * out_channels channels
+        mid_block_add_attention=True,
+        time_compression_ratio: int = 4,  # only 4 is accepted; any other value raises ValueError below
+        spatial_compression_ratio: int = 8,
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1)
+        self.mid_block = None  # placeholder; assigned once the down blocks are built
+        self.down_blocks = nn.ModuleList([])
+
+        # down: one block per entry of down_block_types, chaining channel widths from block_out_channels
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            num_spatial_downsample_layers = int(np.log2(spatial_compression_ratio))  # loop-invariant
+            num_time_downsample_layers = int(np.log2(time_compression_ratio))  # loop-invariant
+
+            if time_compression_ratio == 4:
+                add_spatial_downsample = bool(i < num_spatial_downsample_layers)  # the first log2(spatial ratio) blocks
+                add_time_downsample = bool(i >= (len(block_out_channels) - 1 - num_time_downsample_layers) and not is_final_block)  # e.g. blocks 1 and 2 when there are four blocks
+            else:
+                raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}.")
+
+            downsample_stride_HW = (2, 2) if add_spatial_downsample else (1, 1)
+            downsample_stride_T = (2,) if add_time_downsample else (1,)
+            downsample_stride = tuple(downsample_stride_T + downsample_stride_HW)  # ordered (T, H, W)
+            down_block = get_down_block3d(
+                down_block_type,
+                num_layers=self.layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                add_downsample=bool(add_spatial_downsample or add_time_downsample),
+                downsample_stride=downsample_stride,
+                resnet_eps=1e-6,
+                downsample_padding=0,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                attention_head_dim=output_channel,
+                temb_channels=None,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid: operates at the width of the last down block
+        self.mid_block = UNetMidBlockCausal3D(
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            resnet_time_scale_shift="default",
+            attention_head_dim=block_out_channels[-1],
+            resnet_groups=norm_num_groups,
+            temb_channels=None,
+            add_attention=mid_block_add_attention,
+        )
+
+        # out: normalise, activate, project to the latent channels
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
+        self.conv_act = nn.SiLU()  # fixed SiLU here, independent of act_fn
+
+        conv_out_channels = 2 * out_channels if double_z else out_channels
+        self.conv_out = CausalConv3d(block_out_channels[-1], conv_out_channels, kernel_size=3)
+
+    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+        r"""The forward method of the `EncoderCausal3D` class."""
+        assert len(sample.shape) == 5, "The input tensor should have 5 dimensions"  # presumably (B, C, T, H, W) — confirm against callers
+
+        sample = self.conv_in(sample)
+
+        # down
+        for down_block in self.down_blocks:
+            sample = down_block(sample)
+
+        # middle
+        sample = self.mid_block(sample)
+
+        # post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class DecoderCausal3D(nn.Module):
+    r"""
+    The `DecoderCausal3D` layer of a variational autoencoder that decodes its latent representation into an output sample: `conv_in` -> mid block -> up blocks -> norm / SiLU / `conv_out`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlockCausal3D",),
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        norm_type: str = "group",  # group, spatial
+        mid_block_add_attention=True,
+        time_compression_ratio: int = 4,  # only 4 is accepted; any other value raises ValueError below
+        spatial_compression_ratio: int = 8,
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = CausalConv3d(in_channels, block_out_channels[-1], kernel_size=3, stride=1)
+        self.mid_block = None  # placeholder; assigned just below
+        self.up_blocks = nn.ModuleList([])
+
+        temb_channels = in_channels if norm_type == "spatial" else None  # embedding channels only for spatial norm
+
+        # mid
+        self.mid_block = UNetMidBlockCausal3D(
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            resnet_time_scale_shift="default" if norm_type == "group" else norm_type,
+            attention_head_dim=block_out_channels[-1],
+            resnet_groups=norm_num_groups,
+            temb_channels=temb_channels,
+            add_attention=mid_block_add_attention,
+        )
+
+        # up: walk block_out_channels from widest to narrowest
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            num_spatial_upsample_layers = int(np.log2(spatial_compression_ratio))  # loop-invariant
+            num_time_upsample_layers = int(np.log2(time_compression_ratio))  # loop-invariant
+
+            if time_compression_ratio == 4:
+                add_spatial_upsample = bool(i < num_spatial_upsample_layers)  # the first log2(spatial ratio) blocks
+                add_time_upsample = bool(i >= len(block_out_channels) - 1 - num_time_upsample_layers and not is_final_block)  # e.g. blocks 1 and 2 when there are four blocks
+            else:
+                raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}.")
+
+            upsample_scale_factor_HW = (2, 2) if add_spatial_upsample else (1, 1)
+            upsample_scale_factor_T = (2,) if add_time_upsample else (1,)
+            upsample_scale_factor = tuple(upsample_scale_factor_T + upsample_scale_factor_HW)  # ordered (T, H, W)
+            up_block = get_up_block3d(
+                up_block_type,
+                num_layers=self.layers_per_block + 1,
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                prev_output_channel=None,
+                add_upsample=bool(add_spatial_upsample or add_time_upsample),
+                upsample_scale_factor=upsample_scale_factor,
+                resnet_eps=1e-6,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                attention_head_dim=output_channel,
+                temb_channels=temb_channels,
+                resnet_time_scale_shift=norm_type,  # NOTE(review): the mid block maps "group" to "default", this passes norm_type unchanged — confirm intended
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel  # redundant: reassigned at the top of the next iteration, unused after the loop
+
+        # out
+        if norm_type == "spatial":
+            self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels)
+        else:
+            self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
+        self.conv_act = nn.SiLU()  # fixed SiLU here, independent of act_fn
+        self.conv_out = CausalConv3d(block_out_channels[0], out_channels, kernel_size=3)
+
+        self.gradient_checkpointing = False  # read in forward(); nothing in this class sets it to True
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        latent_embeds: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        r"""The forward method of the `DecoderCausal3D` class."""
+        assert len(sample.shape) == 5, "The input tensor should have 5 dimensions."  # presumably (B, C, T, H, W) — confirm against callers
+
+        sample = self.conv_in(sample)
+
+        upscale_dtype = next(iter(self.up_blocks.parameters())).dtype  # dtype of the first up-block parameter; sample is cast to it after the mid block
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            if is_torch_version(">=", "1.11.0"):  # use_reentrant is only passed on this branch
+                # middle
+                sample = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.mid_block),
+                    sample,
+                    latent_embeds,
+                    use_reentrant=False,
+                )
+                sample = sample.to(upscale_dtype)
+
+                # up
+                for up_block in self.up_blocks:
+                    sample = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(up_block),
+                        sample,
+                        latent_embeds,
+                        use_reentrant=False,
+                    )
+            else:
+                # middle
+                sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample, latent_embeds)
+                sample = sample.to(upscale_dtype)
+
+                # up
+                for up_block in self.up_blocks:
+                    sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
+        else:
+            # middle
+            sample = self.mid_block(sample, latent_embeds)
+            sample = sample.to(upscale_dtype)
+
+            # up
+            for up_block in self.up_blocks:
+                sample = up_block(sample, latent_embeds)
+
+        # post-process: conv_norm_out receives latent_embeds only when they were supplied
+        if latent_embeds is None:
+            sample = self.conv_norm_out(sample)
+        else:
+            sample = self.conv_norm_out(sample, latent_embeds)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class DiagonalGaussianDistribution(object):
+    def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
+        if parameters.ndim == 3:
+            dim = 2  # (B, L, C)
+        elif parameters.ndim == 5 or parameters.ndim == 4:
+            dim = 1  # (B, C, T, H ,W) / (B, C, H, W)
+        else:
+            raise NotImplementedError
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            self.var = self.std = torch.zeros_like(self.mean, device=self.parameters.device, dtype=self.parameters.dtype)
+
+    def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
+        # make sure sample is on the same device as the parameters and has same dtype
+        sample = randn_tensor(
+            self.mean.shape,
+            generator=generator,
+            device=self.parameters.device,
+            dtype=self.parameters.dtype,
+        )
+        x = self.mean + self.std * sample
+        return x
+
+    def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor:
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        else:
+            reduce_dim = list(range(1, self.mean.ndim))
+            if other is None:
+                return 0.5 * torch.sum(
+                    torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
+                    dim=reduce_dim,
+                )
+            else:
+                return 0.5 * torch.sum(
+                    torch.pow(self.mean - other.mean, 2) / other.var + self.var / other.var - 1.0 - self.logvar + other.logvar,
+                    dim=reduce_dim,
+                )
+
+    def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor:
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        logtwopi = np.log(2.0 * np.pi)
+        return 0.5 * torch.sum(
+            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims,
+        )
+
+    def mode(self) -> torch.Tensor:
+        return self.mean
diff --git a/hv_generate_video.py b/hv_generate_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7ba4c03348d5ca484b3ff1d91cc88aab34c6556
--- /dev/null
+++ b/hv_generate_video.py
@@ -0,0 +1,952 @@
+import argparse
+from datetime import datetime
+from pathlib import Path
+import random
+import sys
+import os
+import time
+from typing import Optional, Union
+
+import numpy as np
+import torch
+import torchvision
+import accelerate
+from diffusers.utils.torch_utils import randn_tensor
+from transformers.models.llama import LlamaModel
+from tqdm import tqdm
+import av
+from einops import rearrange
+from safetensors.torch import load_file, save_file
+from safetensors import safe_open
+from PIL import Image
+
+from hunyuan_model import vae
+from hunyuan_model.text_encoder import TextEncoder
+from hunyuan_model.text_encoder import PROMPT_TEMPLATE
+from hunyuan_model.vae import load_vae
+from hunyuan_model.models import load_transformer, get_rotary_pos_embed
+from hunyuan_model.fp8_optimization import convert_fp8_linear
+from modules.scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
+from networks import lora
+
+try:
+    from lycoris.kohya import create_network_from_weights
+except:
+    pass
+
+from utils.model_utils import str_to_dtype
+from utils.safetensors_utils import mem_eff_save_file
+from dataset.image_video_dataset import load_video, glob_images, resize_image_to_bucket
+
+import logging
+
+logger = logging.getLogger(__name__)  # module-level logger used by the helpers below
+logging.basicConfig(level=logging.INFO)  # NOTE(review): runs at import time, so importing this module configures root logging
+
+
+def clean_memory_on_device(device):
+    if device.type == "cuda":
+        torch.cuda.empty_cache()
+    elif device.type == "cpu":
+        pass
+    elif device.type == "mps":  # not tested
+        torch.mps.empty_cache()
+
+
+def synchronize_device(device: torch.device):
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+    elif device.type == "xpu":
+        torch.xpu.synchronize()
+    elif device.type == "mps":
+        torch.mps.synchronize()
+
+
+def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=1, fps=24):
+    """save videos by video tensor
+       copy from https://github.com/guoyww/AnimateDiff/blob/e92bd5671ba62c0d774a32951453e328018b7c5b/animatediff/utils/util.py#L61
+
+    Args:
+        videos (torch.Tensor): video tensor predicted by the model
+        path (str): path to save video
+        rescale (bool, optional): rescale the video tensor from [-1, 1] to  . Defaults to False.
+        n_rows (int, optional): Defaults to 1.
+        fps (int, optional): video save fps. Defaults to 8.
+    """
+    videos = rearrange(videos, "b c t h w -> t b c h w")
+    outputs = []
+    for x in videos:
+        x = torchvision.utils.make_grid(x, nrow=n_rows)
+        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
+        if rescale:
+            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
+        x = torch.clamp(x, 0, 1)
+        x = (x * 255).numpy().astype(np.uint8)
+        outputs.append(x)
+
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+
+    # # save video with av
+    # container = av.open(path, "w")
+    # stream = container.add_stream("libx264", rate=fps)
+    # for x in outputs:
+    #     frame = av.VideoFrame.from_ndarray(x, format="rgb24")
+    #     packet = stream.encode(frame)
+    #     container.mux(packet)
+    # packet = stream.encode(None)
+    # container.mux(packet)
+    # container.close()
+
+    height, width, _ = outputs[0].shape
+
+    # create output container
+    container = av.open(path, mode="w")
+
+    # create video stream
+    codec = "libx264"
+    pixel_format = "yuv420p"
+    stream = container.add_stream(codec, rate=fps)
+    stream.width = width
+    stream.height = height
+    stream.pix_fmt = pixel_format
+    stream.bit_rate = 4000000  # 4Mbit/s
+
+    for frame_array in outputs:
+        frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
+        packets = stream.encode(frame)
+        for packet in packets:
+            container.mux(packet)
+
+    for packet in stream.encode():
+        container.mux(packet)
+
+    container.close()
+
+
+def save_images_grid(
+    videos: torch.Tensor, parent_dir: str, image_name: str, rescale: bool = False, n_rows: int = 1, create_subdir=True
+):
+    videos = rearrange(videos, "b c t h w -> t b c h w")
+    outputs = []
+    for x in videos:
+        x = torchvision.utils.make_grid(x, nrow=n_rows)
+        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
+        if rescale:
+            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
+        x = torch.clamp(x, 0, 1)
+        x = (x * 255).numpy().astype(np.uint8)
+        outputs.append(x)
+
+    if create_subdir:
+        output_dir = os.path.join(parent_dir, image_name)
+    else:
+        output_dir = parent_dir
+
+    os.makedirs(output_dir, exist_ok=True)
+    for i, x in enumerate(outputs):
+        image_path = os.path.join(output_dir, f"{image_name}_{i:03d}.png")
+        image = Image.fromarray(x)
+        image.save(image_path)
+
+
+# region Encoding prompt
+
+
+def encode_prompt(prompt: Union[str, list[str]], device: torch.device, num_videos_per_prompt: int, text_encoder: TextEncoder):
+    r"""
+    Encodes the prompt into text encoder hidden states.
+
+    Args:
+        prompt (`str` or `List[str]`):
+            prompt to be encoded
+        device: (`torch.device`):
+            torch device
+        num_videos_per_prompt (`int`):
+            number of videos that should be generated per prompt
+        text_encoder (TextEncoder):
+            text encoder to be used for encoding the prompt
+    """
+    # LoRA and Textual Inversion are not supported in this script
+    # negative prompt and prompt embedding are not supported in this script
+    # clip_skip is not supported in this script because it is not used in the original script
+    data_type = "video"  # video only, image is not supported
+
+    text_inputs = text_encoder.text2tokens(prompt, data_type=data_type)
+
+    with torch.no_grad():
+        prompt_outputs = text_encoder.encode(text_inputs, data_type=data_type, device=device)
+    prompt_embeds = prompt_outputs.hidden_state
+
+    attention_mask = prompt_outputs.attention_mask
+    if attention_mask is not None:
+        attention_mask = attention_mask.to(device)
+        bs_embed, seq_len = attention_mask.shape
+        attention_mask = attention_mask.repeat(1, num_videos_per_prompt)
+        attention_mask = attention_mask.view(bs_embed * num_videos_per_prompt, seq_len)
+
+    prompt_embeds_dtype = text_encoder.dtype
+    prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+    if prompt_embeds.ndim == 2:
+        bs_embed, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, -1)
+    else:
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+
+    return prompt_embeds, attention_mask
+
+
+def encode_input_prompt(prompt: Union[str, list[str]], args, device, fp8_llm=False, accelerator=None):
+    """Encode `prompt` with text encoder 1 ("llm") and text encoder 2 ("clipL"), moved to `device` one after the other.
+
+    Returns `(prompt_embeds, prompt_mask, prompt_embeds_2, prompt_mask_2)`, each moved to CPU."""
+    # constants
+    prompt_template_video = "dit-llm-encode-video"
+    prompt_template = "dit-llm-encode"
+    text_encoder_dtype = torch.float16
+    text_encoder_type = "llm"
+    text_len = 256
+    hidden_state_skip_layer = 2
+    apply_final_norm = False
+    reproduce = False
+
+    text_encoder_2_type = "clipL"
+    text_len_2 = 77
+
+    num_videos = 1
+
+    # Template names are fixed by the constants above (selection via args is not wired in).
+    # max_length for text encoder 1 = text_len plus the video template's "crop_start"
+    # (0 when the template has no such key).
+    crop_start = PROMPT_TEMPLATE[prompt_template_video].get("crop_start", 0)
+    max_length = text_len + crop_start
+
+    # prompt_template: from here on the name holds the template entry, not its key
+    prompt_template = PROMPT_TEMPLATE[prompt_template]
+
+    # prompt_template_video: likewise rebound from key to template entry
+    prompt_template_video = PROMPT_TEMPLATE[prompt_template_video]  # if args.prompt_template_video is not None else None
+
+    # load text encoders
+    logger.info(f"loading text encoder: {args.text_encoder1}")
+    text_encoder = TextEncoder(
+        text_encoder_type=text_encoder_type,
+        max_length=max_length,
+        text_encoder_dtype=text_encoder_dtype,
+        text_encoder_path=args.text_encoder1,
+        tokenizer_type=text_encoder_type,
+        prompt_template=prompt_template,
+        prompt_template_video=prompt_template_video,
+        hidden_state_skip_layer=hidden_state_skip_layer,
+        apply_final_norm=apply_final_norm,
+        reproduce=reproduce,
+    )
+    text_encoder.eval()
+    if fp8_llm:
+        org_dtype = text_encoder.dtype  # remembered so Embedding modules can be cast back below
+        logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
+        text_encoder.to(device=device, dtype=torch.float8_e4m3fn)
+
+        # prepare LLM for fp8
+        def prepare_fp8(llama_model: LlamaModel, target_dtype):
+            def forward_hook(module):
+                # replacement forward for LlamaRMSNorm: normalise in float32, return in the input dtype
+                def forward(hidden_states):
+                    input_dtype = hidden_states.dtype
+                    hidden_states = hidden_states.to(torch.float32)
+                    variance = hidden_states.pow(2).mean(-1, keepdim=True)
+                    hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
+                    return module.weight.to(input_dtype) * hidden_states.to(input_dtype)
+
+                return forward
+
+            for module in llama_model.modules():
+                if module.__class__.__name__ in ["Embedding"]:
+                    # Embedding modules are cast to target_dtype (the pre-fp8 dtype passed by the caller)
+                    module.to(target_dtype)
+                if module.__class__.__name__ in ["LlamaRMSNorm"]:
+                    # LlamaRMSNorm gets its forward replaced by the closure above
+                    module.forward = forward_hook(module)
+
+        prepare_fp8(text_encoder.model, org_dtype)
+
+    logger.info(f"loading text encoder 2: {args.text_encoder2}")
+    text_encoder_2 = TextEncoder(
+        text_encoder_type=text_encoder_2_type,
+        max_length=text_len_2,
+        text_encoder_dtype=text_encoder_dtype,
+        text_encoder_path=args.text_encoder2,
+        tokenizer_type=text_encoder_2_type,
+        reproduce=reproduce,
+    )
+    text_encoder_2.eval()
+
+    # encode prompt
+    logger.info(f"Encoding prompt with text encoder 1")
+    text_encoder.to(device=device)
+    if fp8_llm:
+        with accelerator.autocast():  # NOTE(review): accelerator defaults to None, so fp8_llm=True needs a non-None accelerator
+            prompt_embeds, prompt_mask = encode_prompt(prompt, device, num_videos, text_encoder)
+    else:
+        prompt_embeds, prompt_mask = encode_prompt(prompt, device, num_videos, text_encoder)
+    text_encoder = None  # drop the local reference before clearing the device cache
+    clean_memory_on_device(device)
+
+    logger.info(f"Encoding prompt with text encoder 2")
+    text_encoder_2.to(device=device)
+    prompt_embeds_2, prompt_mask_2 = encode_prompt(prompt, device, num_videos, text_encoder_2)
+
+    prompt_embeds = prompt_embeds.to("cpu")
+    prompt_mask = prompt_mask.to("cpu")  # NOTE(review): encode_prompt can return a None mask, which would fail here — confirm both encoders return one
+    prompt_embeds_2 = prompt_embeds_2.to("cpu")
+    prompt_mask_2 = prompt_mask_2.to("cpu")
+
+    text_encoder_2 = None  # drop the local reference before clearing the device cache
+    clean_memory_on_device(device)
+
+    return prompt_embeds, prompt_mask, prompt_embeds_2, prompt_mask_2
+
+
+# endregion
+
+
+def load_images(image_dir, video_length, bucket_reso):
+    image_files = glob_images(image_dir)
+    if len(image_files) == 0:
+        raise ValueError(f"No image files found in {image_dir}")
+    if len(image_files) < video_length:
+        raise ValueError(f"Number of images in {image_dir} is less than {video_length}")
+
+    image_files.sort()
+    images = []
+    for image_file in image_files[:video_length]:
+        image = Image.open(image_file)
+        image = resize_image_to_bucket(image, bucket_reso)  # returns a numpy array
+        images.append(image)
+
+    return images
+
+
+def prepare_vae(args, device):
+    vae_dtype = torch.float16 if args.vae_dtype is None else str_to_dtype(args.vae_dtype)
+    vae, _, s_ratio, t_ratio = load_vae(vae_dtype=vae_dtype, device=device, vae_path=args.vae)
+    vae.eval()
+    # vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}
+
+    # set chunk_size to CausalConv3d recursively
+    chunk_size = args.vae_chunk_size
+    if chunk_size is not None:
+        vae.set_chunk_size_for_causal_conv_3d(chunk_size)
+        logger.info(f"Set chunk_size to {chunk_size} for CausalConv3d")
+
+    if args.vae_spatial_tile_sample_min_size is not None:
+        vae.enable_spatial_tiling(True)
+        vae.tile_sample_min_size = args.vae_spatial_tile_sample_min_size
+        vae.tile_latent_min_size = args.vae_spatial_tile_sample_min_size // 8
+    # elif args.vae_tiling:
+    else:
+        vae.enable_spatial_tiling(True)
+
+    return vae, vae_dtype
+
+
+def encode_to_latents(args, video, device):
+    vae, vae_dtype = prepare_vae(args, device)
+
+    video = video.to(device=device, dtype=vae_dtype)
+    video = video * 2 - 1  # 0, 1 -> -1, 1
+    with torch.no_grad():
+        latents = vae.encode(video).latent_dist.sample()
+
+    if hasattr(vae.config, "shift_factor") and vae.config.shift_factor:
+        latents = (latents - vae.config.shift_factor) * vae.config.scaling_factor
+    else:
+        latents = latents * vae.config.scaling_factor
+
+    return latents
+
+
+def decode_latents(args, latents, device):
+    vae, vae_dtype = prepare_vae(args, device)
+
+    expand_temporal_dim = False
+    if len(latents.shape) == 4:
+        latents = latents.unsqueeze(2)
+        expand_temporal_dim = True
+    elif len(latents.shape) == 5:
+        pass
+    else:
+        raise ValueError(f"Only support latents with shape (b, c, h, w) or (b, c, f, h, w), but got {latents.shape}.")
+
+    if hasattr(vae.config, "shift_factor") and vae.config.shift_factor:
+        latents = latents / vae.config.scaling_factor + vae.config.shift_factor
+    else:
+        latents = latents / vae.config.scaling_factor
+
+    latents = latents.to(device=device, dtype=vae_dtype)
+    with torch.no_grad():
+        image = vae.decode(latents, return_dict=False)[0]
+
+    if expand_temporal_dim:
+        image = image.squeeze(2)
+
+    image = (image / 2 + 0.5).clamp(0, 1)
+    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+    image = image.cpu().float()
+
+    return image
+
+
+def parse_args():
+    """Build the CLI parser, parse sys.argv and return the namespace; raises ValueError on conflicting options."""
+    parser = argparse.ArgumentParser(description="HunyuanVideo inference script")
+
+    parser.add_argument("--dit", type=str, required=True, help="DiT checkpoint path or directory")
+    parser.add_argument(
+        "--dit_in_channels",
+        type=int,
+        default=None,
+        help="input channels for DiT, default is None (automatically detect). 32 for SkyReels-I2V, 16 for others",
+    )
+    parser.add_argument("--vae", type=str, required=True, help="VAE checkpoint path or directory")
+    parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is float16")
+    parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory")
+    parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory")
+
+    # LoRA
+    parser.add_argument("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path")
+    # default must not be a bare float: with nargs="*" main() calls len() on this value; a missing entry means 1.0
+    parser.add_argument("--lora_multiplier", type=float, nargs="*", default=None, help="LoRA multiplier")
+    parser.add_argument(
+        "--save_merged_model",
+        type=str,
+        default=None,
+        help="Save merged model to path. If specified, no inference will be performed.",
+    )
+    parser.add_argument("--exclude_single_blocks", action="store_true", help="Exclude single blocks when loading LoRA weights")
+
+    # inference
+    parser.add_argument("--prompt", type=str, required=True, help="prompt for generation")
+    parser.add_argument("--negative_prompt", type=str, default=None, help="negative prompt for generation")
+    parser.add_argument("--video_size", type=int, nargs=2, default=[256, 256], help="video size")
+    parser.add_argument("--video_length", type=int, default=129, help="video length")
+    parser.add_argument("--fps", type=int, default=24, help="video fps")
+    parser.add_argument("--infer_steps", type=int, default=50, help="number of inference steps")
+    parser.add_argument("--save_path", type=str, required=True, help="path to save generated video")
+    parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=1.0,
+        help="Guidance scale for classifier free guidance. Default is 1.0 (means no guidance)",
+    )
+    parser.add_argument("--embedded_cfg_scale", type=float, default=6.0, help="Embeded classifier free guidance scale.")
+    parser.add_argument("--video_path", type=str, default=None, help="path to video for video2video inference")
+    parser.add_argument(
+        "--image_path", type=str, default=None, help="path to image for image2video inference, only works for SkyReels-I2V model"
+    )
+    parser.add_argument(
+        "--split_uncond",
+        action="store_true",
+        help="split unconditional call for classifier free guidance, slower but less memory usage",
+    )
+    parser.add_argument("--strength", type=float, default=0.8, help="strength for video2video inference")
+
+    # Flow Matching
+    parser.add_argument("--flow_shift", type=float, default=7.0, help="Shift factor for flow matching schedulers.")
+
+    parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model")
+    parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
+    parser.add_argument(
+        "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU"
+    )
+    parser.add_argument(
+        "--attn_mode", type=str, default="torch", choices=["flash", "torch", "sageattn", "xformers", "sdpa"], help="attention mode"
+    )
+    parser.add_argument(
+        "--split_attn", action="store_true", help="use split attention, default is False. if True, --split_uncond becomes True"
+    )
+    parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
+    parser.add_argument(
+        "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
+    )
+    parser.add_argument("--blocks_to_swap", type=int, default=None, help="number of blocks to swap in the model")
+    parser.add_argument("--img_in_txt_in_offloading", action="store_true", help="offload img_in and txt_in to cpu")
+    parser.add_argument(
+        "--output_type", type=str, default="video", choices=["video", "images", "latent", "both"], help="output type"
+    )
+    parser.add_argument("--no_metadata", action="store_true", help="do not save metadata")
+    parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. no inference")
+    parser.add_argument("--lycoris", action="store_true", help="use lycoris for inference")
+    parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arthimetic(RTX 4XXX+)")
+    parser.add_argument("--compile", action="store_true", help="Enable torch.compile")
+    parser.add_argument(
+        "--compile_args", nargs=4, metavar=("BACKEND", "MODE", "DYNAMIC", "FULLGRAPH"),
+        default=["inductor", "max-autotune-no-cudagraphs", "False", "False"],
+        help="Torch.compile settings"
+    )
+
+    args = parser.parse_args()
+
+    # raise instead of assert so the check survives `python -O`
+    if args.latent_path and args.output_type not in ("images", "video"):
+        raise ValueError("latent_path is only supported for images or video output")
+
+    if args.fp8_fast and not args.fp8:
+        raise ValueError("--fp8_fast requires --fp8")
+
+    return args
+
+
+def check_inputs(args):
+    """Return (height, width, video_length) from args; raises ValueError unless both sizes are multiples of 8."""
+    height, width = args.video_size[0], args.video_size[1]
+    video_length = args.video_length  # returned as-is, only the frame size is validated
+
+    if height % 8 or width % 8:
+        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+    return height, width, video_length
+
+
+def main():
+    """Run HunyuanVideo inference from the CLI arguments, or decode latents given via --latent_path.
+
+    Latents, a video and/or images (selected by --output_type) are written under --save_path.
+    """
+    args = parse_args()
+
+    device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device(device)
+    dit_dtype = torch.bfloat16
+    dit_weight_dtype = torch.float8_e4m3fn if args.fp8 else dit_dtype
+    logger.info(f"Using device: {device}, DiT precision: {dit_dtype}, weight precision: {dit_weight_dtype}")
+
+    original_base_names = None
+    if args.latent_path is not None and len(args.latent_path) > 0:
+        original_base_names = []
+        latents_list = []
+        seeds = []
+        for latent_path in args.latent_path:
+            original_base_names.append(os.path.splitext(os.path.basename(latent_path))[0])
+            seed = 0
+
+            if os.path.splitext(latent_path)[1] != ".safetensors":
+                latents = torch.load(latent_path, map_location="cpu")  # NOTE: torch.load unpickles; only use trusted files
+            else:
+                latents = load_file(latent_path)["latent"]
+                with safe_open(latent_path, framework="pt") as f:
+                    metadata = f.metadata()
+                if metadata is None:
+                    metadata = {}
+                logger.info(f"Loaded metadata: {metadata}")
+
+                if "seeds" in metadata:
+                    seed = int(metadata["seeds"])
+
+            seeds.append(seed)
+            latents_list.append(latents)
+
+            logger.info(f"Loaded latent from {latent_path}. Shape: {latents.shape}")
+        latents = torch.stack(latents_list, dim=0)
+    else:
+        # prepare accelerator
+        mixed_precision = "bf16" if dit_dtype == torch.bfloat16 else "fp16"
+        accelerator = accelerate.Accelerator(mixed_precision=mixed_precision)
+
+        # load prompt
+        prompt = args.prompt  # TODO load prompts from file
+        assert prompt is not None, "prompt is required"
+
+        # check inputs: may be height, width, video_length etc will be changed for each generation in future
+        height, width, video_length = check_inputs(args)
+
+        # encode prompt with LLM and Text Encoder
+        logger.info(f"Encoding prompt: {prompt}")
+
+        do_classifier_free_guidance = args.guidance_scale != 1.0
+        if do_classifier_free_guidance:
+            negative_prompt = args.negative_prompt
+            if negative_prompt is None:
+                logger.info("Negative prompt is not provided, using empty prompt")
+                negative_prompt = ""
+            logger.info(f"Encoding negative prompt: {negative_prompt}")
+            prompt = [negative_prompt, prompt]
+        else:
+            if args.negative_prompt is not None:
+                logger.warning("Negative prompt is provided but guidance_scale is 1.0, negative prompt will be ignored.")
+
+        prompt_embeds, prompt_mask, prompt_embeds_2, prompt_mask_2 = encode_input_prompt(
+            prompt, args, device, args.fp8_llm, accelerator
+        )
+
+        # encode latents for video2video inference
+        video_latents = None
+        if args.video_path is not None:
+            # v2v inference
+            logger.info(f"Video2Video inference: {args.video_path}")
+
+            if os.path.isfile(args.video_path):
+                video = load_video(args.video_path, 0, video_length, bucket_reso=(width, height))  # list of frames
+            else:
+                video = load_images(args.video_path, video_length, bucket_reso=(width, height))  # list of frames
+
+            if len(video) < video_length:
+                raise ValueError(f"Video length is less than {video_length}")
+            video = np.stack(video, axis=0)  # F, H, W, C
+            video = torch.from_numpy(video).permute(3, 0, 1, 2).unsqueeze(0).float()  # 1, C, F, H, W
+            video = video / 255.0
+
+            logger.info(f"Encoding video to latents")
+            video_latents = encode_to_latents(args, video, device)
+            video_latents = video_latents.to(device=device, dtype=dit_dtype)
+
+            clean_memory_on_device(device)
+
+        # encode latents for image2video inference
+        image_latents = None
+        if args.image_path is not None:
+            # i2v inference
+            logger.info(f"Image2Video inference: {args.image_path}")
+
+            image = Image.open(args.image_path).convert("RGB")  # RGBA / grayscale files would break the H, W, C permute below
+            image = resize_image_to_bucket(image, (width, height))  # returns a numpy array
+            image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).unsqueeze(2).float()  # 1, C, 1, H, W
+            image = image / 255.0
+
+            logger.info(f"Encoding image to latents")
+            image_latents = encode_to_latents(args, image, device)  # 1, C, 1, H, W
+            image_latents = image_latents.to(device=device, dtype=dit_dtype)
+
+            clean_memory_on_device(device)
+
+        # load DiT model
+        blocks_to_swap = args.blocks_to_swap if args.blocks_to_swap else 0
+        loading_device = "cpu"  # if blocks_to_swap > 0 else device
+
+        logger.info(f"Loading DiT model from {args.dit}")
+        if args.attn_mode == "sdpa":
+            args.attn_mode = "torch"
+
+        # if image_latents is given, the model should be I2V model, so the in_channels should be 32
+        dit_in_channels = args.dit_in_channels if args.dit_in_channels is not None else (32 if image_latents is not None else 16)
+
+        # if we use LoRA, weigths should be bf16 instead of fp8, because merging should be done in bf16
+        # the model is too large, so we load the model to cpu. in addition, the .pt file is loaded to cpu anyway
+        # on the fly merging will be a solution for this issue for .safetenors files (not implemented yet)
+        transformer = load_transformer(
+            args.dit, args.attn_mode, args.split_attn, loading_device, dit_dtype, in_channels=dit_in_channels
+        )
+        transformer.eval()
+
+        # load LoRA weights
+        if args.lora_weight is not None and len(args.lora_weight) > 0:
+            for i, lora_weight in enumerate(args.lora_weight):
+                if isinstance(args.lora_multiplier, (list, tuple)) and len(args.lora_multiplier) > i:  # may be a scalar default or None
+                    lora_multiplier = args.lora_multiplier[i]
+                else:
+                    lora_multiplier = 1.0
+
+                logger.info(f"Loading LoRA weights from {lora_weight} with multiplier {lora_multiplier}")
+                weights_sd = load_file(lora_weight)
+
+                # Filter to exclude keys that are part of single_blocks
+                if args.exclude_single_blocks:
+                    filtered_weights = {k: v for k, v in weights_sd.items() if "single_blocks" not in k}
+                    weights_sd = filtered_weights
+
+                if args.lycoris:
+                    lycoris_net, _ = create_network_from_weights(
+                        multiplier=lora_multiplier,
+                        file=None,
+                        weights_sd=weights_sd,
+                        unet=transformer,
+                        text_encoder=None,
+                        vae=None,
+                        for_inference=True,
+                    )
+                else:
+                    network = lora.create_arch_network_from_weights(
+                        lora_multiplier, weights_sd, unet=transformer, for_inference=True
+                    )
+                logger.info("Merging LoRA weights to DiT model")
+
+                # the LoRA is merged into the DiT weights (merge_to) rather than attached as a separate
+                # network, so the merged transformer can be saved with --save_merged_model below
+                if args.lycoris:
+                    lycoris_net.merge_to(None, transformer, weights_sd, dtype=None, device=device)
+                else:
+                    network.merge_to(None, transformer, weights_sd, device=device, non_blocking=True)
+
+                synchronize_device(device)
+
+                logger.info("LoRA weights loaded")
+
+            # save model here before casting to dit_weight_dtype
+            if args.save_merged_model:
+                logger.info(f"Saving merged model to {args.save_merged_model}")
+                mem_eff_save_file(transformer.state_dict(), args.save_merged_model)  # save_file needs a lot of memory
+                logger.info("Merged model saved")
+                return
+
+        logger.info(f"Casting model to {dit_weight_dtype}")
+        transformer.to(dtype=dit_weight_dtype)
+
+        if args.fp8_fast:
+            logger.info("Enabling FP8 acceleration")
+            params_to_keep = {"norm", "bias", "time_in", "vector_in", "guidance_in", "txt_in", "img_in"}
+            for name, param in transformer.named_parameters():
+                dtype_to_use = dit_dtype if any(keyword in name for keyword in params_to_keep) else dit_weight_dtype
+                param.data = param.data.to(dtype=dtype_to_use)  # Tensor.to is not in-place, the result must be assigned
+            convert_fp8_linear(transformer, dit_dtype, params_to_keep=params_to_keep)
+
+        if args.compile:
+            compile_backend, compile_mode, compile_dynamic, compile_fullgraph = args.compile_args
+            logger.info(
+                f"Torch Compiling[Backend: {compile_backend}; Mode: {compile_mode}; Dynamic: {compile_dynamic}; Fullgraph: {compile_fullgraph}]"
+            )
+            torch._dynamo.config.cache_size_limit = 32
+            for i, block in enumerate(transformer.single_blocks):
+                compiled_block = torch.compile(
+                    block, backend=compile_backend, mode=compile_mode,
+                    dynamic=compile_dynamic.lower() == "true",
+                    fullgraph=compile_fullgraph.lower() == "true"
+                )
+                transformer.single_blocks[i] = compiled_block
+            for i, block in enumerate(transformer.double_blocks):
+                compiled_block = torch.compile(
+                    block, backend=compile_backend, mode=compile_mode,
+                    dynamic=compile_dynamic.lower() == "true",
+                    fullgraph=compile_fullgraph.lower() == "true"
+                )
+                transformer.double_blocks[i] = compiled_block
+
+        if blocks_to_swap > 0:
+            logger.info(f"Enable swap {blocks_to_swap} blocks to CPU from device: {device}")
+            transformer.enable_block_swap(blocks_to_swap, device, supports_backward=False)
+            transformer.move_to_device_except_swap_blocks(device)
+            transformer.prepare_block_swap_before_forward()
+        else:
+            logger.info(f"Moving model to {device}")
+            transformer.to(device=device)
+        if args.img_in_txt_in_offloading:
+            logger.info("Enable offloading img_in and txt_in to CPU")
+            transformer.enable_img_in_txt_in_offloading()
+
+        # load scheduler
+        logger.info(f"Loading scheduler")
+        scheduler = FlowMatchDiscreteScheduler(shift=args.flow_shift, reverse=True, solver="euler")
+
+        # Prepare timesteps
+        num_inference_steps = args.infer_steps
+        scheduler.set_timesteps(num_inference_steps, device=device)  # n_tokens is not used in FlowMatchDiscreteScheduler
+        timesteps = scheduler.timesteps
+
+        # Prepare generator
+        num_videos_per_prompt = 1  # args.num_videos # currently only support 1 video per prompt, this is a batch size
+        seed = args.seed
+        if seed is None:
+            seeds = [random.randint(0, 2**32 - 1) for _ in range(num_videos_per_prompt)]
+        elif isinstance(seed, int):
+            seeds = [seed + i for i in range(num_videos_per_prompt)]
+        else:
+            raise ValueError(f"Seed must be an integer or None, got {seed}.")
+        generator = [torch.Generator(device).manual_seed(seed) for seed in seeds]
+
+        # Prepare noisy latents
+        num_channels_latents = 16  # transformer.config.in_channels
+        vae_scale_factor = 2 ** (4 - 1)  # len(self.vae.config.block_out_channels) == 4
+
+        vae_ver = vae.VAE_VER
+        if "884" in vae_ver:
+            latent_video_length = (video_length - 1) // 4 + 1
+        elif "888" in vae_ver:
+            latent_video_length = (video_length - 1) // 8 + 1
+        else:
+            latent_video_length = video_length
+
+        # shape = (
+        #     num_videos_per_prompt,
+        #     num_channels_latents,
+        #     latent_video_length,
+        #     height // vae_scale_factor,
+        #     width // vae_scale_factor,
+        # )
+        # latents = randn_tensor(shape, generator=generator, device=device, dtype=dit_dtype)
+
+        # make first N frames to be the same if the given seed is same
+        shape_of_frame = (num_videos_per_prompt, num_channels_latents, 1, height // vae_scale_factor, width // vae_scale_factor)
+        latents = []
+        for i in range(latent_video_length):
+            latents.append(randn_tensor(shape_of_frame, generator=generator, device=device, dtype=dit_dtype))
+        latents = torch.cat(latents, dim=2)
+
+        # pad image_latents to match the length of video_latents
+        if image_latents is not None:
+            zero_latents = torch.zeros_like(latents)
+            zero_latents[:, :, :1, :, :] = image_latents
+            image_latents = zero_latents
+
+        if args.video_path is not None:
+            # v2v inference
+            noise = latents
+            assert noise.shape == video_latents.shape, f"noise shape {noise.shape} != video_latents shape {video_latents.shape}"
+
+            num_inference_steps = int(num_inference_steps * args.strength)
+            if not 1 <= num_inference_steps <= len(timesteps):  # 0 would index timesteps[-0], i.e. the full schedule
+                raise ValueError(f"--strength {args.strength} selects {num_inference_steps} steps, expected 1 to {len(timesteps)}")
+            timestep_start = scheduler.timesteps[-num_inference_steps]  # larger strength, less inference steps and more start time
+            t = timestep_start / 1000.0
+            latents = noise * t + video_latents * (1 - t)
+
+            timesteps = timesteps[-num_inference_steps:]
+
+            logger.info(f"strength: {args.strength}, num_inference_steps: {num_inference_steps}, timestep_start: {timestep_start}")
+
+        # FlowMatchDiscreteScheduler does not have init_noise_sigma
+
+        # Denoising loop
+        embedded_guidance_scale = args.embedded_cfg_scale
+        if embedded_guidance_scale is not None:
+            guidance_expand = torch.tensor([embedded_guidance_scale * 1000.0] * latents.shape[0], dtype=torch.float32, device="cpu")
+            guidance_expand = guidance_expand.to(device=device, dtype=dit_dtype)
+            if do_classifier_free_guidance:
+                guidance_expand = torch.cat([guidance_expand, guidance_expand], dim=0)
+        else:
+            guidance_expand = None
+        freqs_cos, freqs_sin = get_rotary_pos_embed(vae_ver, transformer, video_length, height, width)
+        # n_tokens = freqs_cos.shape[0]
+
+        # move and cast all inputs to the correct device and dtype
+        prompt_embeds = prompt_embeds.to(device=device, dtype=dit_dtype)
+        prompt_mask = prompt_mask.to(device=device)
+        prompt_embeds_2 = prompt_embeds_2.to(device=device, dtype=dit_dtype)
+        prompt_mask_2 = prompt_mask_2.to(device=device)
+
+        freqs_cos = freqs_cos.to(device=device, dtype=dit_dtype)
+        freqs_sin = freqs_sin.to(device=device, dtype=dit_dtype)
+
+        num_warmup_steps = len(timesteps) - num_inference_steps * scheduler.order  # this should be 0 in v2v inference
+
+        # assert split_uncond and split_attn
+        if args.split_attn and do_classifier_free_guidance and not args.split_uncond:
+            logger.warning("split_attn is enabled, split_uncond will be enabled as well.")
+            args.split_uncond = True
+
+        # with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]) as p:
+        with tqdm(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                latents = scheduler.scale_model_input(latents, t)
+
+                # predict the noise residual
+                with torch.no_grad(), accelerator.autocast():
+                    latents_input = latents if not do_classifier_free_guidance else torch.cat([latents, latents], dim=0)
+                    if image_latents is not None:
+                        latents_image_input = (
+                            image_latents if not do_classifier_free_guidance else torch.cat([image_latents, image_latents], dim=0)
+                        )
+                        latents_input = torch.cat([latents_input, latents_image_input], dim=1)  # 1 or 2, C*2, F, H, W
+
+                    batch_size = 1 if args.split_uncond else latents_input.shape[0]
+
+                    noise_pred_list = []
+                    for j in range(0, latents_input.shape[0], batch_size):
+                        noise_pred = transformer(  # For an input image (129, 192, 336) (1, 256, 256)
+                            latents_input[j : j + batch_size],  # [1, 16, 33, 24, 42]
+                            t.repeat(batch_size).to(device=device, dtype=dit_dtype),  # [1]
+                            text_states=prompt_embeds[j : j + batch_size],  # [1, 256, 4096]
+                            text_mask=prompt_mask[j : j + batch_size],  # [1, 256]
+                            text_states_2=prompt_embeds_2[j : j + batch_size],  # [1, 768]
+                            freqs_cos=freqs_cos,  # [seqlen, head_dim]
+                            freqs_sin=freqs_sin,  # [seqlen, head_dim]
+                            guidance=guidance_expand[j : j + batch_size],  # [1]
+                            return_dict=True,
+                        )["x"]
+                        noise_pred_list.append(noise_pred)
+                    noise_pred = torch.cat(noise_pred_list, dim=0)
+
+                # perform classifier free guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + args.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+
+                    # # SkyReels' rescale noise config is omitted for now
+                    # if guidance_rescale > 0.0:
+                    #     # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    #     noise_pred = rescale_noise_cfg(
+                    #         noise_pred,
+                    #         noise_pred_cond,
+                    #         guidance_rescale=self.guidance_rescale,
+                    #     )
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                # update progress bar
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % scheduler.order == 0):
+                    progress_bar.update()
+
+        # print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1))
+        # print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
+
+        latents = latents.detach().cpu()
+        transformer = None
+        clean_memory_on_device(device)
+
+    # Save samples
+    output_type = args.output_type
+    save_path = args.save_path  # if args.save_path_suffix == "" else f"{args.save_path}_{args.save_path_suffix}"
+    os.makedirs(save_path, exist_ok=True)
+    time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
+
+    if output_type == "latent" or output_type == "both":
+        # save latent
+        for i, latent in enumerate(latents):
+            latent_path = f"{save_path}/{time_flag}_{i}_{seeds[i]}_latent.safetensors"
+
+            if args.no_metadata:
+                metadata = None
+            else:
+                metadata = {
+                    "seeds": f"{seeds[i]}",
+                    "prompt": f"{args.prompt}",
+                    "height": f"{height}",
+                    "width": f"{width}",
+                    "video_length": f"{video_length}",
+                    "infer_steps": f"{num_inference_steps}",
+                    "guidance_scale": f"{args.guidance_scale}",
+                    "embedded_cfg_scale": f"{args.embedded_cfg_scale}",
+                }
+                if args.negative_prompt is not None:
+                    metadata["negative_prompt"] = f"{args.negative_prompt}"
+            sd = {"latent": latent}
+            save_file(sd, latent_path, metadata=metadata)
+
+            logger.info(f"Latent save to: {latent_path}")
+    if output_type == "video" or output_type == "both":
+        # save video
+        videos = decode_latents(args, latents, device)
+        for i, sample in enumerate(videos):
+            original_name = "" if original_base_names is None else f"_{original_base_names[i]}"
+            sample = sample.unsqueeze(0)
+            video_path = f"{save_path}/{time_flag}_{i}_{seeds[i]}{original_name}.mp4"
+            save_videos_grid(sample, video_path, fps=args.fps)
+            logger.info(f"Sample save to: {video_path}")
+    elif output_type == "images":
+        # save images
+        videos = decode_latents(args, latents, device)
+        for i, sample in enumerate(videos):
+            original_name = "" if original_base_names is None else f"_{original_base_names[i]}"
+            sample = sample.unsqueeze(0)
+            image_name = f"{time_flag}_{i}_{seeds[i]}{original_name}"
+            save_images_grid(sample, save_path, image_name)
+            logger.info(f"Sample images save to: {save_path}/{image_name}")
+
+    logger.info("Done!")
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not when imported
+    main()
diff --git a/hv_train.py b/hv_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..501f1648b3bd0df086f0ffc395ea58b146adc8ea
--- /dev/null
+++ b/hv_train.py
@@ -0,0 +1,1721 @@
+import ast
+import asyncio
+from datetime import timedelta
+import gc
+import importlib
+import argparse
+import math
+import os
+import pathlib
+import re
+import sys
+import random
+import time
+import json
+from multiprocessing import Value
+from typing import Any, Dict, List, Optional
+import accelerate
+import numpy as np
+from packaging.version import Version
+
+import huggingface_hub
+import toml
+
+import torch
+from tqdm import tqdm
+from accelerate.utils import set_seed
+from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs
+from safetensors.torch import load_file, save_file
+import transformers
+from diffusers.optimization import (
+    SchedulerType as DiffusersSchedulerType,
+    TYPE_TO_SCHEDULER_FUNCTION as DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION,
+)
+from transformers.optimization import SchedulerType, TYPE_TO_SCHEDULER_FUNCTION
+
+from dataset import config_utils
+from hunyuan_model.models import load_transformer, get_rotary_pos_embed_by_shape
+import hunyuan_model.text_encoder as text_encoder_module
+from hunyuan_model.vae import load_vae
+import hunyuan_model.vae as vae_module
+from modules.scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
+import networks.lora as lora_module
+from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
+from dataset.image_video_dataset import ARCHITECTURE_HUNYUAN_VIDEO
+
+import logging
+
+from utils import huggingface_utils, model_utils, train_utils, sai_model_spec
+
+# Module-level logger; logging.basicConfig is called at import time with level INFO.
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+# Version identifier string for the HunyuanVideo base model.
+BASE_MODEL_VERSION_HUNYUAN_VIDEO = "hunyuan_video"
+
+# TODO make separate file for some functions to commonize with other scripts
+
+
+def clean_memory_on_device(device: torch.device):
+    r"""
+    Run the Python garbage collector, then empty the allocator cache of the backend ``device`` belongs to.
+
+    Only the ``cuda``, ``xpu`` and ``mps`` device types trigger a cache flush; for any other device type
+    only ``gc.collect()`` runs. Called from the training scripts.
+    """
+    gc.collect()
+
+    # compare on device.type so that both "cuda" and "cuda:0" select the CUDA branch
+    backend = device.type
+    if backend == "cuda":
+        torch.cuda.empty_cache()
+    elif backend == "xpu":
+        torch.xpu.empty_cache()
+    elif backend == "mps":
+        torch.mps.empty_cache()
+
+
+# collate_fn helper: `epoch` and `step` are multiprocessing.Value objects
+class collator_class:
+    """Callable collate_fn: copies the shared epoch/step counters into the dataset and unwraps the batch."""
+
+    def __init__(self, epoch, step, dataset):
+        self.current_epoch = epoch
+        self.current_step = step
+        # only consulted when no DataLoader worker info is available (collation in the main process)
+        self.dataset = dataset
+
+    def __call__(self, examples):
+        # get_worker_info() returns None in the main process; in a worker it exposes that worker's dataset
+        info = torch.utils.data.get_worker_info()
+        target = self.dataset if info is None else info.dataset
+
+        # push the current epoch and step values into the dataset
+        target.set_current_epoch(self.current_epoch.value)
+        target.set_current_step(self.current_step.value)
+        return examples[0]
+
+
+def prepare_accelerator(args: argparse.Namespace) -> Accelerator:
+    """
+    Build the ``Accelerator`` used for training from the parsed command line arguments.
+
+    Resolves the timestamped logging directory and the tracker name (``log_with``), assembles the optional
+    process-group / DDP kwargs handlers and returns the constructed accelerator.
+    DeepSpeed is not supported in this script currently.
+
+    Raises:
+        ValueError: tensorboard logging was requested without ``logging_dir``.
+        ImportError: wandb logging was requested but wandb cannot be imported.
+    """
+    logging_dir = None
+    if args.logging_dir is not None:
+        prefix = args.log_prefix if args.log_prefix is not None else ""
+        logging_dir = args.logging_dir + "/" + prefix + time.strftime("%Y%m%d%H%M%S", time.localtime())
+
+    log_with = args.log_with
+    if log_with is None:
+        # no tracker requested: use tensorboard only when a logging directory was given
+        log_with = "tensorboard" if logging_dir is not None else None
+    else:
+        if log_with in ["tensorboard", "all"] and logging_dir is None:
+            raise ValueError(
+                "logging_dir is required when log_with is tensorboard / Tensorboardを使う場合、logging_dirを指定してください"
+            )
+        if log_with in ["wandb", "all"]:
+            try:
+                import wandb
+            except ImportError:
+                raise ImportError("No wandb / wandb がインストールされていないようです")
+            if logging_dir is not None:
+                os.makedirs(logging_dir, exist_ok=True)
+                os.environ["WANDB_DIR"] = logging_dir
+            if args.wandb_api_key is not None:
+                wandb.login(key=args.wandb_api_key)
+
+    kwargs_handlers = []
+
+    # process group settings are only passed when more than one CUDA device is visible
+    if torch.cuda.device_count() > 1:
+        backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl"
+        init_method = (
+            "env://?use_libuv=False" if os.name == "nt" and Version(torch.__version__) >= Version("2.4.0") else None
+        )
+        timeout = timedelta(minutes=args.ddp_timeout) if args.ddp_timeout else None
+        kwargs_handlers.append(InitProcessGroupKwargs(backend=backend, init_method=init_method, timeout=timeout))
+
+    # DDP kwargs are only passed when at least one of the two DDP flags is set
+    if args.ddp_gradient_as_bucket_view or args.ddp_static_graph:
+        kwargs_handlers.append(
+            DistributedDataParallelKwargs(
+                gradient_as_bucket_view=args.ddp_gradient_as_bucket_view, static_graph=args.ddp_static_graph
+            )
+        )
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=log_with,
+        project_dir=logging_dir,
+        kwargs_handlers=kwargs_handlers,
+    )
+    print("accelerator device:", accelerator.device)
+    return accelerator
+
+
+def line_to_prompt_dict(line: str) -> dict:
+    """
+    Parse one sample-prompt line such as ``"a cat --w 512 --h 512 --f 25 --d 42 --s 20"``.
+
+    The text before the first ``" --"`` is the prompt. Each following segment may be one of the integer
+    options below (the option letter is case-insensitive); unrecognized segments are ignored.
+
+        w -> "width", h -> "height", f -> "frame_count", d -> "seed", s -> "sample_steps" (clamped to 1..1000)
+
+    Only the segments after the prompt are scanned for options, so a prompt whose text happens to start
+    with something like ``"s 20"`` is never misread as an option.
+
+    Returns:
+        dict with the key "prompt" plus one key per recognized option.
+    """
+    # subset of gen_img_diffusers
+    prompt_text, *option_args = line.split(" --")
+    prompt_dict = {"prompt": prompt_text}
+
+    # (option letter, key in the returned dict); guidance scale ("l") and negative prompt ("n") are not parsed here
+    int_options = (
+        ("w", "width"),
+        ("h", "height"),
+        ("f", "frame_count"),
+        ("d", "seed"),
+        ("s", "sample_steps"),
+    )
+
+    for parg in option_args:
+        try:
+            for letter, key in int_options:
+                m = re.match(rf"{letter} (\d+)", parg, re.IGNORECASE)
+                if not m:
+                    continue
+                value = int(m.group(1))
+                if key == "sample_steps":
+                    value = max(1, min(1000, value))
+                prompt_dict[key] = value
+                break
+
+        except ValueError as ex:
+            logger.error(f"Exception in parsing / 解析エラー: {parg}")
+            logger.error(ex)
+
+    return prompt_dict
+
+
+def load_prompts(prompt_file: str) -> list[Dict]:
+    """
+    Load sample prompts from a ``.txt``, ``.toml`` or ``.json`` file and normalize them to dicts.
+
+    * ``.txt``: one prompt per line; blank lines and lines whose first character is ``#`` are skipped.
+      Each line is parsed with ``line_to_prompt_dict``.
+    * ``.toml``: the ``[prompt]`` table holds shared settings and ``prompt.subset`` a list of tables; every
+      subset is merged over the shared settings (subset values win on duplicate keys).
+    * ``.json``: the parsed JSON value is used as the prompt list (entries may be strings or dicts).
+
+    Every resulting dict gets an ``"enum"`` key with its position in the list, and any ``"subset"`` key is removed.
+
+    Raises:
+        ValueError: the file extension is not one of the supported ones.
+    """
+    # read prompts
+    if prompt_file.endswith(".txt"):
+        with open(prompt_file, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+        prompts = [line.strip() for line in lines if len(line.strip()) > 0 and line[0] != "#"]
+    elif prompt_file.endswith(".toml"):
+        with open(prompt_file, "r", encoding="utf-8") as f:
+            data = toml.load(f)
+        # dict-literal merge instead of dict(**a, **b): the latter raises TypeError when a subset repeats a shared key
+        prompts = [{**data["prompt"], **subset} for subset in data["prompt"]["subset"]]
+    elif prompt_file.endswith(".json"):
+        with open(prompt_file, "r", encoding="utf-8") as f:
+            prompts = json.load(f)
+    else:
+        raise ValueError(f"unsupported prompt file (expected .txt, .toml or .json): {prompt_file}")
+
+    # preprocess prompts
+    for i, prompt_dict in enumerate(prompts):
+        if isinstance(prompt_dict, str):
+            prompt_dict = line_to_prompt_dict(prompt_dict)
+            prompts[i] = prompt_dict
+        assert isinstance(prompt_dict, dict)
+
+        # "enum" records the prompt position (used later to name image files); drop the leftover "subset" list
+        prompt_dict["enum"] = i
+        prompt_dict.pop("subset", None)
+
+    return prompts
+
+
+def compute_density_for_timestep_sampling(
+    weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
+):
+    """Draw ``batch_size`` density samples ``u`` (a CPU tensor) used to pick timesteps in SD3-style training.
+
+    ``"logit_normal"`` applies a sigmoid to normal samples (``logit_mean`` / ``logit_std``), ``"mode"`` reshapes
+    uniform samples with ``mode_scale``, and any other scheme returns plain uniform samples.
+
+    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
+
+    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
+    """
+    shape = (batch_size,)
+
+    if weighting_scheme == "logit_normal":
+        # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
+        normal_samples = torch.normal(mean=logit_mean, std=logit_std, size=shape, device="cpu")
+        return torch.sigmoid(normal_samples)
+
+    u = torch.rand(size=shape, device="cpu")
+    if weighting_scheme == "mode":
+        u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
+    return u
+
+
+def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32):
+    """Look up the scheduler sigma for each timestep and append singleton dims until the result has ``n_dim`` dims.
+
+    If at least one timestep is not present in ``noise_scheduler.timesteps``, a warning is logged and every
+    timestep is mapped to the index of its nearest schedule entry; otherwise exact matches are used.
+    """
+    sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
+    schedule_timesteps = noise_scheduler.timesteps.to(device)
+    timesteps = timesteps.to(device)
+
+    has_unknown_timestep = any((schedule_timesteps == t).sum() == 0 for t in timesteps)
+    if has_unknown_timestep:
+        # round to nearest timestep instead of failing
+        logger.warning("Some timesteps are not in the schedule / 一部のtimestepsがスケジュールに含まれていません")
+        step_indices = [torch.argmin(torch.abs(schedule_timesteps - t)).item() for t in timesteps]
+    else:
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+    sigma = sigmas[step_indices].flatten()
+    for _ in range(n_dim - sigma.ndim):
+        sigma = sigma.unsqueeze(-1)
+    return sigma
+
+
+def compute_loss_weighting_for_sd3(weighting_scheme: str, noise_scheduler, timesteps, device, dtype):
+    """Return the per-timestep loss weighting tensor for SD3 training, or None for schemes without weighting.
+
+    ``"sigma_sqrt"`` yields ``sigma ** -2`` (as float32) and ``"cosmap"`` yields ``2 / (pi * (1 - 2*sigma + 2*sigma**2))``;
+    sigmas are fetched via ``get_sigmas`` with ``n_dim=5``. Every other scheme returns None.
+
+    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
+
+    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
+    """
+    if weighting_scheme not in ("sigma_sqrt", "cosmap"):
+        return None
+
+    sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=5, dtype=dtype)
+    if weighting_scheme == "sigma_sqrt":
+        return (sigmas**-2.0).float()
+
+    # cosmap
+    bot = 1 - 2 * sigmas + 2 * sigmas**2
+    return 2 / (math.pi * bot)
+
+
+class FineTuningTrainer:
+    def __init__(self):
+        """Create the trainer; no instance state is set up here."""
+
+    def process_sample_prompts(
+        self,
+        args: argparse.Namespace,
+        accelerator: Accelerator,
+        sample_prompts: str,
+        text_encoder1: str,
+        text_encoder2: str,
+        fp8_llm: bool,
+    ):
+        """
+        Pre-compute the text encoder outputs for every sample prompt.
+
+        Loads the prompts from ``sample_prompts``, encodes each distinct prompt text with text encoder 1 and then
+        with text encoder 2 (each encoder is loaded, used and released before the next step), and returns one dict
+        per prompt: a copy of the prompt dict extended with ``llm_embeds``, ``llm_mask``, ``clipL_embeds`` and
+        ``clipL_mask``.
+        """
+        logger.info(f"cache Text Encoder outputs for sample prompt: {sample_prompts}")
+        prompts = load_prompts(sample_prompts)
+
+        def encode_all_prompts(encoder):
+            # prompt text -> (hidden_state, attention_mask); each distinct prompt text is encoded only once
+            cache = {}
+            with accelerator.autocast(), torch.no_grad():
+                for entry in prompts:
+                    p = entry.get("prompt", "")
+                    if p in cache:
+                        continue
+                    logger.info(f"cache Text Encoder outputs for prompt: {p}")
+
+                    tokens = encoder.text2tokens(p, data_type="video")
+                    encoded = encoder.encode(tokens, data_type="video")
+                    cache[p] = (encoded.hidden_state, encoded.attention_mask)
+            return cache
+
+        # float16 unless a text encoder dtype is given explicitly
+        if args.text_encoder_dtype is None:
+            text_encoder_dtype = torch.float16
+        else:
+            text_encoder_dtype = model_utils.str_to_dtype(args.text_encoder_dtype)
+
+        # Text Encoder 1: load, encode, release
+        logger.info(f"loading text encoder 1: {text_encoder1}")
+        text_encoder_1 = text_encoder_module.load_text_encoder_1(text_encoder1, accelerator.device, fp8_llm, text_encoder_dtype)
+
+        logger.info("encoding with Text Encoder 1")
+        te_outputs_1 = encode_all_prompts(text_encoder_1)
+        del text_encoder_1
+
+        # Text Encoder 2: load, encode, release
+        logger.info(f"loading text encoder 2: {text_encoder2}")
+        text_encoder_2 = text_encoder_module.load_text_encoder_2(text_encoder2, accelerator.device, text_encoder_dtype)
+
+        logger.info("encoding with Text Encoder 2")
+        te_outputs_2 = encode_all_prompts(text_encoder_2)
+        del text_encoder_2
+
+        # attach the cached embeddings and masks to a copy of each prompt dict
+        sample_parameters = []
+        for entry in prompts:
+            p = entry.get("prompt", "")
+            llm_embeds, llm_mask = te_outputs_1[p][0], te_outputs_1[p][1]
+            clip_embeds, clip_mask = te_outputs_2[p][0], te_outputs_2[p][1]
+
+            enriched = entry.copy()
+            enriched["llm_embeds"] = llm_embeds
+            enriched["llm_mask"] = llm_mask
+            enriched["clipL_embeds"] = clip_embeds
+            enriched["clipL_mask"] = clip_mask
+            sample_parameters.append(enriched)
+
+        clean_memory_on_device(accelerator.device)
+
+        return sample_parameters
+
+    def get_optimizer(
+        self, args, trainable_params: list[torch.nn.Parameter]
+    ) -> tuple[str, str, torch.optim.Optimizer, Any, Any]:
+        """
+        Create the optimizer selected by ``args.optimizer_type``.
+
+        Built-in choices (case-insensitive) are ``AdamW``, ``AdamW8bit`` (bitsandbytes) and ``Adafactor``; any other
+        name is looked up in ``torch.optim`` or, when it contains a dot, imported as ``package.module.ClassName``.
+        ``args.optimizer_args`` is a list of ``key=value`` strings; each value is parsed with ``ast.literal_eval``.
+
+        Note: for Adafactor with ``relative_step`` enabled, this method sets ``args.learning_rate`` to None and
+        ``args.lr_scheduler`` to ``"adafactor:<lr>"``.
+
+        Returns:
+            ``(optimizer_name, optimizer_args, optimizer, train_fn, eval_fn)``. The first two are strings for
+            logging. ``train_fn`` / ``eval_fn`` are the optimizer's own ``train`` / ``eval`` methods when it
+            has a callable ``train`` attribute, otherwise no-op callables.
+
+        Raises:
+            ImportError: an optimizer type ending in "8bit" was requested but bitsandbytes cannot be imported.
+        """
+        # adamw, adamw8bit, adafactor
+
+        optimizer_type = args.optimizer_type.lower()
+
+        # parse "key=value" strings; split on the first "=" only, so a value may itself contain "="
+        optimizer_kwargs = {}
+        if args.optimizer_args is not None and len(args.optimizer_args) > 0:
+            for arg in args.optimizer_args:
+                key, value = arg.split("=", 1)
+                optimizer_kwargs[key] = ast.literal_eval(value)
+
+        lr = args.learning_rate
+        optimizer = None
+        optimizer_class = None
+
+        if optimizer_type.endswith("8bit"):
+            try:
+                import bitsandbytes as bnb
+            except ImportError as e:
+                raise ImportError("No bitsandbytes / bitsandbytesがインストールされていないようです") from e
+
+            # other *8bit names fall through to the generic lookup below
+            if optimizer_type == "adamw8bit":
+                logger.info(f"use 8-bit AdamW optimizer | {optimizer_kwargs}")
+                optimizer_class = bnb.optim.AdamW8bit
+                optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
+
+        elif optimizer_type == "adafactor":
+            # Adafactor: check relative_step and warmup_init
+            if "relative_step" not in optimizer_kwargs:
+                optimizer_kwargs["relative_step"] = True  # default
+            if not optimizer_kwargs["relative_step"] and optimizer_kwargs.get("warmup_init", False):
+                logger.info(
+                    "set relative_step to True because warmup_init is True / warmup_initがTrueのためrelative_stepをTrueにします"
+                )
+                optimizer_kwargs["relative_step"] = True
+            logger.info(f"use Adafactor optimizer | {optimizer_kwargs}")
+
+            if optimizer_kwargs["relative_step"]:
+                logger.info("relative_step is true / relative_stepがtrueです")
+                if lr != 0.0:
+                    logger.warning("learning rate is used as initial_lr / 指定したlearning rateはinitial_lrとして使用されます")
+                args.learning_rate = None
+
+                if args.lr_scheduler != "adafactor":
+                    logger.info("use adafactor_scheduler / スケジューラにadafactor_schedulerを使用します")
+                # the initial lr is smuggled to get_scheduler through the scheduler name (a bit of a hack)
+                args.lr_scheduler = f"adafactor:{lr}"
+
+                lr = None
+            else:
+                if args.max_grad_norm != 0.0:
+                    logger.warning(
+                        "because max_grad_norm is set, clip_grad_norm is enabled. consider set to 0 / max_grad_normが設定されているためclip_grad_normが有効になります。0に設定して無効にしたほうがいいかもしれません"
+                    )
+                if args.lr_scheduler != "constant_with_warmup":
+                    logger.warning("constant_with_warmup will be good / スケジューラはconstant_with_warmupが良いかもしれません")
+                if optimizer_kwargs.get("clip_threshold", 1.0) != 1.0:
+                    logger.warning("clip_threshold=1.0 will be good / clip_thresholdは1.0が良いかもしれません")
+
+            optimizer_class = transformers.optimization.Adafactor
+            optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
+
+        elif optimizer_type == "adamw":
+            logger.info(f"use AdamW optimizer | {optimizer_kwargs}")
+            optimizer_class = torch.optim.AdamW
+            optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
+
+        if optimizer is None:
+            # use an arbitrary optimizer class named by the user
+            case_sensitive_optimizer_type = args.optimizer_type  # not lower
+            logger.info(f"use {case_sensitive_optimizer_type} | {optimizer_kwargs}")
+
+            if "." not in case_sensitive_optimizer_type:  # from torch.optim
+                optimizer_module = torch.optim
+            else:  # from other library
+                values = case_sensitive_optimizer_type.split(".")
+                optimizer_module = importlib.import_module(".".join(values[:-1]))
+                case_sensitive_optimizer_type = values[-1]
+
+            optimizer_class = getattr(optimizer_module, case_sensitive_optimizer_type)
+            optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
+
+        # for logging
+        optimizer_name = optimizer_class.__module__ + "." + optimizer_class.__name__
+        optimizer_args = ",".join([f"{k}={v}" for k, v in optimizer_kwargs.items()])
+
+        # get train and eval functions
+        if hasattr(optimizer, "train") and callable(optimizer.train):
+            train_fn = optimizer.train
+            eval_fn = optimizer.eval
+        else:
+            train_fn = lambda: None
+            eval_fn = lambda: None
+
+        return optimizer_name, optimizer_args, optimizer, train_fn, eval_fn
+
+    def is_schedulefree_optimizer(self, optimizer: torch.optim.Optimizer, args: argparse.Namespace) -> bool:
+        """Return True when ``args.optimizer_type`` ends with "schedulefree" (case-insensitive); ``optimizer`` is not inspected."""
+        type_name = args.optimizer_type.lower()
+        return type_name.endswith("schedulefree")
+
+    @staticmethod
+    def get_dummy_scheduler(optimizer: torch.optim.Optimizer) -> Any:
+        """
+        Return a minimal stand-in scheduler for schedule-free optimizers.
+
+        The returned object supports only a no-op ``step()``, ``get_last_lr()`` (the current ``lr`` of every param
+        group) and the ``optimizer`` attribute; it is meant for logging only.
+
+        Declared as a static method because the function takes no ``self``: without the decorator a call through
+        an instance (``self.get_dummy_scheduler(optimizer)``) would pass two positional arguments and raise TypeError.
+        """
+
+        # Not a subclass of torch.optim.lr_scheduler._LRScheduler.
+        # NOTE(review): the original comment says accelerator therefore does not wrap it; confirm against accelerate.
+        class DummyScheduler:
+            def __init__(self, optimizer: torch.optim.Optimizer):
+                self.optimizer = optimizer
+
+            def step(self):
+                pass
+
+            def get_last_lr(self):
+                return [group["lr"] for group in self.optimizer.param_groups]
+
+        return DummyScheduler(optimizer)
+
+    def get_scheduler(self, args, optimizer: torch.optim.Optimizer, num_processes: int):
+        """
+        Unified API to get any scheduler from its name.
+
+        Resolution order:
+          1. schedule-free optimizer -> dummy scheduler (logging only);
+          2. ``args.lr_scheduler_type`` set -> that class, from ``torch.optim.lr_scheduler`` or from a dotted
+             ``package.module.ClassName`` path, built with ``args.lr_scheduler_args``;
+          3. ``args.lr_scheduler`` starting with "adafactor" -> ``AdafactorSchedule`` (initial lr after the colon);
+          4. diffusers' piecewise-constant schedule;
+          5. one of the ``transformers`` ``SchedulerType`` schedules.
+
+        ``args.lr_warmup_steps`` / ``args.lr_decay_steps`` given as floats are treated as a fraction of the total
+        number of training steps (``args.max_train_steps * num_processes``).
+
+        Raises:
+            ValueError: a required step count is missing, or warmup steps were given to a scheduler that takes none.
+        """
+        # if schedulefree optimizer, return dummy scheduler.
+        # get_dummy_scheduler takes only the optimizer, so it is looked up on the class rather than bound to self.
+        if self.is_schedulefree_optimizer(optimizer, args):
+            return type(self).get_dummy_scheduler(optimizer)
+
+        name = args.lr_scheduler
+        num_training_steps = args.max_train_steps * num_processes  # * args.gradient_accumulation_steps
+        num_warmup_steps: Optional[int] = (
+            int(args.lr_warmup_steps * num_training_steps) if isinstance(args.lr_warmup_steps, float) else args.lr_warmup_steps
+        )
+        num_decay_steps: Optional[int] = (
+            int(args.lr_decay_steps * num_training_steps) if isinstance(args.lr_decay_steps, float) else args.lr_decay_steps
+        )
+        num_cycles = args.lr_scheduler_num_cycles
+        power = args.lr_scheduler_power
+        timescale = args.lr_scheduler_timescale
+        min_lr_ratio = args.lr_scheduler_min_lr_ratio
+
+        # custom lr_scheduler kwargs; split on the first "=" only, so a value may itself contain "="
+        lr_scheduler_kwargs = {}
+        if args.lr_scheduler_args is not None and len(args.lr_scheduler_args) > 0:
+            for arg in args.lr_scheduler_args:
+                key, value = arg.split("=", 1)
+                lr_scheduler_kwargs[key] = ast.literal_eval(value)
+
+        def wrap_check_needless_num_warmup_steps(return_vals):
+            if num_warmup_steps is not None and num_warmup_steps != 0:
+                raise ValueError(f"{name} does not require `num_warmup_steps`. Set None or 0.")
+            return return_vals
+
+        # using any lr_scheduler from other library
+        if args.lr_scheduler_type:
+            lr_scheduler_type = args.lr_scheduler_type
+            logger.info(f"use {lr_scheduler_type} | {lr_scheduler_kwargs} as lr_scheduler")
+            if "." not in lr_scheduler_type:  # default to use torch.optim
+                lr_scheduler_module = torch.optim.lr_scheduler
+            else:
+                values = lr_scheduler_type.split(".")
+                lr_scheduler_module = importlib.import_module(".".join(values[:-1]))
+                lr_scheduler_type = values[-1]
+            lr_scheduler_class = getattr(lr_scheduler_module, lr_scheduler_type)
+            lr_scheduler = lr_scheduler_class(optimizer, **lr_scheduler_kwargs)
+            return lr_scheduler
+
+        if name.startswith("adafactor"):
+            assert (
+                type(optimizer) == transformers.optimization.Adafactor
+            ), f"adafactor scheduler must be used with Adafactor optimizer / adafactor schedulerはAdafactorオプティマイザと同時に使ってください"
+            initial_lr = float(name.split(":")[1])
+            # logger.info(f"adafactor scheduler init lr {initial_lr}")
+            return wrap_check_needless_num_warmup_steps(transformers.optimization.AdafactorSchedule(optimizer, initial_lr))
+
+        if name == DiffusersSchedulerType.PIECEWISE_CONSTANT.value:
+            name = DiffusersSchedulerType(name)
+            schedule_func = DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION[name]
+            return schedule_func(optimizer, **lr_scheduler_kwargs)  # step_rules and last_epoch are given as kwargs
+
+        name = SchedulerType(name)
+        schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
+
+        if name == SchedulerType.CONSTANT:
+            return wrap_check_needless_num_warmup_steps(schedule_func(optimizer, **lr_scheduler_kwargs))
+
+        # All other schedulers require `num_warmup_steps`
+        if num_warmup_steps is None:
+            raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
+
+        if name == SchedulerType.CONSTANT_WITH_WARMUP:
+            return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **lr_scheduler_kwargs)
+
+        if name == SchedulerType.INVERSE_SQRT:
+            return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, timescale=timescale, **lr_scheduler_kwargs)
+
+        # All other schedulers require `num_training_steps`
+        if num_training_steps is None:
+            raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
+
+        if name == SchedulerType.COSINE_WITH_RESTARTS:
+            return schedule_func(
+                optimizer,
+                num_warmup_steps=num_warmup_steps,
+                num_training_steps=num_training_steps,
+                num_cycles=num_cycles,
+                **lr_scheduler_kwargs,
+            )
+
+        if name == SchedulerType.POLYNOMIAL:
+            return schedule_func(
+                optimizer,
+                num_warmup_steps=num_warmup_steps,
+                num_training_steps=num_training_steps,
+                power=power,
+                **lr_scheduler_kwargs,
+            )
+
+        if name == SchedulerType.COSINE_WITH_MIN_LR:
+            return schedule_func(
+                optimizer,
+                num_warmup_steps=num_warmup_steps,
+                num_training_steps=num_training_steps,
+                num_cycles=num_cycles / 2,
+                min_lr_rate=min_lr_ratio,
+                **lr_scheduler_kwargs,
+            )
+
+        # these schedulers do not require `num_decay_steps`
+        if name == SchedulerType.LINEAR or name == SchedulerType.COSINE:
+            return schedule_func(
+                optimizer,
+                num_warmup_steps=num_warmup_steps,
+                num_training_steps=num_training_steps,
+                **lr_scheduler_kwargs,
+            )
+
+        # All other schedulers require `num_decay_steps`
+        if num_decay_steps is None:
+            raise ValueError(f"{name} requires `num_decay_steps`, please provide that argument.")
+        if name == SchedulerType.WARMUP_STABLE_DECAY:
+            # computed only here, after both step counts are known to be set, so that a missing warmup/decay
+            # value produces the ValueError above instead of a TypeError from the subtraction
+            num_stable_steps = num_training_steps - num_warmup_steps - num_decay_steps
+            return schedule_func(
+                optimizer,
+                num_warmup_steps=num_warmup_steps,
+                num_stable_steps=num_stable_steps,
+                num_decay_steps=num_decay_steps,
+                num_cycles=num_cycles / 2,
+                min_lr_ratio=min_lr_ratio if min_lr_ratio is not None else 0.0,
+                **lr_scheduler_kwargs,
+            )
+
+        return schedule_func(
+            optimizer,
+            num_warmup_steps=num_warmup_steps,
+            num_training_steps=num_training_steps,
+            num_decay_steps=num_decay_steps,
+            **lr_scheduler_kwargs,
+        )
+
+    def resume_from_local_or_hf_if_specified(self, accelerator: Accelerator, args: argparse.Namespace) -> bool:
+        """
+        Restore the accelerator state from ``args.resume`` when resuming was requested.
+
+        With ``args.resume_from_huggingface`` unset, ``args.resume`` is a local state directory. Otherwise it has the
+        form ``<user>/<repo>/<path_in_repo>[:<revision>[:<repo_type>]]``; every file listed under that path is
+        downloaded (in worker threads) and the state is loaded from the directory of the first downloaded file.
+
+        Returns:
+            False when ``args.resume`` is not set, True after a state has been loaded.
+
+        Raises:
+            ValueError: the Hugging Face location is malformed, or no files were found at it.
+        """
+        if not args.resume:
+            return False
+
+        if not args.resume_from_huggingface:
+            logger.info(f"resume training from local state: {args.resume}")
+            accelerator.load_state(args.resume)
+            return True
+
+        logger.info(f"resume training from huggingface state: {args.resume}")
+        resume_parts = args.resume.split("/")
+        if len(resume_parts) < 2:
+            raise ValueError(f"resume must look like '<user>/<repo>/<path>' when resuming from huggingface: {args.resume}")
+        repo_id = resume_parts[0] + "/" + resume_parts[1]
+        path_in_repo = "/".join(resume_parts[2:])
+        revision = None
+        repo_type = None
+        if ":" in path_in_repo:
+            divided = path_in_repo.split(":")
+            if len(divided) == 2:
+                path_in_repo, revision = divided
+                repo_type = "model"
+            else:
+                path_in_repo, revision, repo_type = divided
+        logger.info(f"Downloading state from huggingface: {repo_id}/{path_in_repo}@{revision}")
+
+        list_files = huggingface_utils.list_dir(
+            repo_id=repo_id,
+            subfolder=path_in_repo,
+            revision=revision,
+            token=args.huggingface_token,
+            repo_type=repo_type,
+        )
+
+        def download(filename) -> str:
+            return huggingface_hub.hf_hub_download(
+                repo_id=repo_id,
+                filename=filename,
+                revision=revision,
+                repo_type=repo_type,
+                token=args.huggingface_token,
+            )
+
+        async def download_all() -> list:
+            # hf_hub_download is a blocking call, so each file is fetched in a worker thread
+            return await asyncio.gather(*[asyncio.to_thread(download, file_info.rfilename) for file_info in list_files])
+
+        # asyncio.run creates and closes its own event loop; asyncio.get_event_loop() without a running loop is deprecated
+        results = asyncio.run(download_all())
+        if len(results) == 0:
+            raise ValueError(
+                "No files found in the specified repo id/path/revision / 指定されたリポジトリID/パス/リビジョンにファイルが見つかりませんでした"
+            )
+        dirname = os.path.dirname(results[0])
+        accelerator.load_state(dirname)
+
+        return True
+
+    def sample_images(self, accelerator, args, epoch, global_step, device, vae, transformer, sample_parameters):
+        """No-op placeholder: this implementation does nothing with its arguments and returns None."""
+
+    def get_noisy_model_input_and_timesteps(
+        self,
+        args: argparse.Namespace,
+        noise: torch.Tensor,
+        latents: torch.Tensor,
+        noise_scheduler: FlowMatchDiscreteScheduler,
+        device: torch.device,
+        dtype: torch.dtype,
+    ):
+        """
+        Sample one timestep per batch item and blend ``latents`` with ``noise`` accordingly.
+
+        For ``args.timestep_sampling`` in ("uniform", "sigmoid", "shift") a continuous ``t`` is drawn, rescaled to
+        ``[min_timestep, max_timestep] / 1000`` and the input is ``(1 - t) * latents + t * noise``; the returned
+        timesteps are ``t * 1000 + 1``. For any other value, indices are drawn through
+        ``compute_density_for_timestep_sampling`` and the scheduler's timesteps / sigmas are used instead.
+
+        Returns:
+            ``(noisy_model_input, timesteps)``
+        """
+        batch_size = noise.shape[0]
+        sampling = args.timestep_sampling
+
+        if sampling not in ("uniform", "sigmoid", "shift"):
+            # Sample a random timestep for each image
+            # for weighting schemes where we sample timesteps non-uniformly
+            u = compute_density_for_timestep_sampling(
+                weighting_scheme=args.weighting_scheme,
+                batch_size=batch_size,
+                logit_mean=args.logit_mean,
+                logit_std=args.logit_std,
+                mode_scale=args.mode_scale,
+            )
+            t_min = args.min_timestep if args.min_timestep is not None else 0
+            t_max = args.max_timestep if args.max_timestep is not None else 1000
+            indices = (u * (t_max - t_min) + t_min).long()
+
+            timesteps = noise_scheduler.timesteps[indices].to(device=device)  # 1 to 1000
+
+            # Add noise according to flow matching.
+            sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype)
+            noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents
+            return noisy_model_input, timesteps
+
+        if sampling == "uniform":
+            # simple random t-based noise sampling
+            t = torch.rand((batch_size,), device=device)
+        elif sampling == "sigmoid":
+            t = torch.sigmoid(args.sigmoid_scale * torch.randn((batch_size,), device=device))
+        else:
+            # "shift"
+            shift = args.discrete_flow_shift
+            logits_norm = torch.randn(batch_size, device=device)
+            logits_norm = logits_norm * args.sigmoid_scale  # larger scale for more uniform sampling
+            t = logits_norm.sigmoid()
+            t = (t * shift) / (1 + (shift - 1) * t)
+
+        # scale to [t_min, t_max], default [0, 1]
+        t_min = (args.min_timestep if args.min_timestep is not None else 0) / 1000.0
+        t_max = (args.max_timestep if args.max_timestep is not None else 1000.0) / 1000.0
+        t = t * (t_max - t_min) + t_min
+
+        timesteps = t * 1000.0
+        blend = t.view(-1, 1, 1, 1, 1)
+        noisy_model_input = (1 - blend) * latents + blend * noise
+
+        timesteps += 1  # 1 to 1000
+        return noisy_model_input, timesteps
+
+    def train(self, args):
+        if args.seed is None:
+            args.seed = random.randint(0, 2**32)
+        set_seed(args.seed)
+
+        # Load dataset config
+        blueprint_generator = BlueprintGenerator(ConfigSanitizer())
+        logger.info(f"Load dataset config from {args.dataset_config}")
+        user_config = config_utils.load_user_config(args.dataset_config)
+        blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_HUNYUAN_VIDEO)
+        train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group, training=True)
+
+        current_epoch = Value("i", 0)
+        current_step = Value("i", 0)
+        ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None
+        collator = collator_class(current_epoch, current_step, ds_for_collator)
+
+        # prepare accelerator
+        logger.info("preparing accelerator")
+        accelerator = prepare_accelerator(args)
+        is_main_process = accelerator.is_main_process
+
+        # prepare dtype
+        weight_dtype = torch.float32
+        if args.mixed_precision == "fp16":
+            weight_dtype = torch.float16
+        elif args.mixed_precision == "bf16":
+            weight_dtype = torch.bfloat16
+
+        # HunyuanVideo specific
+        vae_dtype = torch.float16 if args.vae_dtype is None else model_utils.str_to_dtype(args.vae_dtype)
+
+        # get embedding for sampling images
+        sample_parameters = vae = None
+        if args.sample_prompts:
+            sample_parameters = self.process_sample_prompts(
+                args, accelerator, args.sample_prompts, args.text_encoder1, args.text_encoder2, args.fp8_llm
+            )
+
+            # Load VAE model for sampling images: VAE is loaded to cpu to save gpu memory
+            vae, _, s_ratio, t_ratio = load_vae(vae_dtype=vae_dtype, device="cpu", vae_path=args.vae)
+            vae.requires_grad_(False)
+            vae.eval()
+
+            if args.vae_chunk_size is not None:
+                vae.set_chunk_size_for_causal_conv_3d(args.vae_chunk_size)
+                logger.info(f"Set chunk_size to {args.vae_chunk_size} for CausalConv3d in VAE")
+            if args.vae_spatial_tile_sample_min_size is not None:
+                vae.enable_spatial_tiling(True)
+                vae.tile_sample_min_size = args.vae_spatial_tile_sample_min_size
+                vae.tile_latent_min_size = args.vae_spatial_tile_sample_min_size // 8
+            elif args.vae_tiling:
+                vae.enable_spatial_tiling(True)
+
+        # load DiT model
+        blocks_to_swap = args.blocks_to_swap if args.blocks_to_swap else 0
+        loading_device = "cpu" if blocks_to_swap > 0 else accelerator.device
+
+        logger.info(f"Loading DiT model from {args.dit}")
+        if args.sdpa:
+            attn_mode = "torch"
+        elif args.flash_attn:
+            attn_mode = "flash"
+        elif args.sage_attn:
+            attn_mode = "sageattn"
+        elif args.xformers:
+            attn_mode = "xformers"
+        else:
+            raise ValueError(
+                f"either --sdpa, --flash-attn, --sage-attn or --xformers must be specified / --sdpa, --flash-attn, --sage-attn, --xformersのいずれかを指定してください"
+            )
+        transformer = load_transformer(
+            args.dit, attn_mode, args.split_attn, loading_device, None, in_channels=args.dit_in_channels
+        )  # load as is
+
+        if blocks_to_swap > 0:
+            logger.info(f"enable swap {blocks_to_swap} blocks to CPU from device: {accelerator.device}")
+            transformer.enable_block_swap(blocks_to_swap, accelerator.device, supports_backward=True)
+            transformer.move_to_device_except_swap_blocks(accelerator.device)
+        if args.img_in_txt_in_offloading:
+            logger.info("Enable offloading img_in and txt_in to CPU")
+            transformer.enable_img_in_txt_in_offloading()
+
+        if args.gradient_checkpointing:
+            transformer.enable_gradient_checkpointing()
+
+        # prepare optimizer, data loader etc.
+        accelerator.print("prepare optimizer, data loader etc.")
+
+        transformer.requires_grad_(False)
+        if accelerator.is_main_process:
+            accelerator.print(f"Trainable modules '{args.trainable_modules}'.")
+        for name, param in transformer.named_parameters():
+            for trainable_module_name in args.trainable_modules:
+                if trainable_module_name in name:
+                    param.requires_grad = True
+                    break
+
+        total_params = list(transformer.parameters())
+        trainable_params = list(filter(lambda p: p.requires_grad, transformer.parameters()))
+        logger.info(
+            f"number of trainable parameters: {sum(p.numel() for p in trainable_params) / 1e6} M, total paramters: {sum(p.numel() for p in total_params) / 1e6} M"
+        )
+        optimizer_name, optimizer_args, optimizer, optimizer_train_fn, optimizer_eval_fn = self.get_optimizer(
+            args, trainable_params
+        )
+
+        # prepare dataloader
+
+        # num workers for data loader: if 0, persistent_workers is not available
+        n_workers = min(args.max_data_loader_n_workers, os.cpu_count())  # cpu_count or max_data_loader_n_workers
+
+        train_dataloader = torch.utils.data.DataLoader(
+            train_dataset_group,
+            batch_size=1,
+            shuffle=True,
+            collate_fn=collator,
+            num_workers=n_workers,
+            persistent_workers=args.persistent_data_loader_workers,
+        )
+
+        # calculate max_train_steps
+        if args.max_train_epochs is not None:
+            args.max_train_steps = args.max_train_epochs * math.ceil(
+                len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
+            )
+            accelerator.print(
+                f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}"
+            )
+
+        # send max_train_steps to train_dataset_group
+        train_dataset_group.set_max_train_steps(args.max_train_steps)
+
+        # prepare lr_scheduler
+        lr_scheduler = self.get_scheduler(args, optimizer, accelerator.num_processes)
+
+        # prepare training model. accelerator does some magic here
+
+        # experimental feature: train the model with gradients in fp16/bf16
+        dit_dtype = torch.float32
+        if args.full_fp16:
+            assert (
+                args.mixed_precision == "fp16"
+            ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。"
+            accelerator.print("enable full fp16 training.")
+            dit_weight_dtype = torch.float16
+        elif args.full_bf16:
+            assert (
+                args.mixed_precision == "bf16"
+            ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。"
+            accelerator.print("enable full bf16 training.")
+            dit_weight_dtype = torch.bfloat16
+        else:
+            dit_weight_dtype = torch.float32
+
+        # TODO add fused optimizer and stochastic rounding
+
+        # cast model to dit_weight_dtype
+        # if dit_dtype != dit_weight_dtype:
+        logger.info(f"casting model to {dit_weight_dtype}")
+        transformer.to(dit_weight_dtype)
+
+        if blocks_to_swap > 0:
+            transformer = accelerator.prepare(transformer, device_placement=[not blocks_to_swap > 0])
+            accelerator.unwrap_model(transformer).move_to_device_except_swap_blocks(accelerator.device)  # reduce peak memory usage
+            accelerator.unwrap_model(transformer).prepare_block_swap_before_forward()
+        else:
+            transformer = accelerator.prepare(transformer)
+
+        optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
+
+        transformer.train()
+
+        if args.full_fp16:
+            # patch accelerator for fp16 training
+            # def patch_accelerator_for_fp16_training(accelerator):
+            org_unscale_grads = accelerator.scaler._unscale_grads_
+
+            def _unscale_grads_replacer(optimizer, inv_scale, found_inf, allow_fp16):
+                return org_unscale_grads(optimizer, inv_scale, found_inf, True)
+
+            accelerator.scaler._unscale_grads_ = _unscale_grads_replacer
+
+        # resume from local or huggingface. accelerator.step is set
+        self.resume_from_local_or_hf_if_specified(accelerator, args)  # accelerator.load_state(args.resume)
+
+        # epoch数を計算する
+        num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+        num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+        # 学習する
+        # total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+        accelerator.print("running training / 学習開始")
+        accelerator.print(f"  num train items / 学習画像、動画数: {train_dataset_group.num_train_items}")
+        accelerator.print(f"  num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}")
+        accelerator.print(f"  num epochs / epoch数: {num_train_epochs}")
+        accelerator.print(
+            f"  batch size per device / バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}"
+        )
+        # accelerator.print(f"  total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ（並列学習、勾配合計含む）: {total_batch_size}")
+        accelerator.print(f"  gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}")
+        accelerator.print(f"  total optimization steps / 学習ステップ数: {args.max_train_steps}")
+
+        if accelerator.is_main_process:
+            init_kwargs = {}
+            if args.wandb_run_name:
+                init_kwargs["wandb"] = {"name": args.wandb_run_name}
+            if args.log_tracker_config is not None:
+                init_kwargs = toml.load(args.log_tracker_config)
+            accelerator.init_trackers(
+                "hunyuan_video_ft" if args.log_tracker_name is None else args.log_tracker_name,
+                config=train_utils.get_sanitized_config_or_none(args),
+                init_kwargs=init_kwargs,
+            )
+
+        # TODO skip until initial step
+        progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps")
+
+        epoch_to_start = 0
+        global_step = 0
+        noise_scheduler = FlowMatchDiscreteScheduler(shift=args.discrete_flow_shift, reverse=True, solver="euler")
+
+        loss_recorder = train_utils.LossRecorder()
+        del train_dataset_group
+
+        # function for saving/removing
+        def save_model(ckpt_name: str, unwrapped_nw, steps, epoch_no, force_sync_upload=False):
+            os.makedirs(args.output_dir, exist_ok=True)
+            ckpt_file = os.path.join(args.output_dir, ckpt_name)
+
+            accelerator.print(f"\nsaving checkpoint: {ckpt_file}")
+
+            title = args.metadata_title if args.metadata_title is not None else args.output_name
+            if args.min_timestep is not None or args.max_timestep is not None:
+                min_time_step = args.min_timestep if args.min_timestep is not None else 0
+                max_time_step = args.max_timestep if args.max_timestep is not None else 1000
+                md_timesteps = (min_time_step, max_time_step)
+            else:
+                md_timesteps = None
+
+            sai_metadata = sai_model_spec.build_metadata(
+                None,
+                ARCHITECTURE_HUNYUAN_VIDEO,
+                time.time(),
+                title,
+                None,
+                args.metadata_author,
+                args.metadata_description,
+                args.metadata_license,
+                args.metadata_tags,
+                timesteps=md_timesteps,
+                is_lora=False,
+            )
+
+            save_file(unwrapped_nw.state_dict(), ckpt_file, sai_metadata)
+            if args.huggingface_repo_id is not None:
+                huggingface_utils.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload)
+
+        def remove_model(old_ckpt_name):
+            old_ckpt_file = os.path.join(args.output_dir, old_ckpt_name)
+            if os.path.exists(old_ckpt_file):
+                accelerator.print(f"removing old checkpoint: {old_ckpt_file}")
+                os.remove(old_ckpt_file)
+
+        # For --sample_at_first
+        optimizer_eval_fn()
+        self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, transformer, sample_parameters)
+        optimizer_train_fn()
+        if len(accelerator.trackers) > 0:
+            # log empty object to commit the sample images to wandb
+            accelerator.log({}, step=0)
+
+        # training loop
+
+        # log device and dtype for each model
+        logger.info(f"DiT dtype: {transformer.dtype}, device: {transformer.device}")
+
+        clean_memory_on_device(accelerator.device)
+
+        pos_embed_cache = {}
+
+        for epoch in range(epoch_to_start, num_train_epochs):
+            accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}")
+            current_epoch.value = epoch + 1
+
+            for step, batch in enumerate(train_dataloader):
+                latents, llm_embeds, llm_mask, clip_embeds = batch
+                bsz = latents.shape[0]
+                current_step.value = global_step
+
+                with accelerator.accumulate(transformer):
+                    latents = latents * vae_module.SCALING_FACTOR
+
+                    # Sample noise that we'll add to the latents
+                    noise = torch.randn_like(latents)
+
+                    # calculate model input and timesteps
+                    noisy_model_input, timesteps = self.get_noisy_model_input_and_timesteps(
+                        args, noise, latents, noise_scheduler, accelerator.device, dit_dtype
+                    )
+
+                    weighting = compute_loss_weighting_for_sd3(
+                        args.weighting_scheme, noise_scheduler, timesteps, accelerator.device, dit_dtype
+                    )
+
+                    # ensure guidance_scale in args is float
+                    guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device)  # , dtype=dit_dtype)
+
+                    # ensure the hidden state will require grad
+                    if args.gradient_checkpointing:
+                        noisy_model_input.requires_grad_(True)
+                        guidance_vec.requires_grad_(True)
+
+                    pos_emb_shape = latents.shape[1:]
+                    if pos_emb_shape not in pos_embed_cache:
+                        freqs_cos, freqs_sin = get_rotary_pos_embed_by_shape(
+                            accelerator.unwrap_model(transformer), latents.shape[2:]
+                        )
+                        # freqs_cos = freqs_cos.to(device=accelerator.device, dtype=dit_dtype)
+                        # freqs_sin = freqs_sin.to(device=accelerator.device, dtype=dit_dtype)
+                        pos_embed_cache[pos_emb_shape] = (freqs_cos, freqs_sin)
+                    else:
+                        freqs_cos, freqs_sin = pos_embed_cache[pos_emb_shape]
+
+                    # call DiT
+                    latents = latents.to(device=accelerator.device, dtype=dit_dtype)
+                    noisy_model_input = noisy_model_input.to(device=accelerator.device, dtype=dit_dtype)
+                    # timesteps = timesteps.to(device=accelerator.device, dtype=dit_dtype)
+                    # llm_embeds = llm_embeds.to(device=accelerator.device, dtype=dit_dtype)
+                    # llm_mask = llm_mask.to(device=accelerator.device)
+                    # clip_embeds = clip_embeds.to(device=accelerator.device, dtype=dit_dtype)
+                    with accelerator.autocast():
+                        model_pred = transformer(
+                            noisy_model_input,
+                            timesteps,
+                            text_states=llm_embeds,
+                            text_mask=llm_mask,
+                            text_states_2=clip_embeds,
+                            freqs_cos=freqs_cos,
+                            freqs_sin=freqs_sin,
+                            guidance=guidance_vec,
+                            return_dict=False,
+                        )
+
+                    # flow matching loss
+                    target = noise - latents
+
+                    loss = torch.nn.functional.mse_loss(model_pred.to(dit_dtype), target, reduction="none")
+
+                    if weighting is not None:
+                        loss = loss * weighting
+                    # loss = loss.mean([1, 2, 3])
+                    # # min snr gamma, scale v pred loss like noise pred, v pred like loss, debiased estimation etc.
+                    # loss = self.post_process_loss(loss, args, timesteps, noise_scheduler)
+
+                    loss = loss.mean()  # 平均なのでbatch_sizeで割る必要なし
+
+                    accelerator.backward(loss)
+                    if accelerator.sync_gradients:
+                        # self.all_reduce_network(accelerator, network)  # sync DDP grad manually
+                        state = accelerate.PartialState()
+                        if state.distributed_type != accelerate.DistributedType.NO:
+                            for param in transformer.parameters():
+                                if param.grad is not None:
+                                    param.grad = accelerator.reduce(param.grad, reduction="mean")
+
+                        if args.max_grad_norm != 0.0:
+                            params_to_clip = accelerator.unwrap_model(transformer).parameters()
+                            accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                    optimizer.step()
+                    lr_scheduler.step()
+                    optimizer.zero_grad(set_to_none=True)
+
+                # Checks if the accelerator has performed an optimization step behind the scenes
+                if accelerator.sync_gradients:
+                    progress_bar.update(1)
+                    global_step += 1
+
+                    optimizer_eval_fn()
+                    self.sample_images(
+                        accelerator, args, None, global_step, accelerator.device, vae, transformer, sample_parameters
+                    )
+
+                    # 指定ステップごとにモデルを保存
+                    if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0:
+                        accelerator.wait_for_everyone()
+                        if accelerator.is_main_process:
+                            ckpt_name = train_utils.get_step_ckpt_name(args.output_name, global_step)
+                            save_model(ckpt_name, accelerator.unwrap_model(transformer), global_step, epoch)
+
+                            if args.save_state:
+                                train_utils.save_and_remove_state_stepwise(args, accelerator, global_step)
+
+                            remove_step_no = train_utils.get_remove_step_no(args, global_step)
+                            if remove_step_no is not None:
+                                remove_ckpt_name = train_utils.get_step_ckpt_name(args.output_name, remove_step_no)
+                                remove_model(remove_ckpt_name)
+                    optimizer_train_fn()
+
+                current_loss = loss.detach().item()
+                loss_recorder.add(epoch=epoch, step=step, loss=current_loss)
+                avr_loss: float = loss_recorder.moving_average
+                logs = {"avr_loss": avr_loss}  # , "lr": lr_scheduler.get_last_lr()[0]}
+                progress_bar.set_postfix(**logs)
+
+                if len(accelerator.trackers) > 0:
+                    logs = {"loss": current_loss, "lr": lr_scheduler.get_last_lr()[0]}
+                    accelerator.log(logs, step=global_step)
+
+                if global_step >= args.max_train_steps:
+                    break
+
+            if len(accelerator.trackers) > 0:
+                logs = {"loss/epoch": loss_recorder.moving_average}
+                accelerator.log(logs, step=epoch + 1)
+
+            accelerator.wait_for_everyone()
+
+            # 指定エポックごとにモデルを保存
+            optimizer_eval_fn()
+            if args.save_every_n_epochs is not None:
+                saving = (epoch + 1) % args.save_every_n_epochs == 0 and (epoch + 1) < num_train_epochs
+                if is_main_process and saving:
+                    ckpt_name = train_utils.get_epoch_ckpt_name(args.output_name, epoch + 1)
+                    save_model(ckpt_name, accelerator.unwrap_model(transformer), global_step, epoch + 1)
+
+                    remove_epoch_no = train_utils.get_remove_epoch_no(args, epoch + 1)
+                    if remove_epoch_no is not None:
+                        remove_ckpt_name = train_utils.get_epoch_ckpt_name(args.output_name, remove_epoch_no)
+                        remove_model(remove_ckpt_name)
+
+                    if args.save_state:
+                        train_utils.save_and_remove_state_on_epoch_end(args, accelerator, epoch + 1)
+
+            self.sample_images(accelerator, args, epoch + 1, global_step, accelerator.device, vae, transformer, sample_parameters)
+            optimizer_train_fn()
+
+            # end of epoch
+
+        if is_main_process:
+            transformer = accelerator.unwrap_model(transformer)
+
+        accelerator.end_training()
+        optimizer_eval_fn()
+
+        if args.save_state or args.save_state_on_train_end:
+            train_utils.save_state_on_train_end(args, accelerator)
+
+        if is_main_process:
+            ckpt_name = train_utils.get_last_ckpt_name(args.output_name)
+            save_model(ckpt_name, transformer, global_step, num_train_epochs, force_sync_upload=True)
+
+            logger.info("model saved.")
+
+
+def setup_parser() -> argparse.ArgumentParser:
+    def int_or_float(value):
+        if value.endswith("%"):
+            try:
+                return float(value[:-1]) / 100.0
+            except ValueError:
+                raise argparse.ArgumentTypeError(f"Value '{value}' is not a valid percentage")
+        try:
+            float_value = float(value)
+            if float_value >= 1 and float_value.is_integer():
+                return int(value)
+            return float(value)
+        except ValueError:
+            raise argparse.ArgumentTypeError(f"'{value}' is not an int or float")
+
+    parser = argparse.ArgumentParser()
+
+    # general settings
+    parser.add_argument(
+        "--config_file",
+        type=str,
+        default=None,
+        help="using .toml instead of args to pass hyperparameter / ハイパーパラメータを引数ではなく.tomlファイルで渡す",
+    )
+    parser.add_argument(
+        "--dataset_config",
+        type=pathlib.Path,
+        default=None,
+        required=True,
+        help="config file for dataset / データセットの設定ファイル",
+    )
+
+    # training settings
+    parser.add_argument(
+        "--sdpa",
+        action="store_true",
+        help="use sdpa for CrossAttention (requires PyTorch 2.0) / CrossAttentionにsdpaを使う（PyTorch 2.0が必要）",
+    )
+    parser.add_argument(
+        "--flash_attn",
+        action="store_true",
+        help="use FlashAttention for CrossAttention, requires FlashAttention / CrossAttentionにFlashAttentionを使う、FlashAttentionが必要",
+    )
+    parser.add_argument(
+        "--sage_attn",
+        action="store_true",
+        help="use SageAttention. requires SageAttention / SageAttentionを使う。SageAttentionが必要",
+    )
+    parser.add_argument(
+        "--xformers",
+        action="store_true",
+        help="use xformers for CrossAttention, requires xformers / CrossAttentionにxformersを使う、xformersが必要",
+    )
+    parser.add_argument(
+        "--split_attn",
+        action="store_true",
+        help="use split attention for attention calculation (split batch size=1, affects memory usage and speed)"
+        " / attentionを分割して計算する（バッチサイズ=1に分割、メモリ使用量と速度に影響）",
+    )
+
+    parser.add_argument("--max_train_steps", type=int, default=1600, help="training steps / 学習ステップ数")
+    parser.add_argument(
+        "--max_train_epochs",
+        type=int,
+        default=None,
+        help="training epochs (overrides max_train_steps) / 学習エポック数（max_train_stepsを上書きします）",
+    )
+    parser.add_argument(
+        "--max_data_loader_n_workers",
+        type=int,
+        default=8,
+        help="max num workers for DataLoader (lower is less main RAM usage, faster epoch start and slower data loading) / DataLoaderの最大プロセス数（小さい値ではメインメモリの使用量が減りエポック間の待ち時間が減りますが、データ読み込みは遅くなります）",
+    )
+    parser.add_argument(
+        "--persistent_data_loader_workers",
+        action="store_true",
+        help="persistent DataLoader workers (useful for reduce time gap between epoch, but may use more memory) / DataLoader のワーカーを持続させる (エポック間の時間差を少なくするのに有効だが、より多くのメモリを消費する可能性がある)",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="random seed for training / 学習時の乱数のseed")
+    parser.add_argument(
+        "--gradient_checkpointing", action="store_true", help="enable gradient checkpointing / gradient checkpointingを有効にする"
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass / 学習時に逆伝播をする前に勾配を合計するステップ数",
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help="use mixed precision / 混合精度を使う場合、その精度",
+    )
+    parser.add_argument("--trainable_modules", nargs="+", default=".", help="Enter a list of trainable modules")
+
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default=None,
+        help="enable logging and output TensorBoard log to this directory / ログ出力を有効にしてこのディレクトリにTensorBoard用のログを出力する",
+    )
+    parser.add_argument(
+        "--log_with",
+        type=str,
+        default=None,
+        choices=["tensorboard", "wandb", "all"],
+        help="what logging tool(s) to use (if 'all', TensorBoard and WandB are both used) / ログ出力に使用するツール (allを指定するとTensorBoardとWandBの両方が使用される)",
+    )
+    parser.add_argument(
+        "--log_prefix", type=str, default=None, help="add prefix for each log directory / ログディレクトリ名の先頭に追加する文字列"
+    )
+    parser.add_argument(
+        "--log_tracker_name",
+        type=str,
+        default=None,
+        help="name of tracker to use for logging, default is script-specific default name / ログ出力に使用するtrackerの名前、省略時はスクリプトごとのデフォルト名",
+    )
+    parser.add_argument(
+        "--wandb_run_name",
+        type=str,
+        default=None,
+        help="The name of the specific wandb session / wandb ログに表示される特定の実行の名前",
+    )
+    parser.add_argument(
+        "--log_tracker_config",
+        type=str,
+        default=None,
+        help="path to tracker config file to use for logging / ログ出力に使用するtrackerの設定ファイルのパス",
+    )
+    parser.add_argument(
+        "--wandb_api_key",
+        type=str,
+        default=None,
+        help="specify WandB API key to log in before starting training (optional). / WandB APIキーを指定して学習開始前にログインする（オプション）",
+    )
+    parser.add_argument("--log_config", action="store_true", help="log training configuration / 学習設定をログに出力する")
+
+    parser.add_argument(
+        "--ddp_timeout",
+        type=int,
+        default=None,
+        help="DDP timeout (min, None for default of accelerate) / DDPのタイムアウト（分、Noneでaccelerateのデフォルト）",
+    )
+    parser.add_argument(
+        "--ddp_gradient_as_bucket_view",
+        action="store_true",
+        help="enable gradient_as_bucket_view for DDP / DDPでgradient_as_bucket_viewを有効にする",
+    )
+    parser.add_argument(
+        "--ddp_static_graph",
+        action="store_true",
+        help="enable static_graph for DDP / DDPでstatic_graphを有効にする",
+    )
+
+    parser.add_argument(
+        "--sample_every_n_steps",
+        type=int,
+        default=None,
+        help="generate sample images every N steps / 学習中のモデルで指定ステップごとにサンプル出力する",
+    )
+    parser.add_argument(
+        "--sample_at_first", action="store_true", help="generate sample images before training / 学習前にサンプル出力する"
+    )
+    parser.add_argument(
+        "--sample_every_n_epochs",
+        type=int,
+        default=None,
+        help="generate sample images every N epochs (overwrites n_steps) / 学習中のモデルで指定エポックごとにサンプル出力する（ステップ数指定を上書きします）",
+    )
+    parser.add_argument(
+        "--sample_prompts",
+        type=str,
+        default=None,
+        help="file for prompts to generate sample images / 学習中モデルのサンプル出力用プロンプトのファイル",
+    )
+
+    # optimizer and lr scheduler settings
+    parser.add_argument(
+        "--optimizer_type",
+        type=str,
+        default="",
+        help="Optimizer to use / オプティマイザの種類: AdamW (default), AdamW8bit, AdaFactor. "
+        "Also, you can use any optimizer by specifying the full path to the class, like 'torch.optim.AdamW', 'bitsandbytes.optim.AdEMAMix8bit' or 'bitsandbytes.optim.PagedAdEMAMix8bit' etc. / ",
+    )
+    parser.add_argument(
+        "--optimizer_args",
+        type=str,
+        default=None,
+        nargs="*",
+        help='additional arguments for optimizer (like "weight_decay=0.01 betas=0.9,0.999 ...") / オプティマイザの追加引数（例： "weight_decay=0.01 betas=0.9,0.999 ..."）',
+    )
+    parser.add_argument("--learning_rate", type=float, default=2.0e-6, help="learning rate / 学習率")
+    parser.add_argument(
+        "--max_grad_norm",
+        default=1.0,
+        type=float,
+        help="Max gradient norm, 0 for no clipping / 勾配正規化の最大norm、0でclippingを行わない",
+    )
+
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help="scheduler to use for learning rate / 学習率のスケジューラ: linear, cosine, cosine_with_restarts, polynomial, constant (default), constant_with_warmup, adafactor",
+    )
+    parser.add_argument(
+        "--lr_warmup_steps",
+        type=int_or_float,
+        default=0,
+        help="Int number of steps for the warmup in the lr scheduler (default is 0) or float with ratio of train steps"
+        " / 学習率のスケジューラをウォームアップするステップ数（デフォルト0）、または学習ステップの比率（1未満のfloat値の場合）",
+    )
+    parser.add_argument(
+        "--lr_decay_steps",
+        type=int_or_float,
+        default=0,
+        help="Int number of steps for the decay in the lr scheduler (default is 0) or float (<1) with ratio of train steps"
+        " / 学習率のスケジューラを減衰させるステップ数（デフォルト0）、または学習ステップの比率（1未満のfloat値の場合）",
+    )
+    parser.add_argument(
+        "--lr_scheduler_num_cycles",
+        type=int,
+        default=1,
+        help="Number of restarts for cosine scheduler with restarts / cosine with restartsスケジューラでのリスタート回数",
+    )
+    parser.add_argument(
+        "--lr_scheduler_power",
+        type=float,
+        default=1,
+        help="Polynomial power for polynomial scheduler / polynomialスケジューラでのpolynomial power",
+    )
+    parser.add_argument(
+        "--lr_scheduler_timescale",
+        type=int,
+        default=None,
+        help="Inverse sqrt timescale for inverse sqrt scheduler,defaults to `num_warmup_steps`"
+        + " / 逆平方根スケジューラのタイムスケール、デフォルトは`num_warmup_steps`",
+    )
+    parser.add_argument(
+        "--lr_scheduler_min_lr_ratio",
+        type=float,
+        default=None,
+        help="The minimum learning rate as a ratio of the initial learning rate for cosine with min lr scheduler and warmup decay scheduler"
+        + " / 初期学習率の比率としての最小学習率を指定する、cosine with min lr と warmup decay スケジューラ で有効",
+    )
+    parser.add_argument("--lr_scheduler_type", type=str, default="", help="custom scheduler module / 使用するスケジューラ")
+    parser.add_argument(
+        "--lr_scheduler_args",
+        type=str,
+        default=None,
+        nargs="*",
+        help='additional arguments for scheduler (like "T_max=100") / スケジューラの追加引数（例： "T_max100"）',
+    )
+
+    # model settings
+    parser.add_argument("--dit", type=str, required=True, help="DiT checkpoint path / DiTのチェックポイントのパス")
+    parser.add_argument("--dit_dtype", type=str, default=None, help="data type for DiT, default is bfloat16")
+    parser.add_argument("--dit_in_channels", type=int, default=16, help="input channels for DiT, default is 16, skyreels I2V is 32")
+    parser.add_argument("--vae", type=str, help="VAE checkpoint path / VAEのチェックポイントのパス")
+    parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is float16")
+    parser.add_argument(
+        "--vae_tiling",
+        action="store_true",
+        help="enable spatial tiling for VAE, default is False. If vae_spatial_tile_sample_min_size is set, this is automatically enabled."
+        " / VAEの空間タイリングを有効にする、デフォルトはFalse。vae_spatial_tile_sample_min_sizeが設定されている場合、自動的に有効になります。",
+    )
+    parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
+    parser.add_argument(
+        "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
+    )
+    parser.add_argument("--text_encoder1", type=str, help="Text Encoder 1 directory / テキストエンコーダ1のディレクトリ")
+    parser.add_argument("--text_encoder2", type=str, help="Text Encoder 2 directory / テキストエンコーダ2のディレクトリ")
+    parser.add_argument("--text_encoder_dtype", type=str, default=None, help="data type for Text Encoder, default is float16")
+    parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for LLM / LLMにfp8を使う")
+    parser.add_argument("--full_fp16", action="store_true", help="fp16 training including gradients / 勾配も含めてfp16で学習する")
+    parser.add_argument("--full_bf16", action="store_true", help="bf16 training including gradients / 勾配も含めてbf16で学習する")
+
+    parser.add_argument(
+        "--blocks_to_swap",
+        type=int,
+        default=None,
+        help="number of blocks to swap in the model, max XXX / モデル内のブロックの数、最大XXX",
+    )
+    parser.add_argument(
+        "--img_in_txt_in_offloading",
+        action="store_true",
+        help="offload img_in and txt_in to cpu / img_inとtxt_inをCPUにオフロードする",
+    )
+
+    # parser.add_argument("--flow_shift", type=float, default=7.0, help="Shift factor for flow matching schedulers")
+    parser.add_argument("--guidance_scale", type=float, default=1.0, help="Embeded classifier free guidance scale.")
+    parser.add_argument(
+        "--timestep_sampling",
+        choices=["sigma", "uniform", "sigmoid", "shift"],
+        default="sigma",
+        help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal and shift of sigmoid."
+        " / タイムステップをサンプリングする方法：sigma、random uniform、random normalのsigmoid、sigmoidのシフト。",
+    )
+    parser.add_argument(
+        "--discrete_flow_shift",
+        type=float,
+        default=1.0,
+        help="Discrete flow shift for the Euler Discrete Scheduler, default is 1.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは1.0。",
+    )
+    parser.add_argument(
+        "--sigmoid_scale",
+        type=float,
+        default=1.0,
+        help='Scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid" or "shift"). / sigmoidタイムステップサンプリングの倍率（timestep-samplingが"sigmoid"または"shift"の場合のみ有効）。',
+    )
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="none",
+        choices=["logit_normal", "mode", "cosmap", "sigma_sqrt", "none"],
+        help="weighting scheme for timestep distribution. Default is none"
+        " / タイムステップ分布の重み付けスキーム、デフォルトはnone",
+    )
+    parser.add_argument(
+        "--logit_mean",
+        type=float,
+        default=0.0,
+        help="mean to use when using the `'logit_normal'` weighting scheme / `'logit_normal'`重み付けスキームを使用する場合の平均",
+    )
+    parser.add_argument(
+        "--logit_std",
+        type=float,
+        default=1.0,
+        help="std to use when using the `'logit_normal'` weighting scheme / `'logit_normal'`重み付けスキームを使用する場合のstd",
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme` / モード重み付けスキームのスケール",
+    )
+    parser.add_argument(
+        "--min_timestep",
+        type=int,
+        default=None,
+        help="set minimum time step for training (0~999, default is 0) / 学習時のtime stepの最小値を設定する（0~999で指定、省略時はデフォルト値(0)） ",
+    )
+    parser.add_argument(
+        "--max_timestep",
+        type=int,
+        default=None,
+        help="set maximum time step for training (1~1000, default is 1000) / 学習時のtime stepの最大値を設定する（1~1000で指定、省略時はデフォルト値(1000)）",
+    )
+
+    # save and load settings
+    parser.add_argument(
+        "--output_dir", type=str, default=None, help="directory to output trained model / 学習後のモデル出力先ディレクトリ"
+    )
+    parser.add_argument(
+        "--output_name",
+        type=str,
+        default=None,
+        required=True,
+        help="base name of trained model file / 学習後のモデルの拡張子を除くファイル名",
+    )
+    parser.add_argument("--resume", type=str, default=None, help="saved state to resume training / 学習再開するモデルのstate")
+
+    parser.add_argument(
+        "--save_every_n_epochs",
+        type=int,
+        default=None,
+        help="save checkpoint every N epochs / 学習中のモデルを指定エポックごとに保存する",
+    )
+    parser.add_argument(
+        "--save_every_n_steps",
+        type=int,
+        default=None,
+        help="save checkpoint every N steps / 学習中のモデルを指定ステップごとに保存する",
+    )
+    parser.add_argument(
+        "--save_last_n_epochs",
+        type=int,
+        default=None,
+        help="save last N checkpoints when saving every N epochs (remove older checkpoints) / 指定エポックごとにモデルを保存するとき最大Nエポック保存する（古いチェックポイントは削除する）",
+    )
+    parser.add_argument(
+        "--save_last_n_epochs_state",
+        type=int,
+        default=None,
+        help="save last N checkpoints of state (overrides the value of --save_last_n_epochs)/ 最大Nエポックstateを保存する（--save_last_n_epochsの指定を上書きする）",
+    )
+    parser.add_argument(
+        "--save_last_n_steps",
+        type=int,
+        default=None,
+        help="save checkpoints until N steps elapsed (remove older checkpoints if N steps elapsed) / 指定ステップごとにモデルを保存するとき、このステップ数経過するまで保存する（このステップ数経過したら削除する）",
+    )
+    parser.add_argument(
+        "--save_last_n_steps_state",
+        type=int,
+        default=None,
+        help="save states until N steps elapsed (remove older states if N steps elapsed, overrides --save_last_n_steps) / 指定ステップごとにstateを保存するとき、このステップ数経過するまで保存する（このステップ数経過したら削除する。--save_last_n_stepsを上書きする）",
+    )
+    parser.add_argument(
+        "--save_state",
+        action="store_true",
+        help="save training state additionally (including optimizer states etc.) when saving model / optimizerなど学習状態も含めたstateをモデル保存時に追加で保存する",
+    )
+    parser.add_argument(
+        "--save_state_on_train_end",
+        action="store_true",
+        help="save training state (including optimizer states etc.) on train end even if --save_state is not specified"
+        " / --save_stateが未指定時にもoptimizerなど学習状態も含めたstateを学習終了時に保存する",
+    )
+
+    # SAI Model spec
+    parser.add_argument(
+        "--metadata_title",
+        type=str,
+        default=None,
+        help="title for model metadata (default is output_name) / メタデータに書き込まれるモデルタイトル、省略時はoutput_name",
+    )
+    parser.add_argument(
+        "--metadata_author",
+        type=str,
+        default=None,
+        help="author name for model metadata / メタデータに書き込まれるモデル作者名",
+    )
+    parser.add_argument(
+        "--metadata_description",
+        type=str,
+        default=None,
+        help="description for model metadata / メタデータに書き込まれるモデル説明",
+    )
+    parser.add_argument(
+        "--metadata_license",
+        type=str,
+        default=None,
+        help="license for model metadata / メタデータに書き込まれるモデルライセンス",
+    )
+    parser.add_argument(
+        "--metadata_tags",
+        type=str,
+        default=None,
+        help="tags for model metadata, separated by comma / メタデータに書き込まれるモデルタグ、カンマ区切り",
+    )
+
+    # huggingface settings
+    parser.add_argument(
+        "--huggingface_repo_id",
+        type=str,
+        default=None,
+        help="huggingface repo name to upload / huggingfaceにアップロードするリポジトリ名",
+    )
+    parser.add_argument(
+        "--huggingface_repo_type",
+        type=str,
+        default=None,
+        help="huggingface repo type to upload / huggingfaceにアップロードするリポジトリの種類",
+    )
+    parser.add_argument(
+        "--huggingface_path_in_repo",
+        type=str,
+        default=None,
+        help="huggingface model path to upload files / huggingfaceにアップロードするファイルのパス",
+    )
+    parser.add_argument("--huggingface_token", type=str, default=None, help="huggingface token / huggingfaceのトークン")
+    parser.add_argument(
+        "--huggingface_repo_visibility",
+        type=str,
+        default=None,
+        help="huggingface repository visibility ('public' for public, 'private' or None for private) / huggingfaceにアップロードするリポジトリの公開設定（'public'で公開、'private'またはNoneで非公開）",
+    )
+    parser.add_argument(
+        "--save_state_to_huggingface", action="store_true", help="save state to huggingface / huggingfaceにstateを保存する"
+    )
+    parser.add_argument(
+        "--resume_from_huggingface",
+        action="store_true",
+        help="resume from huggingface (ex: --resume {repo_id}/{path_in_repo}:{revision}:{repo_type}) / huggingfaceから学習を再開する(例: --resume {repo_id}/{path_in_repo}:{revision}:{repo_type})",
+    )
+    parser.add_argument(
+        "--async_upload",
+        action="store_true",
+        help="upload to huggingface asynchronously / huggingfaceに非同期でアップロードする",
+    )
+
+    return parser
+
+
+def read_config_from_file(args: argparse.Namespace, parser: argparse.ArgumentParser):
+    if not args.config_file:
+        return args
+
+    config_path = args.config_file + ".toml" if not args.config_file.endswith(".toml") else args.config_file
+
+    if not os.path.exists(config_path):
+        logger.info(f"{config_path} not found.")
+        exit(1)
+
+    logger.info(f"Loading settings from {config_path}...")
+    with open(config_path, "r", encoding="utf-8") as f:
+        config_dict = toml.load(f)
+
+    # combine all sections into one
+    ignore_nesting_dict = {}
+    for section_name, section_dict in config_dict.items():
+        # if value is not dict, save key and value as is
+        if not isinstance(section_dict, dict):
+            ignore_nesting_dict[section_name] = section_dict
+            continue
+
+        # if value is dict, save all key and value into one dict
+        for key, value in section_dict.items():
+            ignore_nesting_dict[key] = value
+
+    config_args = argparse.Namespace(**ignore_nesting_dict)
+    args = parser.parse_args(namespace=config_args)
+    args.config_file = os.path.splitext(args.config_file)[0]
+    logger.info(args.config_file)
+
+    return args
+
+
+if __name__ == "__main__":
+    parser = setup_parser()
+
+    args = parser.parse_args()
+    args = read_config_from_file(args, parser)
+
+    trainer = FineTuningTrainer()
+    trainer.train(args)
diff --git a/hv_train_network.py b/hv_train_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..6765b3dc35c52b6a758b9cc99e8d8668978fbfac
--- /dev/null
+++ b/hv_train_network.py
@@ -0,0 +1,2602 @@
+import ast
+import asyncio
+from datetime import timedelta
+import gc
+import importlib
+import argparse
+import math
+import os
+import pathlib
+import re
+import sys
+import random
+import time
+import json
+from multiprocessing import Value
+from typing import Any, Dict, List, Optional
+import accelerate
+import numpy as np
+from packaging.version import Version
+from PIL import Image
+
+import huggingface_hub
+import toml
+
+import torch
+from tqdm import tqdm
+from accelerate.utils import set_seed
+from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs, PartialState
+from safetensors.torch import load_file
+import transformers
+from diffusers.optimization import (
+    SchedulerType as DiffusersSchedulerType,
+    TYPE_TO_SCHEDULER_FUNCTION as DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION,
+)
+from transformers.optimization import SchedulerType, TYPE_TO_SCHEDULER_FUNCTION
+
+from dataset import config_utils
+from hunyuan_model.models import load_transformer, get_rotary_pos_embed_by_shape, HYVideoDiffusionTransformer
+import hunyuan_model.text_encoder as text_encoder_module
+from hunyuan_model.vae import load_vae, VAE_VER
+import hunyuan_model.vae as vae_module
+from modules.scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
+import networks.lora as lora_module
+from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
+from dataset.image_video_dataset import ARCHITECTURE_HUNYUAN_VIDEO, ARCHITECTURE_HUNYUAN_VIDEO_FULL
+from hv_generate_video import save_images_grid, save_videos_grid, resize_image_to_bucket, encode_to_latents
+
+import logging
+
+from utils import huggingface_utils, model_utils, train_utils, sai_model_spec
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+# String keys (all prefixed "ss_") for metadata entries describing the base model
+# version and the network (module, dim, alpha, args).
+# NOTE(review): where these keys are written/read is not visible in this chunk —
+# confirm against the metadata-saving code.
+SS_METADATA_KEY_BASE_MODEL_VERSION = "ss_base_model_version"
+SS_METADATA_KEY_NETWORK_MODULE = "ss_network_module"
+SS_METADATA_KEY_NETWORK_DIM = "ss_network_dim"
+SS_METADATA_KEY_NETWORK_ALPHA = "ss_network_alpha"
+SS_METADATA_KEY_NETWORK_ARGS = "ss_network_args"
+
+# The five keys above collected in one list.
+# NOTE(review): presumably the subset kept when only minimal metadata is saved —
+# confirm against usage.
+SS_METADATA_MINIMUM_KEYS = [
+    SS_METADATA_KEY_BASE_MODEL_VERSION,
+    SS_METADATA_KEY_NETWORK_MODULE,
+    SS_METADATA_KEY_NETWORK_DIM,
+    SS_METADATA_KEY_NETWORK_ALPHA,
+    SS_METADATA_KEY_NETWORK_ARGS,
+]
+
+
+def clean_memory_on_device(device: torch.device):
+    r"""
+    Clean memory on the specified device, will be called from training scripts.
+    """
+    gc.collect()
+
+    # device may "cuda" or "cuda:0", so we need to check the type of device
+    if device.type == "cuda":
+        torch.cuda.empty_cache()
+    if device.type == "xpu":
+        torch.xpu.empty_cache()
+    if device.type == "mps":
+        torch.mps.empty_cache()
+
+
+# for collate_fn: epoch and step are multiprocessing.Value objects
+class collator_class:
+    def __init__(self, epoch, step, dataset):
+        self.current_epoch = epoch
+        self.current_step = step
+        self.dataset = dataset  # not used if worker_info is not None, in case of multiprocessing
+
+    def __call__(self, examples):
+        worker_info = torch.utils.data.get_worker_info()
+        # worker_info is None in the main process
+        if worker_info is not None:
+            dataset = worker_info.dataset
+        else:
+            dataset = self.dataset
+
+        # set epoch and step
+        dataset.set_current_epoch(self.current_epoch.value)
+        dataset.set_current_step(self.current_step.value)
+        return examples[0]
+
+
+def prepare_accelerator(args: argparse.Namespace) -> Accelerator:
+    """
+    DeepSpeed is not supported in this script currently.
+    """
+    if args.logging_dir is None:
+        logging_dir = None
+    else:
+        log_prefix = "" if args.log_prefix is None else args.log_prefix
+        logging_dir = args.logging_dir + "/" + log_prefix + time.strftime("%Y%m%d%H%M%S", time.localtime())
+
+    if args.log_with is None:
+        if logging_dir is not None:
+            log_with = "tensorboard"
+        else:
+            log_with = None
+    else:
+        log_with = args.log_with
+        if log_with in ["tensorboard", "all"]:
+            if logging_dir is None:
+                raise ValueError(
+                    "logging_dir is required when log_with is tensorboard / Tensorboardを使う場合、logging_dirを指定してください"
+                )
+        if log_with in ["wandb", "all"]:
+            try:
+                import wandb
+            except ImportError:
+                raise ImportError("No wandb / wandb がインストールされていないようです")
+            if logging_dir is not None:
+                os.makedirs(logging_dir, exist_ok=True)
+                os.environ["WANDB_DIR"] = logging_dir
+            if args.wandb_api_key is not None:
+                wandb.login(key=args.wandb_api_key)
+
+    kwargs_handlers = [
+        (
+            InitProcessGroupKwargs(
+                backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
+                init_method=(
+                    "env://?use_libuv=False" if os.name == "nt" and Version(torch.__version__) >= Version("2.4.0") else None
+                ),
+                timeout=timedelta(minutes=args.ddp_timeout) if args.ddp_timeout else None,
+            )
+            if torch.cuda.device_count() > 1
+            else None
+        ),
+        (
+            DistributedDataParallelKwargs(
+                gradient_as_bucket_view=args.ddp_gradient_as_bucket_view, static_graph=args.ddp_static_graph
+            )
+            if args.ddp_gradient_as_bucket_view or args.ddp_static_graph
+            else None
+        ),
+    ]
+    kwargs_handlers = [i for i in kwargs_handlers if i is not None]
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=log_with,
+        project_dir=logging_dir,
+        kwargs_handlers=kwargs_handlers,
+    )
+    print("accelerator device:", accelerator.device)
+    return accelerator
+
+
+def line_to_prompt_dict(line: str) -> dict:
+    # subset of gen_img_diffusers
+    prompt_args = line.split(" --")
+    prompt_dict = {}
+    prompt_dict["prompt"] = prompt_args[0]
+
+    for parg in prompt_args:
+        try:
+            m = re.match(r"w (\d+)", parg, re.IGNORECASE)
+            if m:
+                prompt_dict["width"] = int(m.group(1))
+                continue
+
+            m = re.match(r"h (\d+)", parg, re.IGNORECASE)
+            if m:
+                prompt_dict["height"] = int(m.group(1))
+                continue
+
+            m = re.match(r"f (\d+)", parg, re.IGNORECASE)
+            if m:
+                prompt_dict["frame_count"] = int(m.group(1))
+                continue
+
+            m = re.match(r"d (\d+)", parg, re.IGNORECASE)
+            if m:
+                prompt_dict["seed"] = int(m.group(1))
+                continue
+
+            m = re.match(r"s (\d+)", parg, re.IGNORECASE)
+            if m:  # steps
+                prompt_dict["sample_steps"] = max(1, min(1000, int(m.group(1))))
+                continue
+
+            m = re.match(r"g ([\d\.]+)", parg, re.IGNORECASE)
+            if m:  # scale
+                prompt_dict["guidance_scale"] = float(m.group(1))
+                continue
+
+            m = re.match(r"fs ([\d\.]+)", parg, re.IGNORECASE)
+            if m:  # scale
+                prompt_dict["discrete_flow_shift"] = float(m.group(1))
+                continue
+
+            m = re.match(r"l ([\d\.]+)", parg, re.IGNORECASE)
+            if m:  # scale
+                prompt_dict["cfg_scale"] = float(m.group(1))
+                continue
+
+            m = re.match(r"n (.+)", parg, re.IGNORECASE)
+            if m:  # negative prompt
+                prompt_dict["negative_prompt"] = m.group(1)
+                continue
+
+            m = re.match(r"i (.+)", parg, re.IGNORECASE)
+            if m:  # negative prompt
+                prompt_dict["image_path"] = m.group(1)
+                continue
+
+        except ValueError as ex:
+            logger.error(f"Exception in parsing / 解析エラー: {parg}")
+            logger.error(ex)
+
+    return prompt_dict
+
+
+def load_prompts(prompt_file: str) -> list[Dict]:
+    # read prompts
+    if prompt_file.endswith(".txt"):
+        with open(prompt_file, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+        prompts = [line.strip() for line in lines if len(line.strip()) > 0 and line[0] != "#"]
+    elif prompt_file.endswith(".toml"):
+        with open(prompt_file, "r", encoding="utf-8") as f:
+            data = toml.load(f)
+        prompts = [dict(**data["prompt"], **subset) for subset in data["prompt"]["subset"]]
+    elif prompt_file.endswith(".json"):
+        with open(prompt_file, "r", encoding="utf-8") as f:
+            prompts = json.load(f)
+
+    # preprocess prompts
+    for i in range(len(prompts)):
+        prompt_dict = prompts[i]
+        if isinstance(prompt_dict, str):
+            prompt_dict = line_to_prompt_dict(prompt_dict)
+            prompts[i] = prompt_dict
+        assert isinstance(prompt_dict, dict)
+
+        # Adds an enumerator to the dict based on prompt position. Used later to name image files. Also cleanup of extra data in original prompt dict.
+        prompt_dict["enum"] = i
+        prompt_dict.pop("subset", None)
+
+    return prompts
+
+
+def compute_density_for_timestep_sampling(
+    weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
+):
+    """Compute the density for sampling the timesteps when doing SD3 training.
+
+    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
+
+    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
+    """
+    if weighting_scheme == "logit_normal":
+        # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
+        u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
+        u = torch.nn.functional.sigmoid(u)
+    elif weighting_scheme == "mode":
+        u = torch.rand(size=(batch_size,), device="cpu")
+        u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
+    else:
+        u = torch.rand(size=(batch_size,), device="cpu")
+    return u
+
+
+def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32):
+    sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
+    schedule_timesteps = noise_scheduler.timesteps.to(device)
+    timesteps = timesteps.to(device)
+
+    # if sum([(schedule_timesteps == t) for t in timesteps]) < len(timesteps):
+    if any([(schedule_timesteps == t).sum() == 0 for t in timesteps]):
+        # raise ValueError("Some timesteps are not in the schedule / 一部のtimestepsがスケジュールに含まれていません")
+        # round to nearest timestep
+        logger.warning("Some timesteps are not in the schedule / 一部のtimestepsがスケジュールに含まれていません")
+        step_indices = [torch.argmin(torch.abs(schedule_timesteps - t)).item() for t in timesteps]
+    else:
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+    sigma = sigmas[step_indices].flatten()
+    while len(sigma.shape) < n_dim:
+        sigma = sigma.unsqueeze(-1)
+    return sigma
+
+
+def compute_loss_weighting_for_sd3(weighting_scheme: str, noise_scheduler, timesteps, device, dtype):
+    """Computes loss weighting scheme for SD3 training.
+
+    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
+
+    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
+    """
+    if weighting_scheme == "sigma_sqrt" or weighting_scheme == "cosmap":
+        sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=5, dtype=dtype)
+        if weighting_scheme == "sigma_sqrt":
+            weighting = (sigmas**-2.0).float()
+        else:
+            bot = 1 - 2 * sigmas + 2 * sigmas**2
+            weighting = 2 / (math.pi * bot)
+    else:
+        weighting = None  # torch.ones_like(sigmas)
+    return weighting
+
+
+def should_sample_images(args, steps, epoch=None):
+    if steps == 0:
+        if not args.sample_at_first:
+            return False
+    else:
+        should_sample_by_steps = args.sample_every_n_steps is not None and steps % args.sample_every_n_steps == 0
+        should_sample_by_epochs = (
+            args.sample_every_n_epochs is not None and epoch is not None and epoch % args.sample_every_n_epochs == 0
+        )
+        if not should_sample_by_steps and not should_sample_by_epochs:
+            return False
+    return True
+
+
+class NetworkTrainer:
+    def __init__(self):
+        """Create a trainer with `blocks_to_swap` initially unset (None)."""
+        # NOTE(review): presumably assigned later from the --blocks_to_swap option — confirm.
+        self.blocks_to_swap = None
+
+    # TODO: share this with the other training scripts
+    def generate_step_logs(
+        self,
+        args: argparse.Namespace,
+        current_loss,
+        avr_loss,
+        lr_scheduler,
+        lr_descriptions,
+        optimizer=None,
+        keys_scaled=None,
+        mean_norm=None,
+        maximum_norm=None,
+    ):
+        network_train_unet_only = True
+        logs = {"loss/current": current_loss, "loss/average": avr_loss}
+
+        if keys_scaled is not None:
+            logs["max_norm/keys_scaled"] = keys_scaled
+            logs["max_norm/average_key_norm"] = mean_norm
+            logs["max_norm/max_key_norm"] = maximum_norm
+
+        lrs = lr_scheduler.get_last_lr()
+        for i, lr in enumerate(lrs):
+            if lr_descriptions is not None:
+                lr_desc = lr_descriptions[i]
+            else:
+                idx = i - (0 if network_train_unet_only else -1)
+                if idx == -1:
+                    lr_desc = "textencoder"
+                else:
+                    if len(lrs) > 2:
+                        lr_desc = f"group{idx}"
+                    else:
+                        lr_desc = "unet"
+
+            logs[f"lr/{lr_desc}"] = lr
+
+            if args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower():
+                # tracking d*lr value
+                logs[f"lr/d*lr/{lr_desc}"] = (
+                    lr_scheduler.optimizers[-1].param_groups[i]["d"] * lr_scheduler.optimizers[-1].param_groups[i]["lr"]
+                )
+            if (
+                args.optimizer_type.lower().endswith("ProdigyPlusScheduleFree".lower()) and optimizer is not None
+            ):  # tracking d*lr value of unet.
+                logs["lr/d*lr"] = optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"]
+        else:
+            idx = 0
+            if not network_train_unet_only:
+                logs["lr/textencoder"] = float(lrs[0])
+                idx = 1
+
+            for i in range(idx, len(lrs)):
+                logs[f"lr/group{i}"] = float(lrs[i])
+                if args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower():
+                    logs[f"lr/d*lr/group{i}"] = (
+                        lr_scheduler.optimizers[-1].param_groups[i]["d"] * lr_scheduler.optimizers[-1].param_groups[i]["lr"]
+                    )
+                if args.optimizer_type.lower().endswith("ProdigyPlusScheduleFree".lower()) and optimizer is not None:
+                    logs[f"lr/d*lr/group{i}"] = optimizer.param_groups[i]["d"] * optimizer.param_groups[i]["lr"]
+
+        return logs
+
+    def get_optimizer(self, args, trainable_params: list[torch.nn.Parameter]) -> tuple[str, str, torch.optim.Optimizer]:
+        # adamw, adamw8bit, adafactor
+
+        optimizer_type = args.optimizer_type.lower()
+
+        # split optimizer_type and optimizer_args
+        optimizer_kwargs = {}
+        if args.optimizer_args is not None and len(args.optimizer_args) > 0:
+            for arg in args.optimizer_args:
+                key, value = arg.split("=")
+                value = ast.literal_eval(value)
+                optimizer_kwargs[key] = value
+
+        lr = args.learning_rate
+        optimizer = None
+        optimizer_class = None
+
+        if optimizer_type.endswith("8bit".lower()):
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError("No bitsandbytes / bitsandbytesがインストールされていないようです")
+
+            if optimizer_type == "AdamW8bit".lower():
+                logger.info(f"use 8-bit AdamW optimizer | {optimizer_kwargs}")
+                optimizer_class = bnb.optim.AdamW8bit
+                optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
+
+        elif optimizer_type == "Adafactor".lower():
+            # Adafactor: check relative_step and warmup_init
+            if "relative_step" not in optimizer_kwargs:
+                optimizer_kwargs["relative_step"] = True  # default
+            if not optimizer_kwargs["relative_step"] and optimizer_kwargs.get("warmup_init", False):
+                logger.info(
+                    f"set relative_step to True because warmup_init is True / warmup_initがTrueのためrelative_stepをTrueにします"
+                )
+                optimizer_kwargs["relative_step"] = True
+            logger.info(f"use Adafactor optimizer | {optimizer_kwargs}")
+
+            if optimizer_kwargs["relative_step"]:
+                logger.info(f"relative_step is true / relative_stepがtrueです")
+                if lr != 0.0:
+                    logger.warning(f"learning rate is used as initial_lr / 指定したlearning rateはinitial_lrとして使用されます")
+                args.learning_rate = None
+
+                if args.lr_scheduler != "adafactor":
+                    logger.info(f"use adafactor_scheduler / スケジューラにadafactor_schedulerを使用します")
+                args.lr_scheduler = f"adafactor:{lr}"  # ちょっと微妙だけど
+
+                lr = None
+            else:
+                if args.max_grad_norm != 0.0:
+                    logger.warning(
+                        f"because max_grad_norm is set, clip_grad_norm is enabled. consider set to 0 / max_grad_normが設定されているためclip_grad_normが有効になります。0に設定して無効にしたほうがいいかもしれません"
+                    )
+                if args.lr_scheduler != "constant_with_warmup":
+                    logger.warning(f"constant_with_warmup will be good / スケジューラはconstant_with_warmupが良いかもしれません")
+                if optimizer_kwargs.get("clip_threshold", 1.0) != 1.0:
+                    logger.warning(f"clip_threshold=1.0 will be good / clip_thresholdは1.0が良いかもしれません")
+
+            optimizer_class = transformers.optimization.Adafactor
+            optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
+
+        elif optimizer_type == "AdamW".lower():
+            logger.info(f"use AdamW optimizer | {optimizer_kwargs}")
+            optimizer_class = torch.optim.AdamW
+            optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
+
+        if optimizer is None:
+            # 任意のoptimizerを使う
+            case_sensitive_optimizer_type = args.optimizer_type  # not lower
+            logger.info(f"use {case_sensitive_optimizer_type} | {optimizer_kwargs}")
+
+            if "." not in case_sensitive_optimizer_type:  # from torch.optim
+                optimizer_module = torch.optim
+            else:  # from other library
+                values = case_sensitive_optimizer_type.split(".")
+                optimizer_module = importlib.import_module(".".join(values[:-1]))
+                case_sensitive_optimizer_type = values[-1]
+
+            optimizer_class = getattr(optimizer_module, case_sensitive_optimizer_type)
+            optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
+
+        # for logging
+        optimizer_name = optimizer_class.__module__ + "." + optimizer_class.__name__
+        optimizer_args = ",".join([f"{k}={v}" for k, v in optimizer_kwargs.items()])
+
+        # get train and eval functions
+        if hasattr(optimizer, "train") and callable(optimizer.train):
+            train_fn = optimizer.train
+            eval_fn = optimizer.eval
+        else:
+            train_fn = lambda: None
+            eval_fn = lambda: None
+
+        return optimizer_name, optimizer_args, optimizer, train_fn, eval_fn
+
+    def is_schedulefree_optimizer(self, optimizer: torch.optim.Optimizer, args: argparse.Namespace) -> bool:
+        return args.optimizer_type.lower().endswith("schedulefree".lower())  # or args.optimizer_schedulefree_wrapper
+
+    def get_dummy_scheduler(optimizer: torch.optim.Optimizer) -> Any:
+        # dummy scheduler for schedulefree optimizer. supports only empty step(), get_last_lr() and optimizers.
+        # this scheduler is used for logging only.
+        # this isn't be wrapped by accelerator because of this class is not a subclass of torch.optim.lr_scheduler._LRScheduler
+        class DummyScheduler:
+            def __init__(self, optimizer: torch.optim.Optimizer):
+                self.optimizer = optimizer
+
+            def step(self):
+                pass
+
+            def get_last_lr(self):
+                return [group["lr"] for group in self.optimizer.param_groups]
+
+        return DummyScheduler(optimizer)
+
+    def get_lr_scheduler(self, args, optimizer: torch.optim.Optimizer, num_processes: int):
+        """
+        Unified API to get any scheduler from its name.
+        """
+        # if schedulefree optimizer, return dummy scheduler
+        if self.is_schedulefree_optimizer(optimizer, args):
+            return self.get_dummy_scheduler(optimizer)
+
+        name = args.lr_scheduler
+        num_training_steps = args.max_train_steps * num_processes  # * args.gradient_accumulation_steps
+        num_warmup_steps: Optional[int] = (
+            int(args.lr_warmup_steps * num_training_steps) if isinstance(args.lr_warmup_steps, float) else args.lr_warmup_steps
+        )
+        num_decay_steps: Optional[int] = (
+            int(args.lr_decay_steps * num_training_steps) if isinstance(args.lr_decay_steps, float) else args.lr_decay_steps
+        )
+        num_stable_steps = num_training_steps - num_warmup_steps - num_decay_steps
+        num_cycles = args.lr_scheduler_num_cycles
+        power = args.lr_scheduler_power
+        timescale = args.lr_scheduler_timescale
+        min_lr_ratio = args.lr_scheduler_min_lr_ratio
+
+        lr_scheduler_kwargs = {}  # get custom lr_scheduler kwargs
+        if args.lr_scheduler_args is not None and len(args.lr_scheduler_args) > 0:
+            for arg in args.lr_scheduler_args:
+                key, value = arg.split("=")
+                value = ast.literal_eval(value)
+                lr_scheduler_kwargs[key] = value
+
+        def wrap_check_needless_num_warmup_steps(return_vals):
+            if num_warmup_steps is not None and num_warmup_steps != 0:
+                raise ValueError(f"{name} does not require `num_warmup_steps`. Set None or 0.")
+            return return_vals
+
+        # using any lr_scheduler from other library
+        if args.lr_scheduler_type:
+            lr_scheduler_type = args.lr_scheduler_type
+            logger.info(f"use {lr_scheduler_type} | {lr_scheduler_kwargs} as lr_scheduler")
+            if "." not in lr_scheduler_type:  # default to use torch.optim
+                lr_scheduler_module = torch.optim.lr_scheduler
+            else:
+                values = lr_scheduler_type.split(".")
+                lr_scheduler_module = importlib.import_module(".".join(values[:-1]))
+                lr_scheduler_type = values[-1]
+            lr_scheduler_class = getattr(lr_scheduler_module, lr_scheduler_type)
+            lr_scheduler = lr_scheduler_class(optimizer, **lr_scheduler_kwargs)
+            return lr_scheduler
+
+        if name.startswith("adafactor"):
+            assert (
+                type(optimizer) == transformers.optimization.Adafactor
+            ), f"adafactor scheduler must be used with Adafactor optimizer / adafactor schedulerはAdafactorオプティマイザと同時に使ってください"
+            initial_lr = float(name.split(":")[1])
+            # logger.info(f"adafactor scheduler init lr {initial_lr}")
+            return wrap_check_needless_num_warmup_steps(transformers.optimization.AdafactorSchedule(optimizer, initial_lr))
+
+        if name == DiffusersSchedulerType.PIECEWISE_CONSTANT.value:
+            name = DiffusersSchedulerType(name)
+            schedule_func = DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION[name]
+            return schedule_func(optimizer, **lr_scheduler_kwargs)  # step_rules and last_epoch are given as kwargs
+
+        name = SchedulerType(name)
+        schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
+
+        if name == SchedulerType.CONSTANT:
+            return wrap_check_needless_num_warmup_steps(schedule_func(optimizer, **lr_scheduler_kwargs))
+
+        # All other schedulers require `num_warmup_steps`
+        if num_warmup_steps is None:
+            raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
+
+        if name == SchedulerType.CONSTANT_WITH_WARMUP:
+            return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **lr_scheduler_kwargs)
+
+        if name == SchedulerType.INVERSE_SQRT:
+            return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, timescale=timescale, **lr_scheduler_kwargs)
+
+        # All other schedulers require `num_training_steps`
+        if num_training_steps is None:
+            raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
+
+        if name == SchedulerType.COSINE_WITH_RESTARTS:
+            return schedule_func(
+                optimizer,
+                num_warmup_steps=num_warmup_steps,
+                num_training_steps=num_training_steps,
+                num_cycles=num_cycles,
+                **lr_scheduler_kwargs,
+            )
+
+        if name == SchedulerType.POLYNOMIAL:
+            return schedule_func(
+                optimizer,
+                num_warmup_steps=num_warmup_steps,
+                num_training_steps=num_training_steps,
+                power=power,
+                **lr_scheduler_kwargs,
+            )
+
+        if name == SchedulerType.COSINE_WITH_MIN_LR:
+            return schedule_func(
+                optimizer,
+                num_warmup_steps=num_warmup_steps,
+                num_training_steps=num_training_steps,
+                num_cycles=num_cycles / 2,
+                min_lr_rate=min_lr_ratio,
+                **lr_scheduler_kwargs,
+            )
+
+        # these schedulers do not require `num_decay_steps`
+        if name == SchedulerType.LINEAR or name == SchedulerType.COSINE:
+            return schedule_func(
+                optimizer,
+                num_warmup_steps=num_warmup_steps,
+                num_training_steps=num_training_steps,
+                **lr_scheduler_kwargs,
+            )
+
+        # All other schedulers require `num_decay_steps`
+        if num_decay_steps is None:
+            raise ValueError(f"{name} requires `num_decay_steps`, please provide that argument.")
+        if name == SchedulerType.WARMUP_STABLE_DECAY:
+            return schedule_func(
+                optimizer,
+                num_warmup_steps=num_warmup_steps,
+                num_stable_steps=num_stable_steps,
+                num_decay_steps=num_decay_steps,
+                num_cycles=num_cycles / 2,
+                min_lr_ratio=min_lr_ratio if min_lr_ratio is not None else 0.0,
+                **lr_scheduler_kwargs,
+            )
+
+        return schedule_func(
+            optimizer,
+            num_warmup_steps=num_warmup_steps,
+            num_training_steps=num_training_steps,
+            num_decay_steps=num_decay_steps,
+            **lr_scheduler_kwargs,
+        )
+
+    def resume_from_local_or_hf_if_specified(self, accelerator: Accelerator, args: argparse.Namespace) -> bool:
+        if not args.resume:
+            return False
+
+        if not args.resume_from_huggingface:
+            logger.info(f"resume training from local state: {args.resume}")
+            accelerator.load_state(args.resume)
+            return True
+
+        logger.info(f"resume training from huggingface state: {args.resume}")
+        repo_id = args.resume.split("/")[0] + "/" + args.resume.split("/")[1]
+        path_in_repo = "/".join(args.resume.split("/")[2:])
+        revision = None
+        repo_type = None
+        if ":" in path_in_repo:
+            divided = path_in_repo.split(":")
+            if len(divided) == 2:
+                path_in_repo, revision = divided
+                repo_type = "model"
+            else:
+                path_in_repo, revision, repo_type = divided
+        logger.info(f"Downloading state from huggingface: {repo_id}/{path_in_repo}@{revision}")
+
+        list_files = huggingface_utils.list_dir(
+            repo_id=repo_id,
+            subfolder=path_in_repo,
+            revision=revision,
+            token=args.huggingface_token,
+            repo_type=repo_type,
+        )
+
+        async def download(filename) -> str:
+            def task():
+                return huggingface_hub.hf_hub_download(
+                    repo_id=repo_id,
+                    filename=filename,
+                    revision=revision,
+                    repo_type=repo_type,
+                    token=args.huggingface_token,
+                )
+
+            return await asyncio.get_event_loop().run_in_executor(None, task)
+
+        loop = asyncio.get_event_loop()
+        results = loop.run_until_complete(asyncio.gather(*[download(filename=filename.rfilename) for filename in list_files]))
+        if len(results) == 0:
+            raise ValueError(
+                "No files found in the specified repo id/path/revision / 指定されたリポジトリID/パス/リビジョンにファイルが見つかりませんでした"
+            )
+        dirname = os.path.dirname(results[0])
+        accelerator.load_state(dirname)
+
+        return True
+
+    def get_noisy_model_input_and_timesteps(
+        self,
+        args: argparse.Namespace,
+        noise: torch.Tensor,
+        latents: torch.Tensor,
+        noise_scheduler: FlowMatchDiscreteScheduler,
+        device: torch.device,
+        dtype: torch.dtype,
+    ):
+        batch_size = noise.shape[0]
+
+        if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid" or args.timestep_sampling == "shift":
+            if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid":
+                # Simple random t-based noise sampling
+                if args.timestep_sampling == "sigmoid":
+                    t = torch.sigmoid(args.sigmoid_scale * torch.randn((batch_size,), device=device))
+                else:
+                    t = torch.rand((batch_size,), device=device)
+
+            elif args.timestep_sampling == "shift":
+                shift = args.discrete_flow_shift
+                logits_norm = torch.randn(batch_size, device=device)
+                logits_norm = logits_norm * args.sigmoid_scale  # larger scale for more uniform sampling
+                t = logits_norm.sigmoid()
+                t = (t * shift) / (1 + (shift - 1) * t)
+
+            t_min = args.min_timestep if args.min_timestep is not None else 0
+            t_max = args.max_timestep if args.max_timestep is not None else 1000.0
+            t_min /= 1000.0
+            t_max /= 1000.0
+            t = t * (t_max - t_min) + t_min  # scale to [t_min, t_max], default [0, 1]
+
+            timesteps = t * 1000.0
+            t = t.view(-1, 1, 1, 1, 1)
+            noisy_model_input = (1 - t) * latents + t * noise
+
+            timesteps += 1  # 1 to 1000
+        else:
+            # Sample a random timestep for each image
+            # for weighting schemes where we sample timesteps non-uniformly
+            u = compute_density_for_timestep_sampling(
+                weighting_scheme=args.weighting_scheme,
+                batch_size=batch_size,
+                logit_mean=args.logit_mean,
+                logit_std=args.logit_std,
+                mode_scale=args.mode_scale,
+            )
+            # indices = (u * noise_scheduler.config.num_train_timesteps).long()
+            t_min = args.min_timestep if args.min_timestep is not None else 0
+            t_max = args.max_timestep if args.max_timestep is not None else 1000
+            indices = (u * (t_max - t_min) + t_min).long()
+
+            timesteps = noise_scheduler.timesteps[indices].to(device=device)  # 1 to 1000
+
+            # Add noise according to flow matching.
+            sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype)
+            noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents
+
+        return noisy_model_input, timesteps
+
+    def show_timesteps(self, args: argparse.Namespace):
+        N_TRY = 100000
+        BATCH_SIZE = 1000
+        CONSOLE_WIDTH = 64
+        N_TIMESTEPS_PER_LINE = 25
+
+        noise_scheduler = FlowMatchDiscreteScheduler(shift=args.discrete_flow_shift, reverse=True, solver="euler")
+        # print(f"Noise scheduler timesteps: {noise_scheduler.timesteps}")
+
+        latents = torch.zeros(BATCH_SIZE, 1, 1, 1, 1, dtype=torch.float16)
+        noise = torch.ones_like(latents)
+
+        # sample timesteps
+        sampled_timesteps = [0] * noise_scheduler.config.num_train_timesteps
+        for i in tqdm(range(N_TRY // BATCH_SIZE)):
+            # we use noise=1, so retured noisy_model_input is same as timestep, because `noisy_model_input = (1 - t) * latents + t * noise`
+            actual_timesteps, _ = self.get_noisy_model_input_and_timesteps(
+                args, noise, latents, noise_scheduler, "cpu", torch.float16
+            )
+            actual_timesteps = actual_timesteps[:, 0, 0, 0, 0] * 1000
+            for t in actual_timesteps:
+                t = int(t.item())
+                sampled_timesteps[t] += 1
+
+        # sample weighting
+        sampled_weighting = [0] * noise_scheduler.config.num_train_timesteps
+        for i in tqdm(range(len(sampled_weighting))):
+            timesteps = torch.tensor([i + 1], device="cpu")
+            weighting = compute_loss_weighting_for_sd3(args.weighting_scheme, noise_scheduler, timesteps, "cpu", torch.float16)
+            if weighting is None:
+                weighting = torch.tensor(1.0, device="cpu")
+            elif torch.isinf(weighting).any():
+                weighting = torch.tensor(1.0, device="cpu")
+            sampled_weighting[i] = weighting.item()
+
+        # show results
+        if args.show_timesteps == "image":
+            # show timesteps with matplotlib
+            import matplotlib.pyplot as plt
+
+            plt.figure(figsize=(10, 5))
+            plt.subplot(1, 2, 1)
+            plt.bar(range(len(sampled_timesteps)), sampled_timesteps, width=1.0)
+            plt.title("Sampled timesteps")
+            plt.xlabel("Timestep")
+            plt.ylabel("Count")
+
+            plt.subplot(1, 2, 2)
+            plt.bar(range(len(sampled_weighting)), sampled_weighting, width=1.0)
+            plt.title("Sampled loss weighting")
+            plt.xlabel("Timestep")
+            plt.ylabel("Weighting")
+
+            plt.tight_layout()
+            plt.show()
+
+        else:
+            sampled_timesteps = np.array(sampled_timesteps)
+            sampled_weighting = np.array(sampled_weighting)
+
+            # average per line
+            sampled_timesteps = sampled_timesteps.reshape(-1, N_TIMESTEPS_PER_LINE).mean(axis=1)
+            sampled_weighting = sampled_weighting.reshape(-1, N_TIMESTEPS_PER_LINE).mean(axis=1)
+
+            max_count = max(sampled_timesteps)
+            print(f"Sampled timesteps: max count={max_count}")
+            for i, t in enumerate(sampled_timesteps):
+                line = f"{(i)*N_TIMESTEPS_PER_LINE:4d}-{(i+1)*N_TIMESTEPS_PER_LINE-1:4d}: "
+                line += "#" * int(t / max_count * CONSOLE_WIDTH)
+                print(line)
+
+            max_weighting = max(sampled_weighting)
+            print(f"Sampled loss weighting: max weighting={max_weighting}")
+            for i, w in enumerate(sampled_weighting):
+                line = f"{i*N_TIMESTEPS_PER_LINE:4d}-{(i+1)*N_TIMESTEPS_PER_LINE-1:4d}: {w:8.2f} "
+                line += "#" * int(w / max_weighting * CONSOLE_WIDTH)
+                print(line)
+
+    def sample_images(self, accelerator, args, epoch, steps, vae, transformer, sample_parameters, dit_dtype):
+        """architecture independent sample images"""
+        if not should_sample_images(args, steps, epoch):
+            return
+
+        logger.info("")
+        logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}")
+        if sample_parameters is None:
+            logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}")
+            return
+
+        distributed_state = PartialState()  # for multi gpu distributed inference. this is a singleton, so it's safe to use it here
+
+        # Use the unwrapped model
+        transformer = accelerator.unwrap_model(transformer)
+        transformer.switch_block_swap_for_inference()
+
+        # Create a directory to save the samples
+        save_dir = args.output_dir + "/sample"
+        os.makedirs(save_dir, exist_ok=True)
+
+        # save random state to restore later
+        rng_state = torch.get_rng_state()
+        cuda_rng_state = None
+        try:
+            cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None
+        except Exception:
+            pass
+
+        if distributed_state.num_processes <= 1:
+            # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts.
+            with torch.no_grad(), accelerator.autocast():
+                for sample_parameter in sample_parameters:
+                    self.sample_image_inference(
+                        accelerator, args, transformer, dit_dtype, vae, save_dir, sample_parameter, epoch, steps
+                    )
+                    clean_memory_on_device(accelerator.device)
+        else:
+            # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available)
+            # prompt_dicts are assigned to lists based on order of processes, to attempt to time the image creation time to match enum order. Probably only works when steps and sampler are identical.
+            per_process_params = []  # list of lists
+            for i in range(distributed_state.num_processes):
+                per_process_params.append(sample_parameters[i :: distributed_state.num_processes])
+
+            with torch.no_grad():
+                with distributed_state.split_between_processes(per_process_params) as sample_parameter_lists:
+                    for sample_parameter in sample_parameter_lists[0]:
+                        self.sample_image_inference(
+                            accelerator, args, transformer, dit_dtype, vae, save_dir, sample_parameter, epoch, steps
+                        )
+                        clean_memory_on_device(accelerator.device)
+
+        torch.set_rng_state(rng_state)
+        if cuda_rng_state is not None:
+            torch.cuda.set_rng_state(cuda_rng_state)
+
+        transformer.switch_block_swap_for_training()
+        clean_memory_on_device(accelerator.device)
+
+    def sample_image_inference(self, accelerator, args, transformer, dit_dtype, vae, save_dir, sample_parameter, epoch, steps):
+        """architecture independent sample images"""
+        sample_steps = sample_parameter.get("sample_steps", 20)
+        width = sample_parameter.get("width", 256)  # make smaller for faster and memory saving inference
+        height = sample_parameter.get("height", 256)
+        frame_count = sample_parameter.get("frame_count", 1)
+        guidance_scale = sample_parameter.get("guidance_scale", 6.0)
+        discrete_flow_shift = sample_parameter.get("discrete_flow_shift", 14.5)
+        seed = sample_parameter.get("seed")
+        prompt: str = sample_parameter.get("prompt", "")
+        cfg_scale = sample_parameter.get("cfg_scale", None)  # None for architecture default
+        negative_prompt = sample_parameter.get("negative_prompt", None)
+
+        if self.i2v_training:
+            image_path = sample_parameter.get("image_path", None)
+            if image_path is None:
+                logger.error("No image_path for i2v model / i2vモデルのサンプル画像生成にはimage_pathが必要です")
+                return
+        else:
+            image_path = None
+
+        device = accelerator.device
+        if seed is not None:
+            torch.manual_seed(seed)
+            torch.cuda.manual_seed(seed)
+            generator = torch.Generator(device=device).manual_seed(seed)
+        else:
+            # True random sample image generation
+            torch.seed()
+            torch.cuda.seed()
+            generator = torch.Generator(device=device).manual_seed(torch.initial_seed())
+
+        logger.info(f"prompt: {prompt}")
+        logger.info(f"height: {height}")
+        logger.info(f"width: {width}")
+        logger.info(f"frame count: {frame_count}")
+        logger.info(f"sample steps: {sample_steps}")
+        logger.info(f"guidance scale: {guidance_scale}")
+        logger.info(f"discrete flow shift: {discrete_flow_shift}")
+        if seed is not None:
+            logger.info(f"seed: {seed}")
+
+        do_classifier_free_guidance = False
+        if negative_prompt is not None:
+            do_classifier_free_guidance = True
+            logger.info(f"negative prompt: {negative_prompt}")
+            logger.info(f"cfg scale: {cfg_scale}")
+
+        if self.i2v_training:
+            logger.info(f"image path: {image_path}")
+
+        # inference: architecture dependent
+        video = self.do_inference(
+            accelerator,
+            args,
+            sample_parameter,
+            vae,
+            dit_dtype,
+            transformer,
+            discrete_flow_shift,
+            sample_steps,
+            width,
+            height,
+            frame_count,
+            generator,
+            do_classifier_free_guidance,
+            guidance_scale,
+            cfg_scale,
+            image_path=image_path,
+        )
+
+        # Save video
+        ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
+        num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}"
+        seed_suffix = "" if seed is None else f"_{seed}"
+        prompt_idx = sample_parameter.get("enum", 0)
+        save_path = (
+            f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{prompt_idx:02d}_{ts_str}{seed_suffix}"
+        )
+        if video.shape[2] == 1:
+            save_images_grid(video, save_dir, save_path, create_subdir=False)
+        else:
+            save_videos_grid(video, os.path.join(save_dir, save_path) + ".mp4")
+
+        # Move models back to initial state
+        vae.to("cpu")
+        clean_memory_on_device(device)
+
+    # region model specific
+
+    @property
+    def architecture(self) -> str:
+        """Architecture identifier of this trainer (the ``ARCHITECTURE_HUNYUAN_VIDEO`` constant)."""
+        return ARCHITECTURE_HUNYUAN_VIDEO
+
+    @property
+    def architecture_full_name(self) -> str:
+        """Full architecture name of this trainer (the ``ARCHITECTURE_HUNYUAN_VIDEO_FULL`` constant)."""
+        return ARCHITECTURE_HUNYUAN_VIDEO_FULL
+
+    def handle_model_specific_args(self, args: argparse.Namespace):
+        self.pos_embed_cache = {}
+
+        self._i2v_training = args.dit_in_channels == 32  # may be changed in the future
+        if self._i2v_training:
+            logger.info("I2V training mode")
+
+    @property
+    def i2v_training(self) -> bool:
+        """Whether image-to-video training is active; reads the ``_i2v_training`` flag set by ``handle_model_specific_args``."""
+        return self._i2v_training
+
+    def process_sample_prompts(
+        self,
+        args: argparse.Namespace,
+        accelerator: Accelerator,
+        sample_prompts: str,
+    ):
+        text_encoder1, text_encoder2, fp8_llm = args.text_encoder1, args.text_encoder2, args.fp8_llm
+
+        logger.info(f"cache Text Encoder outputs for sample prompt: {sample_prompts}")
+        prompts = load_prompts(sample_prompts)
+
+        def encode_for_text_encoder(text_encoder, is_llm=True):
+            sample_prompts_te_outputs = {}  # (prompt) -> (embeds, mask)
+            with accelerator.autocast(), torch.no_grad():
+                for prompt_dict in prompts:
+                    for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", None)]:
+                        if p is None:
+                            continue
+                        if p not in sample_prompts_te_outputs:
+                            logger.info(f"cache Text Encoder outputs for prompt: {p}")
+
+                            data_type = "video"
+                            text_inputs = text_encoder.text2tokens(p, data_type=data_type)
+
+                            prompt_outputs = text_encoder.encode(text_inputs, data_type=data_type)
+                            sample_prompts_te_outputs[p] = (prompt_outputs.hidden_state, prompt_outputs.attention_mask)
+
+            return sample_prompts_te_outputs
+
+        # Load Text Encoder 1 and encode
+        text_encoder_dtype = torch.float16 if args.text_encoder_dtype is None else model_utils.str_to_dtype(args.text_encoder_dtype)
+        logger.info(f"loading text encoder 1: {text_encoder1}")
+        text_encoder_1 = text_encoder_module.load_text_encoder_1(text_encoder1, accelerator.device, fp8_llm, text_encoder_dtype)
+
+        logger.info("encoding with Text Encoder 1")
+        te_outputs_1 = encode_for_text_encoder(text_encoder_1)
+        del text_encoder_1
+
+        # Load Text Encoder 2 and encode
+        logger.info(f"loading text encoder 2: {text_encoder2}")
+        text_encoder_2 = text_encoder_module.load_text_encoder_2(text_encoder2, accelerator.device, text_encoder_dtype)
+
+        logger.info("encoding with Text Encoder 2")
+        te_outputs_2 = encode_for_text_encoder(text_encoder_2, is_llm=False)
+        del text_encoder_2
+
+        # prepare sample parameters
+        sample_parameters = []
+        for prompt_dict in prompts:
+            prompt_dict_copy = prompt_dict.copy()
+
+            p = prompt_dict.get("prompt", "")
+            prompt_dict_copy["llm_embeds"] = te_outputs_1[p][0]
+            prompt_dict_copy["llm_mask"] = te_outputs_1[p][1]
+            prompt_dict_copy["clipL_embeds"] = te_outputs_2[p][0]
+            prompt_dict_copy["clipL_mask"] = te_outputs_2[p][1]
+
+            p = prompt_dict.get("negative_prompt", None)
+            if p is not None:
+                prompt_dict_copy["negative_llm_embeds"] = te_outputs_1[p][0]
+                prompt_dict_copy["negative_llm_mask"] = te_outputs_1[p][1]
+                prompt_dict_copy["negative_clipL_embeds"] = te_outputs_2[p][0]
+                prompt_dict_copy["negative_clipL_mask"] = te_outputs_2[p][1]
+
+            sample_parameters.append(prompt_dict_copy)
+
+        clean_memory_on_device(accelerator.device)
+
+        return sample_parameters
+
+    def do_inference(
+        self,
+        accelerator,
+        args,
+        sample_parameter,
+        vae,
+        dit_dtype,
+        transformer,
+        discrete_flow_shift,
+        sample_steps,
+        width,
+        height,
+        frame_count,
+        generator,
+        do_classifier_free_guidance,
+        guidance_scale,
+        cfg_scale,
+        image_path=None,
+    ):
+        """architecture dependent inference
+
+        Generates one sample: builds a ``FlowMatchDiscreteScheduler``, draws
+        initial latents from ``generator``, runs the denoising loop with
+        ``transformer`` (optionally with classifier-free guidance and, for I2V
+        training, with image latents concatenated on the channel axis), then
+        decodes the result with ``vae``.
+
+        Args:
+            sample_parameter: dict holding the cached text-encoder outputs
+                (``llm_embeds``, ``llm_mask``, ``clipL_embeds`` and, for CFG, the
+                ``negative_*`` counterparts) plus an optional ``enum`` index.
+            cfg_scale: classifier-free guidance scale; ``None`` is treated as 1.0,
+                which disables CFG.
+            image_path: conditioning image, opened only when ``self.i2v_training``.
+
+        Returns:
+            The decoded video, rescaled with ``video / 2 + 0.5``, clamped to
+            [0, 1] and moved to CPU as float.
+        """
+        device = accelerator.device
+        if cfg_scale is None:
+            cfg_scale = 1.0
+        # a scale of exactly 1.0 makes the guided prediction equal the conditional one, so skip CFG
+        do_classifier_free_guidance = do_classifier_free_guidance and cfg_scale != 1.0
+
+        # Prepare scheduler for each prompt
+        scheduler = FlowMatchDiscreteScheduler(shift=discrete_flow_shift, reverse=True, solver="euler")
+
+        # Number of inference steps for sampling
+        scheduler.set_timesteps(sample_steps, device=device)
+        timesteps = scheduler.timesteps
+
+        # Calculate latent video length based on VAE version
+        if "884" in VAE_VER:
+            latent_video_length = (frame_count - 1) // 4 + 1
+        elif "888" in VAE_VER:
+            latent_video_length = (frame_count - 1) // 8 + 1
+        else:
+            latent_video_length = frame_count
+
+        # Get embeddings
+        prompt_embeds = sample_parameter["llm_embeds"].to(device=device, dtype=dit_dtype)
+        prompt_mask = sample_parameter["llm_mask"].to(device=device)
+        prompt_embeds_2 = sample_parameter["clipL_embeds"].to(device=device, dtype=dit_dtype)
+
+        if do_classifier_free_guidance:
+            # batch layout for CFG is [negative, positive]; it must match the chunk(2) order below
+            negative_prompt_embeds = sample_parameter["negative_llm_embeds"].to(device=device, dtype=dit_dtype)
+            negative_prompt_mask = sample_parameter["negative_llm_mask"].to(device=device)
+            negative_prompt_embeds_2 = sample_parameter["negative_clipL_embeds"].to(device=device, dtype=dit_dtype)
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_mask = torch.cat([negative_prompt_mask, prompt_mask], dim=0)
+            prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2], dim=0)
+
+        num_channels_latents = 16  # transformer.config.in_channels
+        vae_scale_factor = 2 ** (4 - 1)  # Assuming 4 VAE blocks
+
+        # Initialize latents
+        shape_or_frame = (
+            1,
+            num_channels_latents,
+            1,
+            height // vae_scale_factor,
+            width // vae_scale_factor,
+        )
+        # noise is drawn one latent frame at a time and concatenated along the frame axis (dim 2)
+        latents = []
+        for _ in range(latent_video_length):
+            latents.append(torch.randn(shape_or_frame, generator=generator, device=device, dtype=dit_dtype))
+        latents = torch.cat(latents, dim=2)
+
+        if self.i2v_training:
+            # Move VAE to the appropriate device for sampling
+            vae.to(device)
+            vae.eval()
+
+            image = Image.open(image_path)
+            image = resize_image_to_bucket(image, (width, height))  # returns a numpy array
+            image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).unsqueeze(2).float()  # 1, C, 1, H, W
+            image = image / 255.0
+
+            logger.info(f"Encoding image to latents")
+            image_latents = encode_to_latents(args, image, device)  # 1, C, 1, H, W
+            image_latents = image_latents.to(device=device, dtype=dit_dtype)
+
+            vae.to("cpu")
+            clean_memory_on_device(device)
+
+            # conditioning tensor: the image latent in the first frame, zeros in all other frames
+            zero_latents = torch.zeros_like(latents)
+            zero_latents[:, :, :1, :, :] = image_latents
+            image_latents = zero_latents
+        else:
+            image_latents = None
+
+        # Guidance scale
+        guidance_expand = torch.tensor([guidance_scale * 1000.0], dtype=torch.float32, device=device).to(dit_dtype)
+
+        # Get rotary positional embeddings
+        freqs_cos, freqs_sin = get_rotary_pos_embed_by_shape(transformer, latents.shape[2:])
+        freqs_cos = freqs_cos.to(device=device, dtype=dit_dtype)
+        freqs_sin = freqs_sin.to(device=device, dtype=dit_dtype)
+
+        # Wrap the inner loop with tqdm to track progress over timesteps
+        prompt_idx = sample_parameter.get("enum", 0)
+        with torch.no_grad():
+            for i, t in enumerate(tqdm(timesteps, desc=f"Sampling timesteps for prompt {prompt_idx+1}")):
+                latents_input = scheduler.scale_model_input(latents, t)
+
+                if do_classifier_free_guidance:
+                    latents_input = torch.cat([latents_input, latents_input], dim=0)  # 2, C, F, H, W
+
+                if image_latents is not None:
+                    latents_image_input = (
+                        image_latents if not do_classifier_free_guidance else torch.cat([image_latents, image_latents], dim=0)
+                    )
+                    latents_input = torch.cat([latents_input, latents_image_input], dim=1)  # 1 or 2, C*2, F, H, W
+
+                # NOTE(review): the timestep is repeated to latents.shape[0] (1) and guidance_expand
+                # has one element, even when CFG doubles the batch of latents_input — presumably the
+                # transformer broadcasts these; confirm against the model's forward().
+                noise_pred = transformer(
+                    latents_input,
+                    t.repeat(latents.shape[0]).to(device=device, dtype=dit_dtype),
+                    text_states=prompt_embeds,
+                    text_mask=prompt_mask,
+                    text_states_2=prompt_embeds_2,
+                    freqs_cos=freqs_cos,
+                    freqs_sin=freqs_sin,
+                    guidance=guidance_expand,
+                    return_dict=True,
+                )["x"]
+
+                # perform classifier free guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + cfg_scale * (noise_pred_cond - noise_pred_uncond)
+
+                # Compute the previous noisy sample x_t -> x_t-1
+                latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+        # Move VAE to the appropriate device for sampling
+        vae.to(device)
+        vae.eval()
+
+        # Decode latents to video
+        if hasattr(vae.config, "shift_factor") and vae.config.shift_factor:
+            latents = latents / vae.config.scaling_factor + vae.config.shift_factor
+        else:
+            latents = latents / vae.config.scaling_factor
+
+        latents = latents.to(device=device, dtype=vae.dtype)
+        with torch.no_grad():
+            video = vae.decode(latents, return_dict=False)[0]
+        # rescale the decoder output with video / 2 + 0.5 and clamp to [0, 1]
+        video = (video / 2 + 0.5).clamp(0, 1)
+        video = video.cpu().float()
+
+        return video
+
+    def load_vae(self, args: argparse.Namespace, vae_dtype: torch.dtype, vae_path: str):
+        """Load the VAE on CPU, then apply the chunk-size and spatial-tiling options found in ``args``."""
+        vae, _, _, _ = load_vae(vae_dtype=vae_dtype, device="cpu", vae_path=vae_path)
+        chunk_size = args.vae_chunk_size
+        if chunk_size is not None:
+            vae.set_chunk_size_for_causal_conv_3d(chunk_size)
+            logger.info(f"Set chunk_size to {chunk_size} for CausalConv3d in VAE")
+        tile_min_size = args.vae_spatial_tile_sample_min_size
+        if tile_min_size is not None or args.vae_tiling:
+            vae.enable_spatial_tiling(True)
+        if tile_min_size is not None:
+            vae.tile_sample_min_size = tile_min_size
+            vae.tile_latent_min_size = tile_min_size // 8  # latent tile size is the sample tile size // 8
+        return vae
+
+    def load_transformer(
+        self,
+        accelerator: Accelerator,
+        args: argparse.Namespace,
+        dit_path: str,
+        attn_mode: str,
+        split_attn: bool,
+        loading_device: str,
+        dit_weight_dtype: Optional[torch.dtype],
+    ):
+        """Build the DiT with the module-level ``load_transformer``; optionally enable img_in/txt_in offloading."""
+        dit = load_transformer(dit_path, attn_mode, split_attn, loading_device, dit_weight_dtype, args.dit_in_channels)
+        if not args.img_in_txt_in_offloading:
+            return dit
+        logger.info("Enable offloading img_in and txt_in to CPU")
+        dit.enable_img_in_txt_in_offloading()
+        return dit
+
+    def scale_shift_latents(self, latents):
+        """Return ``latents`` multiplied by ``vae_module.SCALING_FACTOR``; no shift term is applied here."""
+        return latents * vae_module.SCALING_FACTOR
+
+    def call_dit(
+        self,
+        args: argparse.Namespace,
+        accelerator: Accelerator,
+        transformer_arg,
+        latents: torch.Tensor,
+        batch: dict[str, torch.Tensor],
+        noise: torch.Tensor,
+        noisy_model_input: torch.Tensor,
+        timesteps: torch.Tensor,
+        network_dtype: torch.dtype,
+    ):
+        # Run one training forward pass of the DiT and build the matching regression target.
+        #
+        # Text conditioning is read from ``batch`` under the keys "llm", "llm_mask" and "clipL";
+        # ``args.guidance_scale`` and ``args.gradient_checkpointing`` are the only ``args`` fields used.
+        # Rotary position embeddings are memoised in ``self.pos_embed_cache`` across calls.
+        # Returns ``(model_pred, target)``: ``model_pred`` is whatever the transformer returns with
+        # ``return_dict=False`` and ``target`` is ``noise - latents``.
+        transformer: HYVideoDiffusionTransformer = transformer_arg
+        bsz = latents.shape[0]
+        # I2V training: zeros everywhere except the first slice along dim 2 (the frame axis in the B, C, F, H, W layout)
+        if self.i2v_training:
+            image_latents = torch.zeros_like(latents)
+            image_latents[:, :, :1, :, :] = latents[:, :, :1, :, :]
+            noisy_model_input = torch.cat([noisy_model_input, image_latents], dim=1)  # concat along channel dim
+        # one guidance value per batch element; float() guards against a non-float guidance_scale in args
+        guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device)  # no dtype given: torch default
+        # ensure the inputs require grad under gradient checkpointing (presumably so a graph is still built - TODO confirm)
+        if args.gradient_checkpointing:
+            noisy_model_input.requires_grad_(True)
+            guidance_vec.requires_grad_(True)
+        # cache key is latents.shape[1:], while the embedding itself is computed from latents.shape[2:] only
+        pos_emb_shape = latents.shape[1:]
+        if pos_emb_shape not in self.pos_embed_cache:
+            freqs_cos, freqs_sin = get_rotary_pos_embed_by_shape(transformer, latents.shape[2:])
+            # stored exactly as returned by the helper: no device or dtype cast is applied here
+            self.pos_embed_cache[pos_emb_shape] = (freqs_cos, freqs_sin)
+        else:
+            freqs_cos, freqs_sin = self.pos_embed_cache[pos_emb_shape]
+        # call DiT: move latents and model input to the accelerator device in network_dtype, then run under autocast
+        latents = latents.to(device=accelerator.device, dtype=network_dtype)
+        noisy_model_input = noisy_model_input.to(device=accelerator.device, dtype=network_dtype)
+        with accelerator.autocast():
+            model_pred = transformer(
+                noisy_model_input,
+                timesteps,
+                text_states=batch["llm"],
+                text_mask=batch["llm_mask"],
+                text_states_2=batch["clipL"],
+                freqs_cos=freqs_cos,
+                freqs_sin=freqs_sin,
+                guidance=guidance_vec,
+                return_dict=False,
+            )
+        # flow matching target, built from the latents already cast to network_dtype above
+        target = noise - latents
+        return model_pred, target
+
+    # endregion model specific
+
+    def train(self, args):
+        # check required arguments
+        if args.dataset_config is None:
+            raise ValueError("dataset_config is required / dataset_configが必要です")
+        if args.dit is None:
+            raise ValueError("path to DiT model is required / DiTモデルのパスが必要です")
+        assert not args.fp8_scaled or args.fp8_base, "fp8_scaled requires fp8_base / fp8_scaledはfp8_baseが必要です"
+
+        # check model specific arguments
+        self.handle_model_specific_args(args)
+
+        # show timesteps for debugging
+        if args.show_timesteps:
+            self.show_timesteps(args)
+            return
+
+        session_id = random.randint(0, 2**32)
+        training_started_at = time.time()
+        # setup_logging(args, reset=True)
+
+        if args.seed is None:
+            args.seed = random.randint(0, 2**32)
+        set_seed(args.seed)
+
+        # Load dataset config
+        blueprint_generator = BlueprintGenerator(ConfigSanitizer())
+        logger.info(f"Load dataset config from {args.dataset_config}")
+        user_config = config_utils.load_user_config(args.dataset_config)
+        blueprint = blueprint_generator.generate(user_config, args, architecture=self.architecture)
+        train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group, training=True)
+
+        current_epoch = Value("i", 0)
+        current_step = Value("i", 0)
+        ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None
+        collator = collator_class(current_epoch, current_step, ds_for_collator)
+
+        # prepare accelerator
+        logger.info("preparing accelerator")
+        accelerator = prepare_accelerator(args)
+        is_main_process = accelerator.is_main_process
+
+        # prepare dtype
+        weight_dtype = torch.float32
+        if args.mixed_precision == "fp16":
+            weight_dtype = torch.float16
+        elif args.mixed_precision == "bf16":
+            weight_dtype = torch.bfloat16
+
+        # HunyuanVideo: bfloat16 or float16, Wan2.1: bfloat16
+        dit_dtype = torch.bfloat16 if args.dit_dtype is None else model_utils.str_to_dtype(args.dit_dtype)
+        dit_weight_dtype = (None if args.fp8_scaled else torch.float8_e4m3fn) if args.fp8_base else dit_dtype
+        logger.info(f"DiT precision: {dit_dtype}, weight precision: {dit_weight_dtype}")
+
+        # get embedding for sampling images
+        vae_dtype = torch.float16 if args.vae_dtype is None else model_utils.str_to_dtype(args.vae_dtype)
+        sample_parameters = None
+        vae = None
+        if args.sample_prompts:
+            sample_parameters = self.process_sample_prompts(args, accelerator, args.sample_prompts)
+
+            # Load VAE model for sampling images: VAE is loaded to cpu to save gpu memory
+            vae = self.load_vae(args, vae_dtype=vae_dtype, vae_path=args.vae)
+            vae.requires_grad_(False)
+            vae.eval()
+
+        # load DiT model
+        blocks_to_swap = args.blocks_to_swap if args.blocks_to_swap else 0
+        self.blocks_to_swap = blocks_to_swap
+        loading_device = "cpu" if blocks_to_swap > 0 else accelerator.device
+
+        logger.info(f"Loading DiT model from {args.dit}")
+        if args.sdpa:
+            attn_mode = "torch"
+        elif args.flash_attn:
+            attn_mode = "flash"
+        elif args.sage_attn:
+            attn_mode = "sageattn"
+        elif args.xformers:
+            attn_mode = "xformers"
+        elif args.flash3:
+            attn_mode = "flash3"
+        else:
+            raise ValueError(
+                f"either --sdpa, --flash-attn, --flash3, --sage-attn or --xformers must be specified / --sdpa, --flash-attn, --flash3, --sage-attn, --xformersのいずれかを指定してください"
+            )
+        transformer = self.load_transformer(
+            accelerator, args, args.dit, attn_mode, args.split_attn, loading_device, dit_weight_dtype
+        )
+        transformer.eval()
+        transformer.requires_grad_(False)
+
+        if blocks_to_swap > 0:
+            logger.info(f"enable swap {blocks_to_swap} blocks to CPU from device: {accelerator.device}")
+            transformer.enable_block_swap(blocks_to_swap, accelerator.device, supports_backward=True)
+            transformer.move_to_device_except_swap_blocks(accelerator.device)
+
+        # load network model for differential training
+        sys.path.append(os.path.dirname(__file__))
+        accelerator.print("import network module:", args.network_module)
+        network_module: lora_module = importlib.import_module(args.network_module)  # actual module may be different
+
+        if args.base_weights is not None:
+            # if base_weights is specified, merge the weights to DiT model
+            for i, weight_path in enumerate(args.base_weights):
+                if args.base_weights_multiplier is None or len(args.base_weights_multiplier) <= i:
+                    multiplier = 1.0
+                else:
+                    multiplier = args.base_weights_multiplier[i]
+
+                accelerator.print(f"merging module: {weight_path} with multiplier {multiplier}")
+
+                weights_sd = load_file(weight_path)
+                module = network_module.create_arch_network_from_weights(
+                    multiplier, weights_sd, unet=transformer, for_inference=True
+                )
+                module.merge_to(None, transformer, weights_sd, weight_dtype, "cpu")
+
+            accelerator.print(f"all weights merged: {', '.join(args.base_weights)}")
+
+        # prepare network
+        net_kwargs = {}
+        if args.network_args is not None:
+            for net_arg in args.network_args:
+                key, value = net_arg.split("=")
+                net_kwargs[key] = value
+
+        if args.dim_from_weights:
+            logger.info(f"Loading network from weights: {args.dim_from_weights}")
+            weights_sd = load_file(args.dim_from_weights)
+            network, _ = network_module.create_arch_network_from_weights(1, weights_sd, unet=transformer)
+        else:
+            # We use the name create_arch_network for compatibility with LyCORIS
+            if hasattr(network_module, "create_arch_network"):
+                network = network_module.create_arch_network(
+                    1.0,
+                    args.network_dim,
+                    args.network_alpha,
+                    vae,
+                    None,
+                    transformer,
+                    neuron_dropout=args.network_dropout,
+                    **net_kwargs,
+                )
+            else:
+                # LyCORIS compatibility
+                network = network_module.create_network(
+                    1.0,
+                    args.network_dim,
+                    args.network_alpha,
+                    vae,
+                    None,
+                    transformer,
+                    **net_kwargs,
+                )
+        if network is None:
+            return
+
+        if hasattr(network_module, "prepare_network"):
+            network.prepare_network(args)
+
+        # apply network to DiT
+        network.apply_to(None, transformer, apply_text_encoder=False, apply_unet=True)
+
+        if args.network_weights is not None:
+            # FIXME consider alpha of weights: this assumes that the alpha is not changed
+            info = network.load_weights(args.network_weights)
+            accelerator.print(f"load network weights from {args.network_weights}: {info}")
+
+        if args.gradient_checkpointing:
+            transformer.enable_gradient_checkpointing()
+            network.enable_gradient_checkpointing()  # may have no effect
+
+        # prepare optimizer, data loader etc.
+        accelerator.print("prepare optimizer, data loader etc.")
+
+        trainable_params, lr_descriptions = network.prepare_optimizer_params(unet_lr=args.learning_rate)
+        optimizer_name, optimizer_args, optimizer, optimizer_train_fn, optimizer_eval_fn = self.get_optimizer(
+            args, trainable_params
+        )
+
+        # prepare dataloader
+
+        # num workers for data loader: if 0, persistent_workers is not available
+        n_workers = min(args.max_data_loader_n_workers, os.cpu_count())  # cpu_count or max_data_loader_n_workers
+
+        train_dataloader = torch.utils.data.DataLoader(
+            train_dataset_group,
+            batch_size=1,
+            shuffle=True,
+            collate_fn=collator,
+            num_workers=n_workers,
+            persistent_workers=args.persistent_data_loader_workers,
+        )
+
+        # calculate max_train_steps
+        if args.max_train_epochs is not None:
+            args.max_train_steps = args.max_train_epochs * math.ceil(
+                len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
+            )
+            accelerator.print(
+                f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}"
+            )
+
+        # send max_train_steps to train_dataset_group
+        train_dataset_group.set_max_train_steps(args.max_train_steps)
+
+        # prepare lr_scheduler
+        lr_scheduler = self.get_lr_scheduler(args, optimizer, accelerator.num_processes)
+
+        # prepare training model. accelerator does some magic here
+
+        # experimental feature: train the model with gradients in fp16/bf16
+        network_dtype = torch.float32
+        args.full_fp16 = args.full_bf16 = False  # temporary disabled because stochastic rounding is not supported yet
+        if args.full_fp16:
+            assert (
+                args.mixed_precision == "fp16"
+            ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。"
+            accelerator.print("enable full fp16 training.")
+            network_dtype = weight_dtype
+            network.to(network_dtype)
+        elif args.full_bf16:
+            assert (
+                args.mixed_precision == "bf16"
+            ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。"
+            accelerator.print("enable full bf16 training.")
+            network_dtype = weight_dtype
+            network.to(network_dtype)
+
+        if dit_weight_dtype != dit_dtype and dit_weight_dtype is not None:
+            logger.info(f"casting model to {dit_weight_dtype}")
+            transformer.to(dit_weight_dtype)
+
+        if blocks_to_swap > 0:
+            transformer = accelerator.prepare(transformer, device_placement=[not blocks_to_swap > 0])
+            accelerator.unwrap_model(transformer).move_to_device_except_swap_blocks(accelerator.device)  # reduce peak memory usage
+            accelerator.unwrap_model(transformer).prepare_block_swap_before_forward()
+        else:
+            transformer = accelerator.prepare(transformer)
+
+        network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler)
+        training_model = network
+
+        if args.gradient_checkpointing:
+            transformer.train()
+        else:
+            transformer.eval()
+
+        accelerator.unwrap_model(network).prepare_grad_etc(transformer)
+
+        if args.full_fp16:
+            # patch accelerator for fp16 training
+            # def patch_accelerator_for_fp16_training(accelerator):
+            org_unscale_grads = accelerator.scaler._unscale_grads_
+
+            def _unscale_grads_replacer(optimizer, inv_scale, found_inf, allow_fp16):
+                return org_unscale_grads(optimizer, inv_scale, found_inf, True)
+
+            accelerator.scaler._unscale_grads_ = _unscale_grads_replacer
+
+        # before resuming make hook for saving/loading to save/load the network weights only
+        def save_model_hook(models, weights, output_dir):
+            # pop weights of other models than network to save only network weights
+            # only main process or deepspeed https://github.com/huggingface/diffusers/issues/2606
+            if accelerator.is_main_process:  # or args.deepspeed:
+                remove_indices = []
+                for i, model in enumerate(models):
+                    if not isinstance(model, type(accelerator.unwrap_model(network))):
+                        remove_indices.append(i)
+                for i in reversed(remove_indices):
+                    if len(weights) > i:
+                        weights.pop(i)
+                # print(f"save model hook: {len(weights)} weights will be saved")
+
+        def load_model_hook(models, input_dir):
+            # remove models except network
+            remove_indices = []
+            for i, model in enumerate(models):
+                if not isinstance(model, type(accelerator.unwrap_model(network))):
+                    remove_indices.append(i)
+            for i in reversed(remove_indices):
+                models.pop(i)
+            # print(f"load model hook: {len(models)} models will be loaded")
+
+        accelerator.register_save_state_pre_hook(save_model_hook)
+        accelerator.register_load_state_pre_hook(load_model_hook)
+
+        # resume from local or huggingface. accelerator.step is set
+        self.resume_from_local_or_hf_if_specified(accelerator, args)  # accelerator.load_state(args.resume)
+
+        # epoch数を計算する
+        num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+        num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+        # 学習する
+        # total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+        accelerator.print("running training / 学習開始")
+        accelerator.print(f"  num train items / 学習画像、動画数: {train_dataset_group.num_train_items}")
+        accelerator.print(f"  num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}")
+        accelerator.print(f"  num epochs / epoch数: {num_train_epochs}")
+        accelerator.print(
+            f"  batch size per device / バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}"
+        )
+        # accelerator.print(f"  total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ（並列学習、勾配合計含む）: {total_batch_size}")
+        accelerator.print(f"  gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}")
+        accelerator.print(f"  total optimization steps / 学習ステップ数: {args.max_train_steps}")
+
+        # TODO refactor metadata creation and move to util
+        metadata = {
+            "ss_" "ss_session_id": session_id,  # random integer indicating which group of epochs the model came from
+            "ss_training_started_at": training_started_at,  # unix timestamp
+            "ss_output_name": args.output_name,
+            "ss_learning_rate": args.learning_rate,
+            "ss_num_train_items": train_dataset_group.num_train_items,
+            "ss_num_batches_per_epoch": len(train_dataloader),
+            "ss_num_epochs": num_train_epochs,
+            "ss_gradient_checkpointing": args.gradient_checkpointing,
+            "ss_gradient_accumulation_steps": args.gradient_accumulation_steps,
+            "ss_max_train_steps": args.max_train_steps,
+            "ss_lr_warmup_steps": args.lr_warmup_steps,
+            "ss_lr_scheduler": args.lr_scheduler,
+            SS_METADATA_KEY_BASE_MODEL_VERSION: self.architecture_full_name,
+            # "ss_network_module": args.network_module,
+            # "ss_network_dim": args.network_dim,  # None means default because another network than LoRA may have another default dim
+            # "ss_network_alpha": args.network_alpha,  # some networks may not have alpha
+            SS_METADATA_KEY_NETWORK_MODULE: args.network_module,
+            SS_METADATA_KEY_NETWORK_DIM: args.network_dim,
+            SS_METADATA_KEY_NETWORK_ALPHA: args.network_alpha,
+            "ss_network_dropout": args.network_dropout,  # some networks may not have dropout
+            "ss_mixed_precision": args.mixed_precision,
+            "ss_seed": args.seed,
+            "ss_training_comment": args.training_comment,  # will not be updated after training
+            # "ss_sd_scripts_commit_hash": train_util.get_git_revision_hash(),
+            "ss_optimizer": optimizer_name + (f"({optimizer_args})" if len(optimizer_args) > 0 else ""),
+            "ss_max_grad_norm": args.max_grad_norm,
+            "ss_fp8_base": bool(args.fp8_base),
+            # "ss_fp8_llm": bool(args.fp8_llm), # remove this because this is only for HuanyuanVideo TODO set architecure dependent metadata
+            "ss_full_fp16": bool(args.full_fp16),
+            "ss_full_bf16": bool(args.full_bf16),
+            "ss_weighting_scheme": args.weighting_scheme,
+            "ss_logit_mean": args.logit_mean,
+            "ss_logit_std": args.logit_std,
+            "ss_mode_scale": args.mode_scale,
+            "ss_guidance_scale": args.guidance_scale,
+            "ss_timestep_sampling": args.timestep_sampling,
+            "ss_sigmoid_scale": args.sigmoid_scale,
+            "ss_discrete_flow_shift": args.discrete_flow_shift,
+        }
+
+        datasets_metadata = []
+        # tag_frequency = {}  # merge tag frequency for metadata editor # TODO support tag frequency
+        for dataset in train_dataset_group.datasets:
+            dataset_metadata = dataset.get_metadata()
+            datasets_metadata.append(dataset_metadata)
+
+        metadata["ss_datasets"] = json.dumps(datasets_metadata)
+
+        # add extra args
+        if args.network_args:
+            # metadata["ss_network_args"] = json.dumps(net_kwargs)
+            metadata[SS_METADATA_KEY_NETWORK_ARGS] = json.dumps(net_kwargs)
+
+        # model name and hash
+        # calculate hash takes time, so we omit it for now
+        if args.dit is not None:
+            # logger.info(f"calculate hash for DiT model: {args.dit}")
+            logger.info(f"set DiT model name for metadata: {args.dit}")
+            sd_model_name = args.dit
+            if os.path.exists(sd_model_name):
+                # metadata["ss_sd_model_hash"] = model_utils.model_hash(sd_model_name)
+                # metadata["ss_new_sd_model_hash"] = model_utils.calculate_sha256(sd_model_name)
+                sd_model_name = os.path.basename(sd_model_name)
+            metadata["ss_sd_model_name"] = sd_model_name
+
+        if args.vae is not None:
+            # logger.info(f"calculate hash for VAE model: {args.vae}")
+            logger.info(f"set VAE model name for metadata: {args.vae}")
+            vae_name = args.vae
+            if os.path.exists(vae_name):
+                # metadata["ss_vae_hash"] = model_utils.model_hash(vae_name)
+                # metadata["ss_new_vae_hash"] = model_utils.calculate_sha256(vae_name)
+                vae_name = os.path.basename(vae_name)
+            metadata["ss_vae_name"] = vae_name
+
+        metadata = {k: str(v) for k, v in metadata.items()}
+
+        # make minimum metadata for filtering
+        minimum_metadata = {}
+        for key in SS_METADATA_MINIMUM_KEYS:
+            if key in metadata:
+                minimum_metadata[key] = metadata[key]
+
+        if accelerator.is_main_process:
+            init_kwargs = {}
+            if args.wandb_run_name:
+                init_kwargs["wandb"] = {"name": args.wandb_run_name}
+            if args.log_tracker_config is not None:
+                init_kwargs = toml.load(args.log_tracker_config)
+            accelerator.init_trackers(
+                "network_train" if args.log_tracker_name is None else args.log_tracker_name,
+                config=train_utils.get_sanitized_config_or_none(args),
+                init_kwargs=init_kwargs,
+            )
+
+        # TODO skip until initial step
+        progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps")
+
+        epoch_to_start = 0
+        global_step = 0
+        noise_scheduler = FlowMatchDiscreteScheduler(shift=args.discrete_flow_shift, reverse=True, solver="euler")
+
+        loss_recorder = train_utils.LossRecorder()
+        del train_dataset_group
+
+        # function for saving/removing
+        save_dtype = dit_dtype
+
+        def save_model(ckpt_name: str, unwrapped_nw, steps, epoch_no, force_sync_upload=False):
+            os.makedirs(args.output_dir, exist_ok=True)
+            ckpt_file = os.path.join(args.output_dir, ckpt_name)
+
+            accelerator.print(f"\nsaving checkpoint: {ckpt_file}")
+            metadata["ss_training_finished_at"] = str(time.time())
+            metadata["ss_steps"] = str(steps)
+            metadata["ss_epoch"] = str(epoch_no)
+
+            metadata_to_save = minimum_metadata if args.no_metadata else metadata
+
+            title = args.metadata_title if args.metadata_title is not None else args.output_name
+            if args.min_timestep is not None or args.max_timestep is not None:
+                min_time_step = args.min_timestep if args.min_timestep is not None else 0
+                max_time_step = args.max_timestep if args.max_timestep is not None else 1000
+                md_timesteps = (min_time_step, max_time_step)
+            else:
+                md_timesteps = None
+
+            sai_metadata = sai_model_spec.build_metadata(
+                None,
+                self.architecture,
+                time.time(),
+                title,
+                None,
+                args.metadata_author,
+                args.metadata_description,
+                args.metadata_license,
+                args.metadata_tags,
+                timesteps=md_timesteps,
+            )
+
+            metadata_to_save.update(sai_metadata)
+
+            unwrapped_nw.save_weights(ckpt_file, save_dtype, metadata_to_save)
+            if args.huggingface_repo_id is not None:
+                huggingface_utils.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload)
+
+        def remove_model(old_ckpt_name):
+            old_ckpt_file = os.path.join(args.output_dir, old_ckpt_name)
+            if os.path.exists(old_ckpt_file):
+                accelerator.print(f"removing old checkpoint: {old_ckpt_file}")
+                os.remove(old_ckpt_file)
+
+        # For --sample_at_first
+        if should_sample_images(args, global_step, epoch=0):
+            optimizer_eval_fn()
+            self.sample_images(accelerator, args, 0, global_step, vae, transformer, sample_parameters, dit_dtype)
+            optimizer_train_fn()
+        if len(accelerator.trackers) > 0:
+            # log empty object to commit the sample images to wandb
+            accelerator.log({}, step=0)
+
+        # training loop
+
+        # log device and dtype for each model
+        logger.info(f"DiT dtype: {transformer.dtype}, device: {transformer.device}")
+
+        clean_memory_on_device(accelerator.device)
+
+        for epoch in range(epoch_to_start, num_train_epochs):
+            accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}")
+            current_epoch.value = epoch + 1
+
+            metadata["ss_epoch"] = str(epoch + 1)
+
+            accelerator.unwrap_model(network).on_epoch_start(transformer)
+
+            for step, batch in enumerate(train_dataloader):
+                latents = batch["latents"]
+                bsz = latents.shape[0]
+                current_step.value = global_step
+
+                with accelerator.accumulate(training_model):
+                    accelerator.unwrap_model(network).on_step_start()
+
+                    latents = self.scale_shift_latents(latents)
+
+                    # Sample noise that we'll add to the latents
+                    noise = torch.randn_like(latents)
+
+                    # calculate model input and timesteps
+                    noisy_model_input, timesteps = self.get_noisy_model_input_and_timesteps(
+                        args, noise, latents, noise_scheduler, accelerator.device, dit_dtype
+                    )
+
+                    weighting = compute_loss_weighting_for_sd3(
+                        args.weighting_scheme, noise_scheduler, timesteps, accelerator.device, dit_dtype
+                    )
+
+                    model_pred, target = self.call_dit(
+                        args, accelerator, transformer, latents, batch, noise, noisy_model_input, timesteps, network_dtype
+                    )
+                    loss = torch.nn.functional.mse_loss(model_pred.to(network_dtype), target, reduction="none")
+
+                    if weighting is not None:
+                        loss = loss * weighting
+                    # loss = loss.mean([1, 2, 3])
+                    # # min snr gamma, scale v pred loss like noise pred, v pred like loss, debiased estimation etc.
+                    # loss = self.post_process_loss(loss, args, timesteps, noise_scheduler)
+
+                    loss = loss.mean()  # mean loss over all elements in batch
+
+                    accelerator.backward(loss)
+                    if accelerator.sync_gradients:
+                        # self.all_reduce_network(accelerator, network)  # sync DDP grad manually
+                        state = accelerate.PartialState()
+                        if state.distributed_type != accelerate.DistributedType.NO:
+                            for param in network.parameters():
+                                if param.grad is not None:
+                                    param.grad = accelerator.reduce(param.grad, reduction="mean")
+
+                        if args.max_grad_norm != 0.0:
+                            params_to_clip = accelerator.unwrap_model(network).get_trainable_params()
+                            accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                    optimizer.step()
+                    lr_scheduler.step()
+                    optimizer.zero_grad(set_to_none=True)
+
+                if args.scale_weight_norms:
+                    keys_scaled, mean_norm, maximum_norm = accelerator.unwrap_model(network).apply_max_norm_regularization(
+                        args.scale_weight_norms, accelerator.device
+                    )
+                    max_mean_logs = {"Keys Scaled": keys_scaled, "Average key norm": mean_norm}
+                else:
+                    keys_scaled, mean_norm, maximum_norm = None, None, None
+
+                # Checks if the accelerator has performed an optimization step behind the scenes
+                if accelerator.sync_gradients:
+                    progress_bar.update(1)
+                    global_step += 1
+
+                    # to avoid calling optimizer_eval_fn() too frequently, we call it only when we need to sample images or save the model
+                    should_sampling = should_sample_images(args, global_step, epoch=None)
+                    should_saving = args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0
+
+                    if should_sampling or should_saving:
+                        optimizer_eval_fn()
+                        if should_sampling:
+                            self.sample_images(accelerator, args, None, global_step, vae, transformer, sample_parameters, dit_dtype)
+
+                        if should_saving:
+                            accelerator.wait_for_everyone()
+                            if accelerator.is_main_process:
+                                ckpt_name = train_utils.get_step_ckpt_name(args.output_name, global_step)
+                                save_model(ckpt_name, accelerator.unwrap_model(network), global_step, epoch)
+
+                                if args.save_state:
+                                    train_utils.save_and_remove_state_stepwise(args, accelerator, global_step)
+
+                                remove_step_no = train_utils.get_remove_step_no(args, global_step)
+                                if remove_step_no is not None:
+                                    remove_ckpt_name = train_utils.get_step_ckpt_name(args.output_name, remove_step_no)
+                                    remove_model(remove_ckpt_name)
+                        optimizer_train_fn()
+
+                current_loss = loss.detach().item()
+                loss_recorder.add(epoch=epoch, step=step, loss=current_loss)
+                avr_loss: float = loss_recorder.moving_average
+                logs = {"avr_loss": avr_loss}  # , "lr": lr_scheduler.get_last_lr()[0]}
+                progress_bar.set_postfix(**logs)
+
+                if args.scale_weight_norms:
+                    progress_bar.set_postfix(**{**max_mean_logs, **logs})
+
+                if len(accelerator.trackers) > 0:
+                    logs = self.generate_step_logs(
+                        args, current_loss, avr_loss, lr_scheduler, lr_descriptions, optimizer, keys_scaled, mean_norm, maximum_norm
+                    )
+                    accelerator.log(logs, step=global_step)
+
+                if global_step >= args.max_train_steps:
+                    break
+
+            if len(accelerator.trackers) > 0:
+                logs = {"loss/epoch": loss_recorder.moving_average}
+                accelerator.log(logs, step=epoch + 1)
+
+            accelerator.wait_for_everyone()
+
+            # save model at the end of epoch if needed
+            optimizer_eval_fn()
+            if args.save_every_n_epochs is not None:
+                saving = (epoch + 1) % args.save_every_n_epochs == 0 and (epoch + 1) < num_train_epochs
+                if is_main_process and saving:
+                    ckpt_name = train_utils.get_epoch_ckpt_name(args.output_name, epoch + 1)
+                    save_model(ckpt_name, accelerator.unwrap_model(network), global_step, epoch + 1)
+
+                    remove_epoch_no = train_utils.get_remove_epoch_no(args, epoch + 1)
+                    if remove_epoch_no is not None:
+                        remove_ckpt_name = train_utils.get_epoch_ckpt_name(args.output_name, remove_epoch_no)
+                        remove_model(remove_ckpt_name)
+
+                    if args.save_state:
+                        train_utils.save_and_remove_state_on_epoch_end(args, accelerator, epoch + 1)
+
+            self.sample_images(accelerator, args, epoch + 1, global_step, vae, transformer, sample_parameters, dit_dtype)
+            optimizer_train_fn()
+
+            # end of epoch
+
+        # metadata["ss_epoch"] = str(num_train_epochs)
+        metadata["ss_training_finished_at"] = str(time.time())
+
+        if is_main_process:
+            network = accelerator.unwrap_model(network)
+
+        accelerator.end_training()
+        optimizer_eval_fn()
+
+        if is_main_process and (args.save_state or args.save_state_on_train_end):
+            train_utils.save_state_on_train_end(args, accelerator)
+
+        if is_main_process:
+            ckpt_name = train_utils.get_last_ckpt_name(args.output_name)
+            save_model(ckpt_name, network, global_step, num_train_epochs, force_sync_upload=True)
+
+            logger.info("model saved.")
+
+
+def setup_parser_common() -> argparse.ArgumentParser:
+    """Build the argument parser with the options shared by the training scripts; callers add model-specific options."""
+    def int_or_float(value):
+        # "N%" -> ratio as float; whole number >= 1 -> int; anything else -> float
+        if value.endswith("%"):
+            try:
+                return float(value[:-1]) / 100.0
+            except ValueError:
+                raise argparse.ArgumentTypeError(f"Value '{value}' is not a valid percentage")
+        try:
+            float_value = float(value)
+        except ValueError:
+            raise argparse.ArgumentTypeError(f"'{value}' is not an int or float")
+        # convert from the parsed float: int("10.0") or int("1e3") would raise ValueError and reject a valid value
+        return int(float_value) if float_value >= 1 and float_value.is_integer() else float_value
+
+    parser = argparse.ArgumentParser()
+
+    # general settings
+    parser.add_argument(
+        "--config_file",
+        type=str,
+        default=None,
+        help="using .toml instead of args to pass hyperparameter / ハイパーパラメータを引数ではなく.tomlファイルで渡す",
+    )
+    parser.add_argument(
+        "--dataset_config",
+        type=pathlib.Path,
+        default=None,
+        help="config file for dataset / データセットの設定ファイル",
+    )
+
+    # training settings
+    parser.add_argument(
+        "--sdpa",
+        action="store_true",
+        help="use sdpa for CrossAttention (requires PyTorch 2.0) / CrossAttentionにsdpaを使う（PyTorch 2.0が必要）",
+    )
+    parser.add_argument(
+        "--flash_attn",
+        action="store_true",
+        help="use FlashAttention for CrossAttention, requires FlashAttention / CrossAttentionにFlashAttentionを使う、FlashAttentionが必要",
+    )
+    parser.add_argument(
+        "--sage_attn",
+        action="store_true",
+        help="use SageAttention. requires SageAttention / SageAttentionを使う。SageAttentionが必要",
+    )
+    parser.add_argument(
+        "--xformers",
+        action="store_true",
+        help="use xformers for CrossAttention, requires xformers / CrossAttentionにxformersを使う、xformersが必要",
+    )
+    parser.add_argument(
+        "--flash3",
+        action="store_true",
+        help="use FlashAttention 3 for CrossAttention, requires FlashAttention 3, HunyuanVideo does not support this yet"
+        " / CrossAttentionにFlashAttention 3を使う、FlashAttention 3が必要。HunyuanVideoは未対応。",
+    )
+    parser.add_argument(
+        "--split_attn",
+        action="store_true",
+        help="use split attention for attention calculation (split batch size=1, affects memory usage and speed)"
+        " / attentionを分割して計算する（バッチサイズ=1に分割、メモリ使用量と速度に影響）",
+    )
+
+    parser.add_argument("--max_train_steps", type=int, default=1600, help="training steps / 学習ステップ数")
+    parser.add_argument(
+        "--max_train_epochs",
+        type=int,
+        default=None,
+        help="training epochs (overrides max_train_steps) / 学習エポック数（max_train_stepsを上書きします）",
+    )
+    parser.add_argument(
+        "--max_data_loader_n_workers",
+        type=int,
+        default=8,
+        help="max num workers for DataLoader (lower is less main RAM usage, faster epoch start and slower data loading) / DataLoaderの最大プロセス数（小さい値ではメインメモリの使用量が減りエポック間の待ち時間が減りますが、データ読み込みは遅くなります）",
+    )
+    parser.add_argument(
+        "--persistent_data_loader_workers",
+        action="store_true",
+        help="persistent DataLoader workers (useful for reduce time gap between epoch, but may use more memory) / DataLoader のワーカーを持続させる (エポック間の時間差を少なくするのに有効だが、より多くのメモリを消費する可能性がある)",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="random seed for training / 学習時の乱数のseed")
+    parser.add_argument(
+        "--gradient_checkpointing", action="store_true", help="enable gradient checkpointing / gradient checkpointingを有効にする"
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass / 学習時に逆伝播をする前に勾配を合計するステップ数",
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help="use mixed precision / 混合精度を使う場合、その精度",
+    )
+
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default=None,
+        help="enable logging and output TensorBoard log to this directory / ログ出力を有効にしてこのディレクトリにTensorBoard用のログを出力する",
+    )
+    parser.add_argument(
+        "--log_with",
+        type=str,
+        default=None,
+        choices=["tensorboard", "wandb", "all"],
+        help="what logging tool(s) to use (if 'all', TensorBoard and WandB are both used) / ログ出力に使用するツール (allを指定するとTensorBoardとWandBの両方が使用される)",
+    )
+    parser.add_argument(
+        "--log_prefix", type=str, default=None, help="add prefix for each log directory / ログディレクトリ名の先頭に追加する文字列"
+    )
+    parser.add_argument(
+        "--log_tracker_name",
+        type=str,
+        default=None,
+        help="name of tracker to use for logging, default is script-specific default name / ログ出力に使用するtrackerの名前、省略時はスクリプトごとのデフォルト名",
+    )
+    parser.add_argument(
+        "--wandb_run_name",
+        type=str,
+        default=None,
+        help="The name of the specific wandb session / wandb ログに表示される特定の実行の名前",
+    )
+    parser.add_argument(
+        "--log_tracker_config",
+        type=str,
+        default=None,
+        help="path to tracker config file to use for logging / ログ出力に使用するtrackerの設定ファイルのパス",
+    )
+    parser.add_argument(
+        "--wandb_api_key",
+        type=str,
+        default=None,
+        help="specify WandB API key to log in before starting training (optional). / WandB APIキーを指定して学習開始前にログインする（オプション）",
+    )
+    parser.add_argument("--log_config", action="store_true", help="log training configuration / 学習設定をログに出力する")
+
+    parser.add_argument(
+        "--ddp_timeout",
+        type=int,
+        default=None,
+        help="DDP timeout (min, None for default of accelerate) / DDPのタイムアウト（分、Noneでaccelerateのデフォルト）",
+    )
+    parser.add_argument(
+        "--ddp_gradient_as_bucket_view",
+        action="store_true",
+        help="enable gradient_as_bucket_view for DDP / DDPでgradient_as_bucket_viewを有効にする",
+    )
+    parser.add_argument(
+        "--ddp_static_graph",
+        action="store_true",
+        help="enable static_graph for DDP / DDPでstatic_graphを有効にする",
+    )
+
+    parser.add_argument(
+        "--sample_every_n_steps",
+        type=int,
+        default=None,
+        help="generate sample images every N steps / 学習中のモデルで指定ステップごとにサンプル出力する",
+    )
+    parser.add_argument(
+        "--sample_at_first", action="store_true", help="generate sample images before training / 学習前にサンプル出力する"
+    )
+    parser.add_argument(
+        "--sample_every_n_epochs",
+        type=int,
+        default=None,
+        help="generate sample images every N epochs (overwrites n_steps) / 学習中のモデルで指定エポックごとにサンプル出力する（ステップ数指定を上書きします）",
+    )
+    parser.add_argument(
+        "--sample_prompts",
+        type=str,
+        default=None,
+        help="file for prompts to generate sample images / 学習中モデルのサンプル出力用プロンプトのファイル",
+    )
+
+    # optimizer and lr scheduler settings
+    parser.add_argument(
+        "--optimizer_type",
+        type=str,
+        default="",
+        help="Optimizer to use / オプティマイザの種類: AdamW (default), AdamW8bit, AdaFactor. "
+        "Also, you can use any optimizer by specifying the full path to the class, like 'torch.optim.AdamW', 'bitsandbytes.optim.AdEMAMix8bit' or 'bitsandbytes.optim.PagedAdEMAMix8bit' etc. / ",
+    )
+    parser.add_argument(
+        "--optimizer_args",
+        type=str,
+        default=None,
+        nargs="*",
+        help='additional arguments for optimizer (like "weight_decay=0.01 betas=0.9,0.999 ...") / オプティマイザの追加引数（例： "weight_decay=0.01 betas=0.9,0.999 ..."）',
+    )
+    parser.add_argument("--learning_rate", type=float, default=2.0e-6, help="learning rate / 学習率")
+    parser.add_argument(
+        "--max_grad_norm",
+        default=1.0,
+        type=float,
+        help="Max gradient norm, 0 for no clipping / 勾配正規化の最大norm、0でclippingを行わない",
+    )
+
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help="scheduler to use for learning rate / 学習率のスケジューラ: linear, cosine, cosine_with_restarts, polynomial, constant (default), constant_with_warmup, adafactor",
+    )
+    parser.add_argument(
+        "--lr_warmup_steps",
+        type=int_or_float,
+        default=0,
+        help="Int number of steps for the warmup in the lr scheduler (default is 0) or float with ratio of train steps"
+        " / 学習率のスケジューラをウォームアップするステップ数（デフォルト0）、または学習ステップの比率（1未満のfloat値の場合）",
+    )
+    parser.add_argument(
+        "--lr_decay_steps",
+        type=int_or_float,
+        default=0,
+        help="Int number of steps for the decay in the lr scheduler (default is 0) or float (<1) with ratio of train steps"
+        " / 学習率のスケジューラを減衰させるステップ数（デフォルト0）、または学習ステップの比率（1未満のfloat値の場合）",
+    )
+    parser.add_argument(
+        "--lr_scheduler_num_cycles",
+        type=int,
+        default=1,
+        help="Number of restarts for cosine scheduler with restarts / cosine with restartsスケジューラでのリスタート回数",
+    )
+    parser.add_argument(
+        "--lr_scheduler_power",
+        type=float,
+        default=1,
+        help="Polynomial power for polynomial scheduler / polynomialスケジューラでのpolynomial power",
+    )
+    parser.add_argument(
+        "--lr_scheduler_timescale",
+        type=int,
+        default=None,
+        help="Inverse sqrt timescale for inverse sqrt scheduler,defaults to `num_warmup_steps` / 逆平方根スケジューラのタイムスケール、デフォルトは`num_warmup_steps`",
+    )
+    parser.add_argument(
+        "--lr_scheduler_min_lr_ratio",
+        type=float,
+        default=None,
+        help="The minimum learning rate as a ratio of the initial learning rate for cosine with min lr scheduler and warmup decay scheduler"
+        + " / 初期学習率の比率としての最小学習率を指定する、cosine with min lr と warmup decay スケジューラ で有効",
+    )
+    parser.add_argument("--lr_scheduler_type", type=str, default="", help="custom scheduler module / 使用するスケジューラ")
+    parser.add_argument(
+        "--lr_scheduler_args",
+        type=str,
+        default=None,
+        nargs="*",
+        help='additional arguments for scheduler (like "T_max=100") / スケジューラの追加引数（例： "T_max=100"）',
+    )
+
+    parser.add_argument("--fp8_base", action="store_true", help="use fp8 for base model / base modelにfp8を使う")
+    # parser.add_argument("--full_fp16", action="store_true", help="fp16 training including gradients / 勾配も含めてfp16で学習する")
+    # parser.add_argument("--full_bf16", action="store_true", help="bf16 training including gradients / 勾配も含めてbf16で学習する")
+
+    parser.add_argument(
+        "--blocks_to_swap",
+        type=int,
+        default=None,
+        help="number of blocks to swap in the model, max XXX / モデル内のブロックの数、最大XXX",
+    )
+    parser.add_argument(
+        "--img_in_txt_in_offloading",
+        action="store_true",
+        help="offload img_in and txt_in to cpu / img_inとtxt_inをCPUにオフロードする",
+    )
+
+    # parser.add_argument("--flow_shift", type=float, default=7.0, help="Shift factor for flow matching schedulers")
+    parser.add_argument(
+        "--guidance_scale", type=float, default=1.0, help="Embeded classifier free guidance scale (HunyuanVideo only)."
+    )
+    parser.add_argument(
+        "--timestep_sampling",
+        choices=["sigma", "uniform", "sigmoid", "shift"],
+        default="sigma",
+        help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal and shift of sigmoid."
+        " / タイムステップをサンプリングする方法：sigma、random uniform、random normalのsigmoid、sigmoidのシフト。",
+    )
+    parser.add_argument(
+        "--discrete_flow_shift",
+        type=float,
+        default=1.0,
+        help="Discrete flow shift for the Euler Discrete Scheduler, default is 1.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは1.0。",
+    )
+    parser.add_argument(
+        "--sigmoid_scale",
+        type=float,
+        default=1.0,
+        help='Scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid" or "shift"). / sigmoidタイムステップサンプリングの倍率（timestep-samplingが"sigmoid"または"shift"の場合のみ有効）。',
+    )
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="none",
+        choices=["logit_normal", "mode", "cosmap", "sigma_sqrt", "none"],
+        help="weighting scheme for timestep distribution. Default is none"
+        " / タイムステップ分布の重み付けスキーム、デフォルトはnone",
+    )
+    parser.add_argument(
+        "--logit_mean",
+        type=float,
+        default=0.0,
+        help="mean to use when using the `'logit_normal'` weighting scheme / `'logit_normal'`重み付けスキームを使用する場合の平均",
+    )
+    parser.add_argument(
+        "--logit_std",
+        type=float,
+        default=1.0,
+        help="std to use when using the `'logit_normal'` weighting scheme / `'logit_normal'`重み付けスキームを使用する場合のstd",
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme` / モード重み付けスキームのスケール",
+    )
+    parser.add_argument(
+        "--min_timestep",
+        type=int,
+        default=None,
+        help="set minimum time step for training (0~999, default is 0) / 学習時のtime stepの最小値を設定する（0~999で指定、省略時はデフォルト値(0)） ",
+    )
+    parser.add_argument(
+        "--max_timestep",
+        type=int,
+        default=None,
+        help="set maximum time step for training (1~1000, default is 1000) / 学習時のtime stepの最大値を設定する（1~1000で指定、省略時はデフォルト値(1000)）",
+    )
+
+    parser.add_argument(
+        "--show_timesteps",
+        type=str,
+        default=None,
+        choices=["image", "console"],
+        help="show timesteps in image or console, and return to console / タイムステップを画像またはコンソールに表示し、コンソールに戻る",
+    )
+
+    # network settings
+    parser.add_argument(
+        "--no_metadata", action="store_true", help="do not save metadata in output model / メタデータを出力先モデルに保存しない"
+    )
+    parser.add_argument(
+        "--network_weights", type=str, default=None, help="pretrained weights for network / 学習するネットワークの初期重み"
+    )
+    parser.add_argument(
+        "--network_module", type=str, default=None, help="network module to train / 学習対象のネットワークのモジュール"
+    )
+    parser.add_argument(
+        "--network_dim",
+        type=int,
+        default=None,
+        help="network dimensions (depends on each network) / モジュールの次元数（ネットワークにより定義は異なります）",
+    )
+    parser.add_argument(
+        "--network_alpha",
+        type=float,
+        default=1,
+        help="alpha for LoRA weight scaling, default 1 (same as network_dim for same behavior as old version) / LoRaの重み調整のalpha値、デフォルト1（旧バージョンと同じ動作をするにはnetwork_dimと同じ値を指定）",
+    )
+    parser.add_argument(
+        "--network_dropout",
+        type=float,
+        default=None,
+        help="Drops neurons out of training every step (0 or None is default behavior (no dropout), 1 would drop all neurons) / 訓練時に毎ステップでニューロンをdropする（0またはNoneはdropoutなし、1は全ニューロンをdropout）",
+    )
+    parser.add_argument(
+        "--network_args",
+        type=str,
+        default=None,
+        nargs="*",
+        help="additional arguments for network (key=value) / ネットワークへの追加の引数",
+    )
+    parser.add_argument(
+        "--training_comment",
+        type=str,
+        default=None,
+        help="arbitrary comment string stored in metadata / メタデータに記録する任意のコメント文字列",
+    )
+    parser.add_argument(
+        "--dim_from_weights",
+        action="store_true",
+        help="automatically determine dim (rank) from network_weights / dim (rank)をnetwork_weightsで指定した重みから自動で決定する",
+    )
+    parser.add_argument(
+        "--scale_weight_norms",
+        type=float,
+        default=None,
+        help="Scale the weight of each key pair to help prevent overtraing via exploding gradients. (1 is a good starting point) / 重みの値をスケーリングして勾配爆発を防ぐ（1が初期値としては適当）",
+    )
+    parser.add_argument(
+        "--base_weights",
+        type=str,
+        default=None,
+        nargs="*",
+        help="network weights to merge into the model before training / 学習前にあらかじめモデルにマージするnetworkの重みファイル",
+    )
+    parser.add_argument(
+        "--base_weights_multiplier",
+        type=float,
+        default=None,
+        nargs="*",
+        help="multiplier for network weights to merge into the model before training / 学習前にあらかじめモデルにマージするnetworkの重みの倍率",
+    )
+
+    # save and load settings
+    parser.add_argument(
+        "--output_dir", type=str, default=None, help="directory to output trained model / 学習後のモデル出力先ディレクトリ"
+    )
+    parser.add_argument(
+        "--output_name",
+        type=str,
+        default=None,
+        help="base name of trained model file / 学習後のモデルの拡張子を除くファイル名",
+    )
+    parser.add_argument("--resume", type=str, default=None, help="saved state to resume training / 学習再開するモデルのstate")
+
+    parser.add_argument(
+        "--save_every_n_epochs",
+        type=int,
+        default=None,
+        help="save checkpoint every N epochs / 学習中のモデルを指定エポックごとに保存する",
+    )
+    parser.add_argument(
+        "--save_every_n_steps",
+        type=int,
+        default=None,
+        help="save checkpoint every N steps / 学習中のモデルを指定ステップごとに保存する",
+    )
+    parser.add_argument(
+        "--save_last_n_epochs",
+        type=int,
+        default=None,
+        help="save last N checkpoints when saving every N epochs (remove older checkpoints) / 指定エポックごとにモデルを保存するとき最大Nエポック保存する（古いチェックポイントは削除する）",
+    )
+    parser.add_argument(
+        "--save_last_n_epochs_state",
+        type=int,
+        default=None,
+        help="save last N checkpoints of state (overrides the value of --save_last_n_epochs)/ 最大Nエポックstateを保存する（--save_last_n_epochsの指定を上書きする）",
+    )
+    parser.add_argument(
+        "--save_last_n_steps",
+        type=int,
+        default=None,
+        help="save checkpoints until N steps elapsed (remove older checkpoints if N steps elapsed) / 指定ステップごとにモデルを保存するとき、このステップ数経過するまで保存する（このステップ数経過したら削除する）",
+    )
+    parser.add_argument(
+        "--save_last_n_steps_state",
+        type=int,
+        default=None,
+        help="save states until N steps elapsed (remove older states if N steps elapsed, overrides --save_last_n_steps) / 指定ステップごとにstateを保存するとき、このステップ数経過するまで保存する（このステップ数経過したら削除する。--save_last_n_stepsを上書きする）",
+    )
+    parser.add_argument(
+        "--save_state",
+        action="store_true",
+        help="save training state additionally (including optimizer states etc.) when saving model / optimizerなど学習状態も含めたstateをモデル保存時に追加で保存する",
+    )
+    parser.add_argument(
+        "--save_state_on_train_end",
+        action="store_true",
+        help="save training state (including optimizer states etc.) on train end even if --save_state is not specified"
+        " / --save_stateが未指定時にもoptimizerなど学習状態も含めたstateを学習終了時に保存する",
+    )
+
+    # SAI Model spec
+    parser.add_argument(
+        "--metadata_title",
+        type=str,
+        default=None,
+        help="title for model metadata (default is output_name) / メタデータに書き込まれるモデルタイトル、省略時はoutput_name",
+    )
+    parser.add_argument(
+        "--metadata_author",
+        type=str,
+        default=None,
+        help="author name for model metadata / メタデータに書き込まれるモデル作者名",
+    )
+    parser.add_argument(
+        "--metadata_description",
+        type=str,
+        default=None,
+        help="description for model metadata / メタデータに書き込まれるモデル説明",
+    )
+    parser.add_argument(
+        "--metadata_license",
+        type=str,
+        default=None,
+        help="license for model metadata / メタデータに書き込まれるモデルライセンス",
+    )
+    parser.add_argument(
+        "--metadata_tags",
+        type=str,
+        default=None,
+        help="tags for model metadata, separated by comma / メタデータに書き込まれるモデルタグ、カンマ区切り",
+    )
+
+    # huggingface settings
+    parser.add_argument(
+        "--huggingface_repo_id",
+        type=str,
+        default=None,
+        help="huggingface repo name to upload / huggingfaceにアップロードするリポジトリ名",
+    )
+    parser.add_argument(
+        "--huggingface_repo_type",
+        type=str,
+        default=None,
+        help="huggingface repo type to upload / huggingfaceにアップロードするリポジトリの種類",
+    )
+    parser.add_argument(
+        "--huggingface_path_in_repo",
+        type=str,
+        default=None,
+        help="huggingface model path to upload files / huggingfaceにアップロードするファイルのパス",
+    )
+    parser.add_argument("--huggingface_token", type=str, default=None, help="huggingface token / huggingfaceのトークン")
+    parser.add_argument(
+        "--huggingface_repo_visibility",
+        type=str,
+        default=None,
+        help="huggingface repository visibility ('public' for public, 'private' or None for private) / huggingfaceにアップロードするリポジトリの公開設定（'public'で公開、'private'またはNoneで非公開）",
+    )
+    parser.add_argument(
+        "--save_state_to_huggingface", action="store_true", help="save state to huggingface / huggingfaceにstateを保存する"
+    )
+    parser.add_argument(
+        "--resume_from_huggingface",
+        action="store_true",
+        help="resume from huggingface (ex: --resume {repo_id}/{path_in_repo}:{revision}:{repo_type}) / huggingfaceから学習を再開する(例: --resume {repo_id}/{path_in_repo}:{revision}:{repo_type})",
+    )
+    parser.add_argument(
+        "--async_upload",
+        action="store_true",
+        help="upload to huggingface asynchronously / huggingfaceに非同期でアップロードする",
+    )
+
+    parser.add_argument("--dit", type=str, help="DiT checkpoint path / DiTのチェックポイントのパス")
+    parser.add_argument("--vae", type=str, help="VAE checkpoint path / VAEのチェックポイントのパス")
+    parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is float16")
+
+    return parser
+
+
+def read_config_from_file(args: argparse.Namespace, parser: argparse.ArgumentParser):
+    """Merge settings from the TOML file named by ``args.config_file`` into the parsed arguments.
+
+    Tables in the TOML file are flattened into a single namespace and the command line is then parsed
+    again on top of it, so options given on the command line override the file. Returns ``args``
+    unchanged when no config file was given; exits with status 1 when the file does not exist.
+    """
+    if not args.config_file:
+        return args
+
+    config_path = args.config_file if args.config_file.endswith(".toml") else args.config_file + ".toml"
+    if not os.path.exists(config_path):
+        logger.error(f"{config_path} not found.")
+        # raise SystemExit directly: the exit() builtin is injected by the site module and may be absent
+        raise SystemExit(1)
+
+    logger.info(f"Loading settings from {config_path}...")
+    with open(config_path, "r", encoding="utf-8") as f:
+        config_dict = toml.load(f)
+
+    # flatten: top-level scalars are kept as is, entries of tables are hoisted to the top level
+    flat_config = {}
+    for name, entry in config_dict.items():
+        if isinstance(entry, dict):
+            flat_config.update(entry)
+        else:
+            flat_config[name] = entry
+
+    args = parser.parse_args(namespace=argparse.Namespace(**flat_config))
+    args.config_file = os.path.splitext(args.config_file)[0]
+    logger.info(args.config_file)
+    return args
+
+
+def hv_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    """Add the HunyuanVideo-specific options (DiT, text encoders, VAE) to ``parser`` and return the same parser."""
+    add = parser.add_argument
+    # DiT and text encoder settings
+    add("--dit_dtype", type=str, default=None, help="data type for DiT, default is bfloat16")
+    add("--dit_in_channels", type=int, default=16, help="input channels for DiT, default is 16, skyreels I2V is 32")
+    add("--fp8_llm", action="store_true", help="use fp8 for LLM / LLMにfp8を使う")
+    add("--text_encoder1", type=str, help="Text Encoder 1 directory / テキストエンコーダ1のディレクトリ")
+    add("--text_encoder2", type=str, help="Text Encoder 2 directory / テキストエンコーダ2のディレクトリ")
+    add("--text_encoder_dtype", type=str, default=None, help="data type for Text Encoder, default is float16")
+    # VAE settings
+    add(
+        "--vae_tiling",
+        action="store_true",
+        help="enable spatial tiling for VAE, default is False. If vae_spatial_tile_sample_min_size is set, this is automatically enabled."
+        " / VAEの空間タイリングを有効にする、デフォルトはFalse。vae_spatial_tile_sample_min_sizeが設定されている場合、自動的に有効になります。",
+    )
+    add("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
+    add("--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256")
+    return parser
+
+
+if __name__ == "__main__":
+    # shared options first, then the HunyuanVideo-specific ones on the same parser
+    parser = hv_setup_parser(setup_parser_common())
+
+    # parse the command line, then let read_config_from_file() merge in --config_file if one was given
+    args = read_config_from_file(parser.parse_args(), parser)
+
+    args.fp8_scaled = False  # HunyuanVideo does not support this yet
+
+    trainer = NetworkTrainer()
+    trainer.train(args)
diff --git a/merge_lora.py b/merge_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..93aa4275609a96e8ccdec41d540e6381d32654ac
--- /dev/null
+++ b/merge_lora.py
@@ -0,0 +1,63 @@
+import argparse
+import logging
+import torch
+from safetensors.torch import load_file
+from networks import lora
+from utils.safetensors_utils import mem_eff_save_file
+from hunyuan_model.models import load_transformer
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def parse_args():
+    """Parse the command-line options of the merge script and return the resulting namespace."""
+    parser = argparse.ArgumentParser(description="HunyuanVideo model merger script")
+    add = parser.add_argument
+    add("--dit", type=str, required=True, help="DiT checkpoint path or directory")
+    add("--dit_in_channels", type=int, default=16, help="input channels for DiT, default is 16, skyreels I2V is 32")
+    add("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path")
+    add("--lora_multiplier", type=float, nargs="*", default=[1.0], help="LoRA multiplier (can specify multiple values)")
+    add("--save_merged_model", type=str, required=True, help="Path to save the merged model")
+    add("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use for merging")
+    return parser.parse_args()
+
+
+def main():
+    """Merge LoRA weights into a HunyuanVideo DiT checkpoint and save the merged model.
+
+    ``--device`` is passed only to the merge step; the DiT itself is loaded with "cpu" and torch.bfloat16 arguments.
+    """
+    args = parse_args()
+
+    device = torch.device(args.device)
+    logger.info(f"Using device: {device}")
+
+    # Load DiT model
+    logger.info(f"Loading DiT model from {args.dit}")
+    transformer = load_transformer(args.dit, "torch", False, "cpu", torch.bfloat16, in_channels=args.dit_in_channels)
+    transformer.eval()
+
+    # Merge every LoRA in the order given; a LoRA without a matching multiplier uses 1.0
+    multipliers = args.lora_multiplier or []
+    for index, lora_path in enumerate(args.lora_weight or []):
+        lora_multiplier = multipliers[index] if index < len(multipliers) else 1.0
+
+        logger.info(f"Loading LoRA weights from {lora_path} with multiplier {lora_multiplier}")
+        weights_sd = load_file(lora_path)
+        network = lora.create_network_from_weights_hunyuan_video(
+            lora_multiplier, weights_sd, unet=transformer, for_inference=True
+        )
+
+        logger.info("Merging LoRA weights to DiT model")
+        network.merge_to(None, transformer, weights_sd, device=device, non_blocking=True)
+        logger.info("LoRA weights loaded")
+
+    # Save the merged model
+    logger.info(f"Saving merged model to {args.save_merged_model}")
+    mem_eff_save_file(transformer.state_dict(), args.save_merged_model)
+    logger.info("Merged model saved")
+
+
+if __name__ == "__main__":
+    main()  # run the merge only when this file is executed as a script, not when it is imported
\ No newline at end of file
diff --git a/modules/__init__.py b/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/modules/custom_offloading_utils.py b/modules/custom_offloading_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d813575af2ce4fcccf4a305c1002bf618844e591
--- /dev/null
+++ b/modules/custom_offloading_utils.py
@@ -0,0 +1,266 @@
+from concurrent.futures import ThreadPoolExecutor
+import gc
+import time
+from typing import Optional
+import torch
+import torch.nn as nn
+
+
+def clean_memory_on_device(device: torch.device):
+    r"""
+    Clean memory on the specified device, will be called from training scripts.
+    """
+    gc.collect()
+
+    # device may "cuda" or "cuda:0", so we need to check the type of device
+    if device.type == "cuda":
+        torch.cuda.empty_cache()
+    if device.type == "xpu":
+        torch.xpu.empty_cache()
+    if device.type == "mps":
+        torch.mps.empty_cache()
+
+
+def synchronize_device(device: torch.device):
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+    elif device.type == "xpu":
+        torch.xpu.synchronize()
+    elif device.type == "mps":
+        torch.mps.synchronize()
+
+
+def swap_weight_devices_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module):
+    """
+    Swap the `weight` tensors of two layers of the same class, using a dedicated CUDA stream.
+
+    Submodules are paired by name. For every submodule of `layer_to_cuda` that has a non-None `weight`
+    and a same-named, same-shaped counterpart in `layer_to_cpu`, the counterpart's weight is copied to
+    the CPU and its former storage is then overwritten with, and handed over to, the `layer_to_cuda`
+    weight. Weights without such a counterpart are moved to `device` when they are not already on a
+    device of that type. Only attributes named `weight` are handled.
+
+    Args:
+        device: Target device for the weights of `layer_to_cuda`.
+        layer_to_cpu: Layer whose weights are moved to the CPU (presumably resident on `device` on entry).
+        layer_to_cuda: Layer whose weights are moved to `device` (presumably resident on the CPU on entry).
+    """
+    assert layer_to_cpu.__class__ == layer_to_cuda.__class__
+
+    # each job: (module_to_cpu, module_to_cuda, current weight of module_to_cpu, current weight of module_to_cuda)
+    weight_swap_jobs = []
+
+    # This is not working for all cases (e.g. SD3), so we need to find the corresponding modules
+    # for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()):
+    #     print(module_to_cpu.__class__, module_to_cuda.__class__)
+    #     if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None:
+    #         weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))
+
+    modules_to_cpu = {k: v for k, v in layer_to_cpu.named_modules()}
+    for module_to_cuda_name, module_to_cuda in layer_to_cuda.named_modules():
+        if hasattr(module_to_cuda, "weight") and module_to_cuda.weight is not None:
+            module_to_cpu = modules_to_cpu.get(module_to_cuda_name, None)
+            if module_to_cpu is not None and module_to_cpu.weight.shape == module_to_cuda.weight.shape:
+                weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))
+            else:
+                # no partner to swap with: just make sure this weight ends up on the target device
+                if module_to_cuda.weight.data.device.type != device.type:
+                    # print(
+                    #     f"Module {module_to_cuda_name} not found in CPU model or shape mismatch, so not swapping and moving to device"
+                    # )
+                    module_to_cuda.weight.data = module_to_cuda.weight.data.to(device)
+
+    torch.cuda.current_stream().synchronize()  # this prevents the illegal loss value
+
+    # the copies below are issued on a separate stream
+    stream = torch.cuda.Stream()
+    with torch.cuda.stream(stream):
+        # cuda to cpu
+        # `cuda_data_view` is the weight of module_to_cpu as captured when the job was built
+        for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
+            cuda_data_view.record_stream(stream)
+            module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True)
+
+        # wait for the copies to the CPU before the source storage is overwritten below
+        stream.synchronize()
+
+        # cpu to cuda
+        # the storage that held module_to_cpu's weight is reused as the destination for module_to_cuda's weight
+        for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
+            cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True)
+            module_to_cuda.weight.data = cuda_data_view
+
+    stream.synchronize()
+    torch.cuda.current_stream().synchronize()  # this prevents the illegal loss value
+
+
+def swap_weight_devices_no_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module):
+    """
+    not tested
+    """
+    assert layer_to_cpu.__class__ == layer_to_cuda.__class__
+
+    weight_swap_jobs = []
+    for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()):
+        if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None:
+            weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))
+
+    # device to cpu
+    for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
+        module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True)
+
+    synchronize_device()
+
+    # cpu to device
+    for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
+        cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True)
+        module_to_cuda.weight.data = cuda_data_view
+
+    synchronize_device()
+
+
+def weighs_to_device(layer: nn.Module, device: torch.device):
+    for module in layer.modules():
+        if hasattr(module, "weight") and module.weight is not None:
+            module.weight.data = module.weight.data.to(device, non_blocking=True)
+
+
+class Offloader:
+    """
+    common offloading class
+    """
+
+    def __init__(self, block_type: str, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False):
+        self.block_type = block_type
+        self.num_blocks = num_blocks
+        self.blocks_to_swap = blocks_to_swap
+        self.device = device
+        self.debug = debug
+
+        self.thread_pool = ThreadPoolExecutor(max_workers=1)
+        self.futures = {}
+        self.cuda_available = device.type == "cuda"
+
+    def swap_weight_devices(self, block_to_cpu: nn.Module, block_to_cuda: nn.Module):
+        if self.cuda_available:
+            swap_weight_devices_cuda(self.device, block_to_cpu, block_to_cuda)
+        else:
+            swap_weight_devices_no_cuda(self.device, block_to_cpu, block_to_cuda)
+
+    def _submit_move_blocks(self, blocks, block_idx_to_cpu, block_idx_to_cuda):
+        def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda):
+            if self.debug:
+                start_time = time.perf_counter()
+                print(
+                    f"[{self.block_type}] Move block {bidx_to_cpu} to CPU and block {bidx_to_cuda} to {'CUDA' if self.cuda_available else 'device'}"
+                )
+
+            self.swap_weight_devices(block_to_cpu, block_to_cuda)
+
+            if self.debug:
+                print(f"[{self.block_type}] Moved blocks {bidx_to_cpu} and {bidx_to_cuda} in {time.perf_counter()-start_time:.2f}s")
+            return bidx_to_cpu, bidx_to_cuda  # , event
+
+        block_to_cpu = blocks[block_idx_to_cpu]
+        block_to_cuda = blocks[block_idx_to_cuda]
+
+        self.futures[block_idx_to_cuda] = self.thread_pool.submit(
+            move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda
+        )
+
+    def _wait_blocks_move(self, block_idx):
+        if block_idx not in self.futures:
+            return
+
+        if self.debug:
+            print(f"[{self.block_type}] Wait for block {block_idx}")
+            start_time = time.perf_counter()
+
+        future = self.futures.pop(block_idx)
+        _, bidx_to_cuda = future.result()
+
+        assert block_idx == bidx_to_cuda, f"Block index mismatch: {block_idx} != {bidx_to_cuda}"
+
+        if self.debug:
+            print(f"[{self.block_type}] Waited for block {block_idx}: {time.perf_counter()-start_time:.2f}s")
+
+
+class ModelOffloader(Offloader):
+    """
+    supports forward offloading
+    """
+
+    def __init__(
+        self,
+        block_type: str,
+        blocks: list[nn.Module],
+        num_blocks: int,
+        blocks_to_swap: int,
+        supports_backward: bool,
+        device: torch.device,
+        debug: bool = False,
+    ):
+        super().__init__(block_type, num_blocks, blocks_to_swap, device, debug)
+
+        self.supports_backward = supports_backward
+        self.forward_only = not supports_backward  # forward only offloading: can be changed to True for inference
+
+        if self.supports_backward:
+            # register backward hooks
+            self.remove_handles = []
+            for i, block in enumerate(blocks):
+                hook = self.create_backward_hook(blocks, i)
+                if hook is not None:
+                    handle = block.register_full_backward_hook(hook)
+                    self.remove_handles.append(handle)
+
+    def set_forward_only(self, forward_only: bool):
+        self.forward_only = forward_only
+
+    def __del__(self):
+        if self.supports_backward:
+            for handle in self.remove_handles:
+                handle.remove()
+
+    def create_backward_hook(self, blocks: list[nn.Module], block_index: int) -> Optional[callable]:
+        # -1 for 0-based index
+        num_blocks_propagated = self.num_blocks - block_index - 1
+        swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap
+        waiting = block_index > 0 and block_index <= self.blocks_to_swap
+
+        if not swapping and not waiting:
+            return None
+
+        # create  hook
+        block_idx_to_cpu = self.num_blocks - num_blocks_propagated
+        block_idx_to_cuda = self.blocks_to_swap - num_blocks_propagated
+        block_idx_to_wait = block_index - 1
+
+        def backward_hook(module, grad_input, grad_output):
+            if self.debug:
+                print(f"Backward hook for block {block_index}")
+
+            if swapping:
+                self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda)
+            if waiting:
+                self._wait_blocks_move(block_idx_to_wait)
+            return None
+
+        return backward_hook
+
+    def prepare_block_devices_before_forward(self, blocks: list[nn.Module]):
+        if self.blocks_to_swap is None or self.blocks_to_swap == 0:
+            return
+
+        if self.debug:
+            print(f"[{self.block_type}] Prepare block devices before forward")
+
+        for b in blocks[0 : self.num_blocks - self.blocks_to_swap]:
+            b.to(self.device)
+            weighs_to_device(b, self.device)  # make sure weights are on device
+
+        for b in blocks[self.num_blocks - self.blocks_to_swap :]:
+            b.to(self.device)  # move block to device first
+            weighs_to_device(b, "cpu")  # make sure weights are on cpu
+
+        synchronize_device(self.device)
+        clean_memory_on_device(self.device)
+
+    def wait_for_block(self, block_idx: int):
+        if self.blocks_to_swap is None or self.blocks_to_swap == 0:
+            return
+        self._wait_blocks_move(block_idx)
+
+    def submit_move_blocks_forward(self, blocks: list[nn.Module], block_idx: int):
+        # check if blocks_to_swap is enabled
+        if self.blocks_to_swap is None or self.blocks_to_swap == 0:
+            return
+
+        # if supports_backward and backward is enabled, we swap blocks more than blocks_to_swap in backward pass
+        if not self.forward_only and block_idx >= self.blocks_to_swap:
+            return
+
+        block_idx_to_cpu = block_idx
+        block_idx_to_cuda = self.num_blocks - self.blocks_to_swap + block_idx
+        block_idx_to_cuda = block_idx_to_cuda % self.num_blocks  # this works for forward-only offloading
+        self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda)
diff --git a/modules/fp8_optimization_utils.py b/modules/fp8_optimization_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec4ac4f8d11200cb715ebeb5b9ff55e26aae76ff
--- /dev/null
+++ b/modules/fp8_optimization_utils.py
@@ -0,0 +1,356 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import logging
+
+from tqdm import tqdm
+
+# Module-level logger. basicConfig configures the root logger at INFO level if it has no handlers yet.
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+from utils.device_utils import clean_memory_on_device
+
+
+def calculate_fp8_maxval(exp_bits=4, mantissa_bits=3, sign_bits=1):
+    """
+    Calculate the maximum representable value in FP8 format.
+    Default is E4M3 format (4-bit exponent, 3-bit mantissa, 1-bit sign).
+
+    Args:
+        exp_bits (int): Number of exponent bits
+        mantissa_bits (int): Number of mantissa bits
+        sign_bits (int): Number of sign bits (0 or 1)
+
+    Returns:
+        float: Maximum value representable in FP8 format
+    """
+    assert exp_bits + mantissa_bits + sign_bits == 8, "Total bits must be 8"
+
+    # Calculate exponent bias
+    bias = 2 ** (exp_bits - 1) - 1
+
+    # Calculate maximum mantissa value
+    mantissa_max = 1.0
+    for i in range(mantissa_bits - 1):
+        mantissa_max += 2 ** -(i + 1)
+
+    # Calculate maximum value
+    max_value = mantissa_max * (2 ** (2**exp_bits - 1 - bias))
+
+    return max_value
+
+
+def quantize_tensor_to_fp8(tensor, scale, exp_bits=4, mantissa_bits=3, sign_bits=1, max_value=None, min_value=None):
+    """
+    Quantize a tensor to FP8 format.
+
+    Args:
+        tensor (torch.Tensor): Tensor to quantize
+        scale (float or torch.Tensor): Scale factor
+        exp_bits (int): Number of exponent bits
+        mantissa_bits (int): Number of mantissa bits
+        sign_bits (int): Number of sign bits
+
+    Returns:
+        tuple: (quantized_tensor, scale_factor)
+    """
+    # Create scaled tensor
+    scaled_tensor = tensor / scale
+
+    # Calculate FP8 parameters
+    bias = 2 ** (exp_bits - 1) - 1
+
+    if max_value is None:
+        # Calculate max and min values
+        max_value = calculate_fp8_maxval(exp_bits, mantissa_bits, sign_bits)
+        min_value = -max_value if sign_bits > 0 else 0.0
+
+    # Clamp tensor to range
+    clamped_tensor = torch.clamp(scaled_tensor, min_value, max_value)
+
+    # Quantization process
+    abs_values = torch.abs(clamped_tensor)
+    nonzero_mask = abs_values > 0
+
+    # Calculate log scales (only for non-zero elements)
+    log_scales = torch.zeros_like(clamped_tensor)
+    if nonzero_mask.any():
+        log_scales[nonzero_mask] = torch.floor(torch.log2(abs_values[nonzero_mask]) + bias).detach()
+
+    # Limit log scales and calculate quantization factor
+    log_scales = torch.clamp(log_scales, min=1.0)
+    quant_factor = 2.0 ** (log_scales - mantissa_bits - bias)
+
+    # Quantize and dequantize
+    quantized = torch.round(clamped_tensor / quant_factor) * quant_factor
+
+    return quantized, scale
+
+
+def optimize_state_dict_with_fp8(
+    state_dict, calc_device, target_layer_keys=None, exclude_layer_keys=None, exp_bits=4, mantissa_bits=3, move_to_device=False
+):
+    """
+    Optimize Linear layer weights in a model's state dict to FP8 format.
+
+    Args:
+        state_dict (dict): State dict to optimize, replaced in-place
+        calc_device (str): Device to quantize tensors on
+        target_layer_keys (list, optional): Layer key patterns to target (None for all Linear layers)
+        exclude_layer_keys (list, optional): Layer key patterns to exclude
+        exp_bits (int): Number of exponent bits
+        mantissa_bits (int): Number of mantissa bits
+        move_to_device (bool): Move optimized tensors to the calculating device
+
+    Returns:
+        dict: FP8 optimized state dict
+    """
+    if exp_bits == 4 and mantissa_bits == 3:
+        fp8_dtype = torch.float8_e4m3fn
+    elif exp_bits == 5 and mantissa_bits == 2:
+        fp8_dtype = torch.float8_e5m2
+    else:
+        raise ValueError(f"Unsupported FP8 format: E{exp_bits}M{mantissa_bits}")
+
+    # Calculate FP8 max value
+    max_value = calculate_fp8_maxval(exp_bits, mantissa_bits)
+    min_value = -max_value  # this function supports only signed FP8
+
+    # Create optimized state dict
+    optimized_count = 0
+
+    # Enumerate tarket keys
+    target_state_dict_keys = []
+    for key in state_dict.keys():
+        # Check if it's a weight key and matches target patterns
+        is_target = (target_layer_keys is None or any(pattern in key for pattern in target_layer_keys)) and key.endswith(".weight")
+        is_excluded = exclude_layer_keys is not None and any(pattern in key for pattern in exclude_layer_keys)
+        is_target = is_target and not is_excluded
+
+        if is_target and isinstance(state_dict[key], torch.Tensor):
+            target_state_dict_keys.append(key)
+
+    # Process each key
+    for key in tqdm(target_state_dict_keys):
+        value = state_dict[key]
+
+        # Save original device and dtype
+        original_device = value.device
+        original_dtype = value.dtype
+
+        # Move to calculation device
+        if calc_device is not None:
+            value = value.to(calc_device)
+
+        # Calculate scale factor
+        scale = torch.max(torch.abs(value.flatten())) / max_value
+        # print(f"Optimizing {key} with scale: {scale}")
+
+        # Quantize weight to FP8
+        quantized_weight, _ = quantize_tensor_to_fp8(value, scale, exp_bits, mantissa_bits, 1, max_value, min_value)
+
+        # Add to state dict using original key for weight and new key for scale
+        fp8_key = key  # Maintain original key
+        scale_key = key.replace(".weight", ".scale_weight")
+
+        quantized_weight = quantized_weight.to(fp8_dtype)
+
+        if not move_to_device:
+            quantized_weight = quantized_weight.to(original_device)
+
+        scale_tensor = torch.tensor([scale], dtype=original_dtype, device=quantized_weight.device)
+
+        state_dict[fp8_key] = quantized_weight
+        state_dict[scale_key] = scale_tensor
+
+        optimized_count += 1
+
+        if calc_device is not None:  # optimized_count % 10 == 0 and
+            # free memory on calculation device
+            clean_memory_on_device(calc_device)
+
+    logger.info(f"Number of optimized Linear layers: {optimized_count}")
+    return state_dict
+
+
+def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value=None):
+    """
+    Patched forward method for Linear layers with FP8 weights.
+
+    The layer must carry a `scale_weight` attribute next to its FP8 `weight`. Without `use_scaled_mm`
+    the weight is dequantized (`weight * scale_weight`, in the dtype of `scale_weight`) and a regular
+    `F.linear` is run. With `use_scaled_mm` the input is cast to `torch.float8_e5m2` and multiplied
+    with `torch._scaled_mm`.
+
+    Args:
+        self: Linear layer instance
+        x (torch.Tensor): Input tensor
+        use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series)
+        max_value (float): Maximum value for FP8 quantization. If None, no quantization is applied for input tensor.
+
+    Returns:
+        torch.Tensor: Result of linear transformation
+    """
+    if use_scaled_mm:
+        input_dtype = x.dtype
+        # dtype of the scale buffer, used as the output dtype of the bias branch below
+        original_weight_dtype = self.scale_weight.dtype
+        weight_dtype = self.weight.dtype
+        # dtype the input is cast to before the matmul
+        target_dtype = torch.float8_e5m2
+        assert weight_dtype == torch.float8_e4m3fn, "Only FP8 E4M3FN format is supported"
+        assert x.ndim == 3, "Input tensor must be 3D (batch_size, seq_len, hidden_dim)"
+
+        if max_value is None:
+            # no input quantization
+            scale_x = torch.tensor(1.0, dtype=torch.float32, device=x.device)
+        else:
+            # calculate scale factor for input tensor
+            scale_x = (torch.max(torch.abs(x.flatten())) / max_value).to(torch.float32)
+
+            # quantize input tensor to FP8: this seems to consume a lot of memory
+            x, _ = quantize_tensor_to_fp8(x, scale_x, 5, 2, 1, max_value, -max_value)
+
+        # flatten batch and sequence dimensions into one 2D matrix; the shape is restored on return
+        original_shape = x.shape
+        x = x.reshape(-1, x.shape[2]).to(target_dtype)
+
+        weight = self.weight.t()
+        scale_weight = self.scale_weight.to(torch.float32)
+
+        if self.bias is not None:
+            # float32 is not supported with bias in scaled_mm
+            o = torch._scaled_mm(x, weight, out_dtype=original_weight_dtype, bias=self.bias, scale_a=scale_x, scale_b=scale_weight)
+        else:
+            o = torch._scaled_mm(x, weight, out_dtype=input_dtype, scale_a=scale_x, scale_b=scale_weight)
+
+        # the result is always returned in the dtype of the input
+        return o.reshape(original_shape[0], original_shape[1], -1).to(input_dtype)
+
+    else:
+        # Dequantize the weight
+        original_dtype = self.scale_weight.dtype
+        dequantized_weight = self.weight.to(original_dtype) * self.scale_weight
+
+        # Perform linear transformation
+        if self.bias is not None:
+            output = F.linear(x, dequantized_weight, self.bias)
+        else:
+            output = F.linear(x, dequantized_weight)
+
+        return output
+
+
+def apply_fp8_monkey_patch(model, optimized_state_dict, use_scaled_mm=False):
+    """
+    Apply monkey patching to a model using FP8 optimized state dict.
+
+    Args:
+        model (nn.Module): Model instance to patch
+        optimized_state_dict (dict): FP8 optimized state dict
+        use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series)
+
+    Returns:
+        nn.Module: The patched model (same instance, modified in-place)
+    """
+    # # Calculate FP8 float8_e5m2 max value
+    # max_value = calculate_fp8_maxval(5, 2)
+    max_value = None  # do not quantize input tensor
+
+    # Find all scale keys to identify FP8-optimized layers
+    scale_keys = [k for k in optimized_state_dict.keys() if k.endswith(".scale_weight")]
+
+    # Enumerate patched layers
+    patched_module_paths = set()
+    for scale_key in scale_keys:
+        # Extract module path from scale key (remove .scale_weight)
+        module_path = scale_key.rsplit(".scale_weight", 1)[0]
+        patched_module_paths.add(module_path)
+
+    patched_count = 0
+
+    # Apply monkey patch to each layer with FP8 weights
+    for name, module in model.named_modules():
+        # Check if this module has a corresponding scale_weight
+        has_scale = name in patched_module_paths
+
+        # Apply patch if it's a Linear layer with FP8 scale
+        if isinstance(module, nn.Linear) and has_scale:
+            # register the scale_weight as a buffer to load the state_dict
+            module.register_buffer("scale_weight", torch.tensor(1.0, dtype=module.weight.dtype))
+
+            # Create a new forward method with the patched version.
+            def new_forward(self, x):
+                return fp8_linear_forward_patch(self, x, use_scaled_mm, max_value)
+
+            # Bind method to module
+            module.forward = new_forward.__get__(module, type(module))
+
+            patched_count += 1
+
+    logger.info(f"Number of monkey-patched Linear layers: {patched_count}")
+    return model
+
+
+# Example usage
+def example_usage():
+    # Small test model
+    class TestModel(nn.Module):
+        def __init__(self):
+            super().__init__()
+            fc1 = nn.Linear(768, 3072)
+            act1 = nn.GELU()
+            fc2 = nn.Linear(3072, 768)
+            act2 = nn.GELU()
+            fc3 = nn.Linear(768, 768)
+
+            # Set layer names for testing
+            self.single_blocks = nn.ModuleList([fc1, act1, fc2, act2, fc3])
+
+            self.fc4 = nn.Linear(768, 128)
+
+        def forward(self, x):
+            for layer in self.single_blocks:
+                x = layer(x)
+            x = self.fc4(x)
+            return x
+
+    # Instantiate model
+    test_model = TestModel()
+    test_model.to(torch.float16)  # convert to FP16 for testing
+
+    # Test input tensor
+    test_input = torch.randn(1, 768, dtype=torch.float16)
+
+    # Calculate output before optimization
+    with torch.no_grad():
+        original_output = test_model(test_input)
+        print("original output", original_output[0, :5])
+
+    # Get state dict
+    state_dict = test_model.state_dict()
+
+    # Apply FP8 optimization to state dict
+    cuda_device = torch.device("cuda")
+    optimized_state_dict = optimize_state_dict_with_fp8(state_dict, cuda_device, ["single_blocks"], ["2"])
+
+    # Apply monkey patching to the model
+    optimized_model = TestModel()  # re-instantiate model
+    optimized_model.to(torch.float16)  # convert to FP16 for testing
+    apply_fp8_monkey_patch(optimized_model, optimized_state_dict)
+
+    # Load optimized state dict
+    optimized_model.load_state_dict(optimized_state_dict, strict=True, assign=True)  # assign=True to load buffer
+
+    # Calculate output after optimization
+    with torch.no_grad():
+        optimized_output = optimized_model(test_input)
+        print("optimized output", optimized_output[0, :5])
+
+    # Compare accuracy
+    error = torch.mean(torch.abs(original_output - optimized_output))
+    print(f"Mean absolute error: {error.item()}")
+
+    # Check memory usage
+    original_params = sum(p.nelement() * p.element_size() for p in test_model.parameters()) / (1024 * 1024)
+    print(f"Model parameter memory: {original_params:.2f} MB")
+    optimized_params = sum(p.nelement() * p.element_size() for p in optimized_model.parameters()) / (1024 * 1024)
+    print(f"Optimized model parameter memory: {optimized_params:.2f} MB")
+
+    return test_model
+
+
+# Run the demo only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    example_usage()
diff --git a/modules/scheduling_flow_match_discrete.py b/modules/scheduling_flow_match_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..c507ec4eb050463188e250c20aec8d1fde2c4a5d
--- /dev/null
+++ b/modules/scheduling_flow_match_discrete.py
@@ -0,0 +1,257 @@
+# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modified from diffusers==0.29.2
+#
+# ==============================================================================
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils import BaseOutput, logging
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+
+
+# Module-level logger, obtained through the diffusers logging utilities imported above.
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class FlowMatchDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    # result of one `FlowMatchDiscreteScheduler.step` call; that method computes it in float32
+    prev_sample: torch.FloatTensor
+
+
+class FlowMatchDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Euler scheduler.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        shift (`float`, defaults to 1.0):
+            The shift value for the timestep schedule.
+        reverse (`bool`, defaults to `True`):
+            Whether to reverse the timestep schedule.
+    """
+
+    _compatibles = []
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        shift: float = 1.0,
+        reverse: bool = True,
+        solver: str = "euler",
+        n_tokens: Optional[int] = None,
+    ):
+        sigmas = torch.linspace(1, 0, num_train_timesteps + 1)
+
+        if not reverse:
+            sigmas = sigmas.flip(0)
+
+        self.sigmas = sigmas
+        # the value fed to model
+        self.timesteps = (sigmas[:-1] * num_train_timesteps).to(dtype=torch.float32)
+
+        self._step_index = None
+        self._begin_index = None
+
+        self.supported_solver = ["euler"]
+        if solver not in self.supported_solver:
+            raise ValueError(
+                f"Solver {solver} not supported. Supported solvers: {self.supported_solver}"
+            )
+
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def _sigma_to_t(self, sigma):
+        return sigma * self.config.num_train_timesteps
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        device: Union[str, torch.device] = None,
+        n_tokens: int = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            n_tokens (`int`, *optional*):
+                Number of tokens in the input sequence.
+        """
+        self.num_inference_steps = num_inference_steps
+        
+        sigmas = torch.linspace(1, 0, num_inference_steps + 1)
+        sigmas = self.sd3_time_shift(sigmas)
+
+        if not self.config.reverse:
+            sigmas = 1 - sigmas
+
+        self.sigmas = sigmas
+        self.timesteps = (sigmas[:-1] * self.config.num_train_timesteps).to(
+            dtype=torch.float32, device=device
+        )
+
+        # Reset step index
+        self._step_index = None
+
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    def scale_model_input(
+        self, sample: torch.Tensor, timestep: Optional[int] = None
+    ) -> torch.Tensor:
+        return sample
+
+    def sd3_time_shift(self, t: torch.Tensor):
+        return (self.config.shift * t) / (1 + (self.config.shift - 1) * t)
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        sample: torch.FloatTensor,
+        return_dict: bool = True,
+    ) -> Union[FlowMatchDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            n_tokens (`int`, *optional*):
+                Number of tokens in the input sequence.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
+                tuple.
+
+        Returns:
+            [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+
+        if (
+            isinstance(timestep, int)
+            or isinstance(timestep, torch.IntTensor)
+            or isinstance(timestep, torch.LongTensor)
+        ):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+
+        dt = self.sigmas[self.step_index + 1] - self.sigmas[self.step_index]
+
+        if self.config.solver == "euler":
+            prev_sample = sample + model_output.to(torch.float32) * dt
+        else:
+            raise ValueError(
+                f"Solver {self.config.solver} not supported. Supported solvers: {self.supported_solver}"
+            )
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return FlowMatchDiscreteSchedulerOutput(prev_sample=prev_sample)
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/modules/unet_causal_3d_blocks.py b/modules/unet_causal_3d_blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..27d544170ece6a370cdacfe9e31367b884c2e516
--- /dev/null
+++ b/modules/unet_causal_3d_blocks.py
@@ -0,0 +1,818 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modified from diffusers==0.29.2
+#
+# ==============================================================================
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from einops import rearrange
+
+from diffusers.utils import logging
+from diffusers.models.activations import get_activation
+from diffusers.models.attention_processor import SpatialNorm
+from diffusers.models.attention_processor import Attention
+from diffusers.models.normalization import AdaGroupNorm
+from diffusers.models.normalization import RMSNorm
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def prepare_causal_attention_mask(n_frame: int, n_hw: int, dtype, device, batch_size: Optional[int] = None):
+    """
+    Build a frame-wise causal additive attention mask.
+
+    Tokens are laid out frame-major (`n_frame` frames of `n_hw` tokens each). A query token may attend to every
+    token of its own frame and of all earlier frames: those entries are 0, every other entry is `-inf`.
+
+    Args:
+        n_frame: number of frames.
+        n_hw: number of tokens per frame.
+        dtype: dtype of the returned mask (must be able to represent `-inf`).
+        device: device on which the mask is created.
+        batch_size: if given, the mask is expanded (as a view, without copying) to a leading batch dimension.
+
+    Returns:
+        A `(seq_len, seq_len)` tensor, or `(batch_size, seq_len, seq_len)` when `batch_size` is not None, where
+        `seq_len = n_frame * n_hw`.
+    """
+    seq_len = n_frame * n_hw
+    mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
+    # All rows belonging to one frame share the same visible prefix, so fill one frame-sized block of rows per
+    # iteration instead of issuing one slice assignment per token.
+    for i_frame in range(n_frame):
+        mask[i_frame * n_hw : (i_frame + 1) * n_hw, : (i_frame + 1) * n_hw] = 0
+    if batch_size is not None:
+        mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
+    return mask
+
+
+class CausalConv3d(nn.Module):
+    """
+    Implements a causal 3D convolution layer where each position only depends on previous timesteps and current spatial locations.
+    This maintains temporal causality in video generation tasks.
+
+    The input is padded on both sides of the height and width axes and only at the front of the time axis, then an
+    unpadded `nn.Conv3d` is applied. With a non-zero `chunk_size`, sufficiently wide inputs are convolved in
+    overlapping slices along the last (width) axis and the partial outputs are concatenated.
+    """
+
+    def __init__(
+        self,
+        chan_in,
+        chan_out,
+        kernel_size: Union[int, Tuple[int, int, int]],
+        stride: Union[int, Tuple[int, int, int]] = 1,
+        dilation: Union[int, Tuple[int, int, int]] = 1,
+        pad_mode="replicate",
+        chunk_size=0,
+        **kwargs,
+    ):
+        """
+        Args:
+            chan_in: number of input channels.
+            chan_out: number of output channels.
+            kernel_size: a single int, or a `(T, H, W)` tuple/list.
+            stride: stride forwarded to `nn.Conv3d`.
+            dilation: dilation forwarded to `nn.Conv3d` (not taken into account when computing the padding).
+            pad_mode: padding mode passed to `F.pad`.
+            chunk_size: width of the slices used by the chunked forward; 0 disables chunking.
+            **kwargs: extra keyword arguments forwarded to `nn.Conv3d`.
+        """
+        super().__init__()
+
+        self.pad_mode = pad_mode
+        # Accept both forms advertised by the annotation: a scalar applies to all three axes, a sequence is (T, H, W).
+        if isinstance(kernel_size, (tuple, list)):
+            kernel_t, kernel_h, kernel_w = kernel_size
+        else:
+            kernel_t = kernel_h = kernel_w = kernel_size
+        # F.pad consumes the padding last-dimension-first: (W_left, W_right, H_top, H_bottom, T_front, T_back).
+        # Only the front of the time axis is padded, which is what makes the convolution causal.
+        padding = (kernel_w // 2, kernel_w // 2, kernel_h // 2, kernel_h // 2, kernel_t - 1, 0)  # W, H, T
+        self.time_causal_padding = padding
+        self.chunk_size = chunk_size
+
+        self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs)
+
+    def original_forward(self, x):
+        """Pad `x` with the causal padding and convolve the whole tensor in one call."""
+        x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
+        return self.conv(x)
+
+    def forward(self, x):
+        """
+        Apply the causal convolution, chunking along the last axis when `chunk_size` is set and the input is wide.
+
+        Falls back to `original_forward` when `chunk_size == 0` or when `x.shape[4] < chunk_size * 1.5`.
+        The chunked path only supports cubic kernels (enforced with an assertion).
+        """
+        if self.chunk_size == 0:
+            return self.original_forward(x)
+
+        # if not large, call original forward
+        if x.shape[4] < self.chunk_size * 1.5:
+            return self.original_forward(x)
+
+        # get the kernel size
+        kernel_size = self.conv.kernel_size[0]  # assume cubic kernel
+        assert kernel_size == self.conv.kernel_size[1] == self.conv.kernel_size[2], "Only cubic kernels are supported"
+        padding_size = kernel_size // 2  # 1 for kernel_size=3, 0 for kernel_size=1
+
+        x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
+
+        B, C, D, H, W = orig_shape = x.shape
+        chunk_size = self.chunk_size
+        chunk_size -= chunk_size % self.conv.stride[2]  # make sure the chunk size is divisible by stride
+
+        # calculate the indices for chunking with overlap and padding by kernel size and stride
+        indices = []
+        i = 0
+        while i < W - padding_size:
+            start_idx = i - padding_size
+            end_idx = min(i + chunk_size + padding_size, W)
+            if i == 0:
+                start_idx = 0
+                end_idx += padding_size  # to make sure the first chunk is divisible by stride
+            if W - end_idx < chunk_size // 2:  # small chunk at the end
+                end_idx = W
+            indices.append((start_idx, end_idx))
+            i = end_idx - padding_size
+
+        # x is already padded, so each slice goes straight through the convolution
+        chunks = []
+        for start_idx, end_idx in indices:
+            chunk = x[:, :, :, :, start_idx:end_idx]
+            chunk_output = self.conv(chunk)
+            chunks.append(chunk_output)
+
+        # concatenate the chunks
+        x = torch.cat(chunks, dim=4)
+
+        # sanity-check the stitched output against the size an unchunked convolution would produce
+        assert (
+            x.shape[2] == ((D - padding_size * 2) + self.conv.stride[0] - 1) // self.conv.stride[0]
+        ), f"Invalid shape: {x.shape}, {orig_shape}, {padding_size}, {self.conv.stride}"
+        assert (
+            x.shape[3] == ((H - padding_size * 2) + self.conv.stride[1] - 1) // self.conv.stride[1]
+        ), f"Invalid shape: {x.shape}, {orig_shape}, {padding_size}, {self.conv.stride}"
+        assert (
+            x.shape[4] == ((W - padding_size * 2) + self.conv.stride[2] - 1) // self.conv.stride[2]
+        ), f"Invalid shape: {x.shape}, {orig_shape}, {padding_size}, {self.conv.stride}"
+
+        return x
+
+
+class UpsampleCausal3D(nn.Module):
+    """
+    A 3D upsampling layer with an optional convolution.
+
+    The first frame along dim 2 is upsampled only spatially (with `upsample_factor[1:]`), while the remaining
+    frames are upsampled with the full `upsample_factor`; the two parts are concatenated back along dim 2.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = False,
+        use_conv_transpose: bool = False,
+        out_channels: Optional[int] = None,
+        name: str = "conv",
+        kernel_size: Optional[int] = None,
+        padding=1,
+        norm_type=None,
+        eps=None,
+        elementwise_affine=None,
+        bias=True,
+        interpolate=True,
+        upsample_factor=(2, 2, 2),
+    ):
+        """
+        Args:
+            channels: number of input channels (checked in `forward`).
+            use_conv: if True, a `CausalConv3d` is applied after the interpolation.
+            use_conv_transpose: not supported; passing True raises `NotImplementedError`.
+            out_channels: output channels of the convolution; defaults to `channels`.
+            name: attribute name used for the convolution (`conv`, otherwise `Conv2d_0`).
+            kernel_size: kernel size of the convolution; defaults to 3 when `use_conv` is True.
+            padding: accepted but not used in this constructor.
+            norm_type: `"ln_norm"`, `"rms_norm"` or None; any other value raises `ValueError`.
+            eps: epsilon passed to the norm layer.
+            elementwise_affine: passed to the norm layer.
+            bias: whether the convolution has a bias.
+            interpolate: if False, the interpolation step in `forward` is skipped.
+            upsample_factor: scale factors passed to `F.interpolate`.
+        """
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+        self.interpolate = interpolate
+        self.upsample_factor = upsample_factor
+
+        # A norm layer can be constructed here, but `forward` raises NotImplementedError whenever it is set.
+        if norm_type == "ln_norm":
+            self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(channels, eps, elementwise_affine)
+        elif norm_type is None:
+            self.norm = None
+        else:
+            raise ValueError(f"unknown norm_type: {norm_type}")
+
+        conv = None
+        if use_conv_transpose:
+            raise NotImplementedError
+        elif use_conv:
+            if kernel_size is None:
+                kernel_size = 3
+            conv = CausalConv3d(self.channels, self.out_channels, kernel_size=kernel_size, bias=bias)
+
+        # The attribute name depends on `name`; `conv` stays None when `use_conv` is False.
+        if name == "conv":
+            self.conv = conv
+        else:
+            self.Conv2d_0 = conv
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        output_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        """
+        Upsample `hidden_states` and optionally apply the convolution.
+
+        Args:
+            hidden_states: 5D tensor whose dim 1 must equal `self.channels`.
+            output_size: must be None; any other value raises `NotImplementedError` when interpolating.
+            scale: accepted but not used in this method.
+
+        Raises:
+            NotImplementedError: if a norm layer was configured or `output_size` is given.
+        """
+        assert hidden_states.shape[1] == self.channels
+
+        if self.norm is not None:
+            raise NotImplementedError
+
+        # `__init__` raises for `use_conv_transpose=True`, so this branch is not reached for instances built through it.
+        if self.use_conv_transpose:
+            return self.conv(hidden_states)
+
+        # Cast to float32 as 'upsample_nearest2d_out_frame' op does not support bfloat16
+        dtype = hidden_states.dtype
+        if dtype == torch.bfloat16:
+            hidden_states = hidden_states.to(torch.float32)
+
+        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+        if hidden_states.shape[0] >= 64:
+            hidden_states = hidden_states.contiguous()
+
+        # Only `output_size=None` is supported: the interpolation always uses `self.upsample_factor`
+        # and a given `output_size` raises NotImplementedError.
+        if self.interpolate:
+            B, C, T, H, W = hidden_states.shape
+            # split off the first frame; `other_h` is empty along dim 2 when T == 1
+            first_h, other_h = hidden_states.split((1, T - 1), dim=2)
+            if output_size is None:
+                if T > 1:
+                    other_h = F.interpolate(other_h, scale_factor=self.upsample_factor, mode="nearest")
+
+                # the first frame is upsampled as a 4D tensor, i.e. spatially only
+                first_h = first_h.squeeze(2)
+                first_h = F.interpolate(first_h, scale_factor=self.upsample_factor[1:], mode="nearest")
+                first_h = first_h.unsqueeze(2)
+            else:
+                raise NotImplementedError
+
+            if T > 1:
+                hidden_states = torch.cat((first_h, other_h), dim=2)
+            else:
+                hidden_states = first_h
+
+        # If the input is bfloat16, we cast back to bfloat16
+        if dtype == torch.bfloat16:
+            hidden_states = hidden_states.to(dtype)
+
+        if self.use_conv:
+            if self.name == "conv":
+                hidden_states = self.conv(hidden_states)
+            else:
+                hidden_states = self.Conv2d_0(hidden_states)
+
+        return hidden_states
+
+
+class DownsampleCausal3D(nn.Module):
+    """
+    A 3D downsampling layer implemented as a strided `CausalConv3d`, with an optional normalization in front.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = False,
+        out_channels: Optional[int] = None,
+        padding: int = 1,
+        name: str = "conv",
+        kernel_size=3,
+        norm_type=None,
+        eps=None,
+        elementwise_affine=None,
+        bias=True,
+        stride=2,
+    ):
+        """
+        Args:
+            channels: number of input channels (checked in `forward`).
+            use_conv: must be True; the non-convolutional variant raises `NotImplementedError`.
+            out_channels: output channels of the convolution; defaults to `channels`.
+            padding: stored on the module; the convolution computes its own causal padding.
+            name: when `"conv"`, the convolution is additionally registered as `Conv2d_0`.
+            kernel_size: kernel size of the convolution.
+            norm_type: `"ln_norm"`, `"rms_norm"` or None; any other value raises `ValueError`.
+            eps: epsilon passed to the norm layer.
+            elementwise_affine: passed to the norm layer.
+            bias: whether the convolution has a bias.
+            stride: stride of the convolution, i.e. the downsampling factor.
+        """
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        self.name = name
+
+        if norm_type == "ln_norm":
+            self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(channels, eps, elementwise_affine)
+        elif norm_type is None:
+            self.norm = None
+        else:
+            raise ValueError(f"unknown norm_type: {norm_type}")
+
+        if use_conv:
+            conv = CausalConv3d(self.channels, self.out_channels, kernel_size=kernel_size, stride=stride, bias=bias)
+        else:
+            raise NotImplementedError
+
+        # The convolution is always reachable as `self.conv`; for name == "conv" it is also registered as
+        # `Conv2d_0` (registered first, so the module/state-dict key order is unchanged).
+        if name == "conv":
+            self.Conv2d_0 = conv
+        self.conv = conv
+
+    def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
+        """
+        Normalize (if configured) and downsample `hidden_states`.
+
+        Args:
+            hidden_states: channels-first tensor whose dim 1 must equal `self.channels`.
+            scale: accepted but not used in this method.
+        """
+        assert hidden_states.shape[1] == self.channels
+
+        if self.norm is not None:
+            # The norm layers act on the last dimension, so move channels last and back again. `movedim` works for
+            # any rank; a fixed 4-D permute raises on the 5-D tensors this layer receives.
+            hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
+
+        assert hidden_states.shape[1] == self.channels
+
+        hidden_states = self.conv(hidden_states)
+
+        return hidden_states
+
+
+class ResnetBlockCausal3D(nn.Module):
+    r"""
+    A Resnet block built from `CausalConv3d` layers.
+
+    The block applies norm -> activation -> conv twice, optionally injects a time embedding between the two
+    convolutions, and adds the (optionally 1x1x1-projected) input as a residual. The sum is divided by
+    `output_scale_factor`.
+
+    Args:
+        in_channels: number of input channels.
+        out_channels: number of channels after the first convolution; defaults to `in_channels`.
+        conv_shortcut: stored as `use_conv_shortcut`.
+        dropout: dropout probability applied before the second convolution.
+        temb_channels: size of the time embedding, or None to disable the time-embedding projection.
+        groups: number of groups of the first normalization.
+        groups_out: number of groups of the second normalization; defaults to `groups`.
+        pre_norm: accepted for interface compatibility; `self.pre_norm` is always True.
+        eps: epsilon of the normalization layers.
+        non_linearity: activation name resolved through `get_activation`.
+        skip_time_act: if True, the activation is not applied to the time embedding.
+        time_embedding_norm: one of `"default"`, `"scale_shift"`, `"ada_group"`, `"spatial"`.
+        kernel: accepted but not used.
+        output_scale_factor: divisor applied to the residual sum.
+        use_in_shortcut: force (or disable) the shortcut projection; by default it is used when the input and
+            output channel counts differ.
+        up: if True, input and hidden states are upsampled with `UpsampleCausal3D`.
+        down: if True, input and hidden states are downsampled with `DownsampleCausal3D`.
+        conv_shortcut_bias: whether the shortcut projection has a bias.
+        conv_3d_out_channels: output channels of the second convolution; defaults to `out_channels`.
+    """
+
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        conv_shortcut: bool = False,
+        dropout: float = 0.0,
+        temb_channels: int = 512,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
+        pre_norm: bool = True,
+        eps: float = 1e-6,
+        non_linearity: str = "swish",
+        skip_time_act: bool = False,
+        # default, scale_shift, ada_group, spatial
+        time_embedding_norm: str = "default",
+        kernel: Optional[torch.FloatTensor] = None,
+        output_scale_factor: float = 1.0,
+        use_in_shortcut: Optional[bool] = None,
+        up: bool = False,
+        down: bool = False,
+        conv_shortcut_bias: bool = True,
+        conv_3d_out_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        # the `pre_norm` argument is ignored: the block always normalizes before each convolution
+        self.pre_norm = True
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.up = up
+        self.down = down
+        self.output_scale_factor = output_scale_factor
+        self.time_embedding_norm = time_embedding_norm
+        self.skip_time_act = skip_time_act
+
+        linear_cls = nn.Linear
+
+        if groups_out is None:
+            groups_out = groups
+
+        if self.time_embedding_norm == "ada_group":
+            self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
+        elif self.time_embedding_norm == "spatial":
+            self.norm1 = SpatialNorm(in_channels, temb_channels)
+        else:
+            self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+
+        self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, stride=1)
+
+        if temb_channels is not None:
+            if self.time_embedding_norm == "default":
+                self.time_emb_proj = linear_cls(temb_channels, out_channels)
+            elif self.time_embedding_norm == "scale_shift":
+                # twice the channels: the projection is split into scale and shift in `forward`
+                self.time_emb_proj = linear_cls(temb_channels, 2 * out_channels)
+            elif self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
+                # these modes feed the embedding to the norm layers instead
+                self.time_emb_proj = None
+            else:
+                raise ValueError(f"Unknown time_embedding_norm : {self.time_embedding_norm} ")
+        else:
+            self.time_emb_proj = None
+
+        if self.time_embedding_norm == "ada_group":
+            self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
+        elif self.time_embedding_norm == "spatial":
+            self.norm2 = SpatialNorm(out_channels, temb_channels)
+        else:
+            self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+
+        self.dropout = torch.nn.Dropout(dropout)
+        conv_3d_out_channels = conv_3d_out_channels or out_channels
+        self.conv2 = CausalConv3d(out_channels, conv_3d_out_channels, kernel_size=3, stride=1)
+
+        self.nonlinearity = get_activation(non_linearity)
+
+        self.upsample = self.downsample = None
+        if self.up:
+            self.upsample = UpsampleCausal3D(in_channels, use_conv=False)
+        elif self.down:
+            self.downsample = DownsampleCausal3D(in_channels, use_conv=False, name="op")
+
+        self.use_in_shortcut = self.in_channels != conv_3d_out_channels if use_in_shortcut is None else use_in_shortcut
+
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            # 1x1x1 projection so the residual matches the output channel count
+            self.conv_shortcut = CausalConv3d(
+                in_channels,
+                conv_3d_out_channels,
+                kernel_size=1,
+                stride=1,
+                bias=conv_shortcut_bias,
+            )
+
+    def forward(
+        self,
+        input_tensor: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        """
+        Run the residual block.
+
+        Args:
+            input_tensor: 5D channels-first activations.
+            temb: time embedding, or None when the block was built with `temb_channels=None`.
+            scale: forwarded to the up/down-sampling layers.
+
+        Returns:
+            `(shortcut(input_tensor) + residual_branch) / output_scale_factor`.
+        """
+        hidden_states = input_tensor
+
+        if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
+            hidden_states = self.norm1(hidden_states, temb)
+        else:
+            hidden_states = self.norm1(hidden_states)
+
+        hidden_states = self.nonlinearity(hidden_states)
+
+        if self.upsample is not None:
+            # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+            if hidden_states.shape[0] >= 64:
+                input_tensor = input_tensor.contiguous()
+                hidden_states = hidden_states.contiguous()
+            input_tensor = self.upsample(input_tensor, scale=scale)
+            hidden_states = self.upsample(hidden_states, scale=scale)
+        elif self.downsample is not None:
+            input_tensor = self.downsample(input_tensor, scale=scale)
+            hidden_states = self.downsample(hidden_states, scale=scale)
+
+        hidden_states = self.conv1(hidden_states)
+
+        if self.time_emb_proj is not None:
+            if not self.skip_time_act:
+                temb = self.nonlinearity(temb)
+            # `time_emb_proj` is a plain nn.Linear, which takes a single argument, and the activations are 5D, so
+            # the projected embedding needs three trailing singleton axes to broadcast over (T, H, W).
+            temb = self.time_emb_proj(temb)[:, :, None, None, None]
+
+        if temb is not None and self.time_embedding_norm == "default":
+            hidden_states = hidden_states + temb
+
+        if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
+            hidden_states = self.norm2(hidden_states, temb)
+        else:
+            hidden_states = self.norm2(hidden_states)
+
+        if temb is not None and self.time_embedding_norm == "scale_shift":
+            scale, shift = torch.chunk(temb, 2, dim=1)
+            hidden_states = hidden_states * (1 + scale) + shift
+
+        hidden_states = self.nonlinearity(hidden_states)
+
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+
+        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+
+        return output_tensor
+
+
+def get_down_block3d(
+    down_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    temb_channels: int,
+    add_downsample: bool,
+    downsample_stride: int,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    downsample_padding: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    downsample_type: Optional[str] = None,
+    dropout: float = 0.0,
+):
+    """
+    Factory for encoder down blocks.
+
+    Only `"DownEncoderBlockCausal3D"` (optionally prefixed with `"UNetRes"`) is supported. Of the many parameters
+    only `num_layers`, `in_channels`, `out_channels`, `dropout`, `add_downsample`, `downsample_stride`,
+    `resnet_eps`, `resnet_act_fn`, `resnet_groups`, `downsample_padding` and `resnet_time_scale_shift` are
+    forwarded to the block; the remaining ones are accepted but not used here.
+
+    Returns:
+        The constructed `DownEncoderBlockCausal3D`.
+
+    Raises:
+        ValueError: if `down_block_type` is not a supported block type.
+    """
+    # If attn head dim is not defined, we default it to the number of heads
+    if attention_head_dim is None:
+        # `Logger.warn` is a deprecated alias; `warning` is the supported spelling.
+        logger.warning(
+            f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
+        )
+        attention_head_dim = num_attention_heads
+
+    # strip the 7-character "UNetRes" prefix
+    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+    if down_block_type == "DownEncoderBlockCausal3D":
+        return DownEncoderBlockCausal3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            downsample_stride=downsample_stride,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block3d(
+    up_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    prev_output_channel: int,
+    temb_channels: int,
+    add_upsample: bool,
+    upsample_scale_factor: Tuple,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    resolution_idx: Optional[int] = None,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    upsample_type: Optional[str] = None,
+    dropout: float = 0.0,
+) -> nn.Module:
+    """
+    Factory for decoder up blocks.
+
+    Only `"UpDecoderBlockCausal3D"` (optionally prefixed with `"UNetRes"`) is supported. Of the many parameters
+    only `num_layers`, `in_channels`, `out_channels`, `resolution_idx`, `dropout`, `add_upsample`,
+    `upsample_scale_factor`, `resnet_eps`, `resnet_act_fn`, `resnet_groups`, `resnet_time_scale_shift` and
+    `temb_channels` are forwarded to the block; the remaining ones are accepted but not used here.
+
+    Returns:
+        The constructed `UpDecoderBlockCausal3D`.
+
+    Raises:
+        ValueError: if `up_block_type` is not a supported block type.
+    """
+    # If attn head dim is not defined, we default it to the number of heads
+    if attention_head_dim is None:
+        # `Logger.warn` is a deprecated alias; `warning` is the supported spelling.
+        logger.warning(
+            f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
+        )
+        attention_head_dim = num_attention_heads
+
+    # strip the 7-character "UNetRes" prefix
+    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    if up_block_type == "UpDecoderBlockCausal3D":
+        return UpDecoderBlockCausal3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            upsample_scale_factor=upsample_scale_factor,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            temb_channels=temb_channels,
+        )
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+class UNetMidBlockCausal3D(nn.Module):
+    """
+    A 3D UNet mid-block [`UNetMidBlockCausal3D`] with multiple residual blocks and optional attention blocks.
+
+    The block holds `num_layers + 1` resnets and `num_layers` attention slots (each slot is None when
+    `add_attention` is False). Attention runs over all `(frame, height, width)` positions flattened into one
+    sequence, using the frame-wise causal mask from `prepare_causal_attention_mask`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        attn_groups: Optional[int] = None,
+        resnet_pre_norm: bool = True,
+        add_attention: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+    ):
+        """
+        Args:
+            in_channels: number of channels; every resnet keeps this channel count.
+            temb_channels: size of the time embedding passed to the resnets (and to spatial norm).
+            dropout: dropout probability of the resnets.
+            num_layers: number of (attention, resnet) pairs after the first resnet.
+            resnet_eps: epsilon of the normalization layers.
+            resnet_time_scale_shift: time-embedding mode of the resnets.
+            resnet_act_fn: activation name of the resnets.
+            resnet_groups: group count of the resnet norms; None selects `min(in_channels // 4, 32)`.
+            attn_groups: group count of the attention norm; defaults to `resnet_groups` in "default" mode.
+            resnet_pre_norm: forwarded to the resnets.
+            add_attention: whether attention layers are created.
+            attention_head_dim: channels per attention head; None falls back to `in_channels`.
+            output_scale_factor: forwarded to the resnets and the attention layers.
+        """
+        super().__init__()
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        self.add_attention = add_attention
+
+        if attn_groups is None:
+            attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlockCausal3D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        if attention_head_dim is None:
+            # `Logger.warn` is a deprecated alias; `warning` is the supported spelling.
+            logger.warning(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
+            )
+            attention_head_dim = in_channels
+
+        for _ in range(num_layers):
+            if self.add_attention:
+                attentions.append(
+                    Attention(
+                        in_channels,
+                        heads=in_channels // attention_head_dim,
+                        dim_head=attention_head_dim,
+                        rescale_output_factor=output_scale_factor,
+                        eps=resnet_eps,
+                        norm_num_groups=attn_groups,
+                        spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
+                        residual_connection=True,
+                        bias=True,
+                        upcast_softmax=True,
+                        _from_deprecated_attn_block=True,
+                    )
+                )
+            else:
+                # keep a placeholder so attentions and resnets[1:] stay aligned in `forward`
+                attentions.append(None)
+
+            resnets.append(
+                ResnetBlockCausal3D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+        """
+        Apply the first resnet, then each (attention, resnet) pair.
+
+        Args:
+            hidden_states: tensor of shape `(B, C, T, H, W)`.
+            temb: optional time embedding passed to the resnets and attention layers.
+
+        Returns:
+            A tensor with the same layout as `hidden_states`.
+        """
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if attn is not None:
+                B, C, T, H, W = hidden_states.shape
+                # flatten frames and pixels into one token sequence for attention
+                hidden_states = rearrange(hidden_states, "b c f h w -> b (f h w) c")
+                attention_mask = prepare_causal_attention_mask(T, H * W, hidden_states.dtype, hidden_states.device, batch_size=B)
+                hidden_states = attn(hidden_states, temb=temb, attention_mask=attention_mask)
+                hidden_states = rearrange(hidden_states, "b (f h w) c -> b c f h w", f=T, h=H, w=W)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
+
+
+class DownEncoderBlockCausal3D(nn.Module):
+    """
+    Encoder stage: a stack of `ResnetBlockCausal3D` layers (built without a time embedding) followed by an
+    optional strided `DownsampleCausal3D`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_stride: int = 2,
+        downsample_padding: int = 1,
+    ):
+        super().__init__()
+
+        # only the first resnet sees `in_channels`; every later one maps out_channels -> out_channels
+        layer_inputs = [in_channels if layer_idx == 0 else out_channels for layer_idx in range(num_layers)]
+        self.resnets = nn.ModuleList(
+            [
+                ResnetBlockCausal3D(
+                    in_channels=layer_in,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+                for layer_in in layer_inputs
+            ]
+        )
+
+        downsamplers = None
+        if add_downsample:
+            downsample_layer = DownsampleCausal3D(
+                out_channels,
+                use_conv=True,
+                out_channels=out_channels,
+                padding=downsample_padding,
+                name="op",
+                stride=downsample_stride,
+            )
+            downsamplers = nn.ModuleList([downsample_layer])
+        self.downsamplers = downsamplers
+
+    def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
+        """Run all resnets (with `temb=None`) and then the downsamplers, if any; `scale` is passed through."""
+        for block in self.resnets:
+            hidden_states = block(hidden_states, temb=None, scale=scale)
+
+        if self.downsamplers is None:
+            return hidden_states
+
+        for down in self.downsamplers:
+            hidden_states = down(hidden_states, scale)
+        return hidden_states
+
+
+class UpDecoderBlockCausal3D(nn.Module):
+    """
+    Decoder stage: a stack of `ResnetBlockCausal3D` layers followed by an optional `UpsampleCausal3D`
+    (with convolution) using `upsample_scale_factor`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        upsample_scale_factor=(2, 2, 2),
+        temb_channels: Optional[int] = None,
+    ):
+        super().__init__()
+
+        # only the first resnet sees `in_channels`; every later one maps out_channels -> out_channels
+        layer_inputs = [in_channels if layer_idx == 0 else out_channels for layer_idx in range(num_layers)]
+        self.resnets = nn.ModuleList(
+            [
+                ResnetBlockCausal3D(
+                    in_channels=layer_in,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+                for layer_in in layer_inputs
+            ]
+        )
+
+        upsamplers = None
+        if add_upsample:
+            upsample_layer = UpsampleCausal3D(
+                out_channels,
+                use_conv=True,
+                out_channels=out_channels,
+                upsample_factor=upsample_scale_factor,
+            )
+            upsamplers = nn.ModuleList([upsample_layer])
+        self.upsamplers = upsamplers
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> torch.FloatTensor:
+        """Run all resnets (passing `temb` and `scale`) and then the upsamplers, if any."""
+        for block in self.resnets:
+            hidden_states = block(hidden_states, temb=temb, scale=scale)
+
+        if self.upsamplers is None:
+            return hidden_states
+
+        for up in self.upsamplers:
+            hidden_states = up(hidden_states)
+        return hidden_states
diff --git a/networks/__init__.py b/networks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/networks/lora.py b/networks/lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..971828958abc6d7a47e4bf3103f75abaf7299700
--- /dev/null
+++ b/networks/lora.py
@@ -0,0 +1,913 @@
+# LoRA network module: currently conv2d is not fully supported
+# reference:
+# https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
+# https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py
+
+import ast
+import math
+import os
+import re
+from typing import Dict, List, Optional, Type, Union
+from transformers import CLIPTextModel
+import numpy as np
+import torch
+import torch.nn as nn
+
+import logging
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+HUNYUAN_TARGET_REPLACE_MODULES = ["MMDoubleStreamBlock", "MMSingleStreamBlock"]
+
+
+class LoRAModule(torch.nn.Module):
+    """
+    replaces forward method of the original Linear/Conv2d, instead of replacing the original module itself.
+    """
+
+    def __init__(
+        self,
+        lora_name,
+        org_module: torch.nn.Module,
+        multiplier=1.0,
+        lora_dim=4,
+        alpha=1,
+        dropout=None,
+        rank_dropout=None,
+        module_dropout=None,
+        split_dims: Optional[List[int]] = None,
+    ):
+        """
+        org_module is the Linear/Conv2d to adapt, lora_dim is the rank. if alpha == 0 or None, alpha is rank (no scaling).
+        dropout, rank_dropout and module_dropout are probabilities that forward() applies only in training mode.
+        split_dims (Linear only) builds one down/up pair per output slice, to mimic the split qkv of multi-head attention.
+        """
+        super().__init__()
+        self.lora_name = lora_name
+
+        if org_module.__class__.__name__ == "Conv2d":
+            in_dim = org_module.in_channels
+            out_dim = org_module.out_channels
+        else:
+            in_dim = org_module.in_features
+            out_dim = org_module.out_features
+
+        self.lora_dim = lora_dim
+        self.split_dims = split_dims
+
+        if split_dims is None:
+            if org_module.__class__.__name__ == "Conv2d":
+                kernel_size = org_module.kernel_size
+                stride = org_module.stride
+                padding = org_module.padding
+                self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False)
+                self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False)
+            else:
+                self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False)
+                self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False)
+
+            torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5))
+            torch.nn.init.zeros_(self.lora_up.weight)  # zero up-projection: the LoRA branch initially outputs 0
+        else:
+            # conv2d not supported
+            assert sum(split_dims) == out_dim, "sum of split_dims must be equal to out_dim"
+            assert org_module.__class__.__name__ == "Linear", "split_dims is only supported for Linear"
+            # one independent down/up pair per entry of split_dims
+            self.lora_down = torch.nn.ModuleList(
+                [torch.nn.Linear(in_dim, self.lora_dim, bias=False) for _ in range(len(split_dims))]
+            )
+            self.lora_up = torch.nn.ModuleList([torch.nn.Linear(self.lora_dim, split_dim, bias=False) for split_dim in split_dims])
+            for lora_down in self.lora_down:
+                torch.nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5))
+            for lora_up in self.lora_up:
+                torch.nn.init.zeros_(lora_up.weight)
+
+        if isinstance(alpha, torch.Tensor):
+            alpha = alpha.detach().float().cpu().numpy()  # without casting, bf16 causes error; numpy() needs a CPU tensor
+        alpha = self.lora_dim if alpha is None or alpha == 0 else alpha
+        self.scale = alpha / self.lora_dim
+        self.register_buffer("alpha", torch.tensor(alpha))  # for save/load
+
+        # same as microsoft's
+        self.multiplier = multiplier
+        self.org_module = org_module  # remove in applying
+        self.dropout = dropout
+        self.rank_dropout = rank_dropout
+        self.module_dropout = module_dropout
+
+    def apply_to(self):
+        self.org_forward = self.org_module.forward  # keep the original forward so forward() can call it
+        self.org_module.forward = self.forward  # the original module now routes its calls through this LoRA
+        del self.org_module  # presumably so the wrapped module is not kept as a child of this module
+
+    def forward(self, x):
+        org_forwarded = self.org_forward(x)
+
+        # module dropout
+        if self.module_dropout is not None and self.training:
+            if torch.rand(1) < self.module_dropout:
+                return org_forwarded
+
+        if self.split_dims is None:
+            lx = self.lora_down(x)
+
+            # normal dropout
+            if self.dropout is not None and self.training:
+                lx = torch.nn.functional.dropout(lx, p=self.dropout)
+
+            # rank dropout
+            if self.rank_dropout is not None and self.training:
+                mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout
+                if len(lx.size()) == 3:
+                    mask = mask.unsqueeze(1)  # for Text Encoder
+                elif len(lx.size()) == 4:
+                    mask = mask.unsqueeze(-1).unsqueeze(-1)  # for Conv2d
+                lx = lx * mask
+
+                # scaling for rank dropout: treat as if the rank is changed
+                scale = self.scale * (1.0 / (1.0 - self.rank_dropout))  # redundant for readability
+            else:
+                scale = self.scale
+
+            lx = self.lora_up(lx)
+
+            return org_forwarded + lx * self.multiplier * scale
+        else:
+            lxs = [lora_down(x) for lora_down in self.lora_down]
+
+            # normal dropout
+            if self.dropout is not None and self.training:
+                lxs = [torch.nn.functional.dropout(lx, p=self.dropout) for lx in lxs]
+
+            # rank dropout
+            if self.rank_dropout is not None and self.training:
+                masks = [torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout for lx in lxs]
+                for i in range(len(lxs)):
+                    if len(lxs[i].size()) == 3:  # index lxs[i]: no `lx` variable is bound in this branch
+                        masks[i] = masks[i].unsqueeze(1)
+                    elif len(lxs[i].size()) == 4:
+                        masks[i] = masks[i].unsqueeze(-1).unsqueeze(-1)
+                    lxs[i] = lxs[i] * masks[i]
+
+                # scaling for rank dropout: treat as if the rank is changed
+                scale = self.scale * (1.0 / (1.0 - self.rank_dropout))  # redundant for readability
+            else:
+                scale = self.scale
+
+            lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)]
+
+            return org_forwarded + torch.cat(lxs, dim=-1) * self.multiplier * scale
+
+
+class LoRAInfModule(LoRAModule):
+    """LoRA module for inference: no dropout, can be switched off, and can be merged into the original weight."""
+
+    def __init__(
+        self,
+        lora_name,
+        org_module: torch.nn.Module,
+        multiplier=1.0,
+        lora_dim=4,
+        alpha=1,
+        **kwargs,
+    ):
+        # no dropout for inference: extra keyword arguments are accepted and ignored (split_dims is not forwarded either)
+        super().__init__(lora_name, org_module, multiplier, lora_dim, alpha)
+
+        self.org_module_ref = [org_module]  # for reference
+        self.enabled = True
+        self.network: LoRANetwork = None
+
+    def set_network(self, network):
+        self.network = network
+
+    def merge_to(self, sd, dtype, device, non_blocking=False):
+        """
+        Merge this module's LoRA weights into the weight of org_module.
+
+        sd holds this module's tensors under module-relative keys: "lora_down.weight" / "lora_up.weight", or
+        "lora_down.{i}.weight" / "lora_up.{i}.weight" when split_dims is set.
+        dtype is the dtype of the merged weight and device is where the merge is computed; None selects the
+        dtype / device of the original weight. The merged weight is loaded back on the original device.
+        non_blocking is forwarded to the Tensor.to() calls that move the operands.
+        Uses self.org_module, which LoRAModule.apply_to() deletes, so merging must happen before apply_to().
+        """
+        # extract weight from org_module
+        org_sd = self.org_module.state_dict()
+        weight = org_sd["weight"]
+        org_dtype = weight.dtype
+        org_device = weight.device
+        if dtype is None:
+            dtype = org_dtype
+        if device is None:
+            device = org_device
+
+        weight = weight.to(device, dtype=torch.float, non_blocking=non_blocking)  # for calculation
+
+        if self.split_dims is None:
+            # get up/down weight
+            down_weight = sd["lora_down.weight"].to(device, dtype=torch.float, non_blocking=non_blocking)
+            up_weight = sd["lora_up.weight"].to(device, dtype=torch.float, non_blocking=non_blocking)
+
+            # merge weight
+            if len(weight.size()) == 2:
+                # linear
+                weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale
+            elif down_weight.size()[2:4] == (1, 1):
+                # conv2d 1x1
+                weight = (
+                    weight
+                    + self.multiplier
+                    * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
+                    * self.scale
+                )
+            else:
+                # conv2d 3x3
+                conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
+                # conved combines the down kernel with the up kernel into one kernel to add to the weight
+                weight = weight + self.multiplier * conved * self.scale
+
+            # set weight to org_module
+            org_sd["weight"] = weight.to(org_device, dtype=dtype)  # back to CPU without non_blocking
+            self.org_module.load_state_dict(org_sd)
+        else:
+            # split_dims
+            total_dims = sum(self.split_dims)
+            for i in range(len(self.split_dims)):
+                # get up/down weight
+                down_weight = sd[f"lora_down.{i}.weight"].to(device, torch.float, non_blocking=non_blocking)  # (rank, in_dim)
+                up_weight = sd[f"lora_up.{i}.weight"].to(device, torch.float, non_blocking=non_blocking)  # (split dim, rank)
+
+                # pad up_weight -> (total_dims, rank); rows outside this split's slice stay zero
+                padded_up_weight = torch.zeros((total_dims, up_weight.size(1)), device=device, dtype=torch.float)
+                padded_up_weight[sum(self.split_dims[:i]) : sum(self.split_dims[: i + 1])] = up_weight
+
+                # merge weight: the padded product has the full (total_dims, in_dim) shape of the original weight
+                weight = weight + self.multiplier * (padded_up_weight @ down_weight) * self.scale
+
+            # set weight to org_module
+            org_sd["weight"] = weight.to(org_device, dtype)  # back to CPU without non_blocking
+            self.org_module.load_state_dict(org_sd)
+
+    def get_weight(self, multiplier=None):
+        """Return the LoRA weight delta for merging; multiplier=None uses self.multiplier (split_dims is not handled)."""
+        if multiplier is None:
+            multiplier = self.multiplier
+
+        # get up/down weight from module
+        up_weight = self.lora_up.weight.to(torch.float)
+        down_weight = self.lora_down.weight.to(torch.float)
+
+        # pre-calculated weight
+        if len(down_weight.size()) == 2:
+            # linear
+            weight = multiplier * (up_weight @ down_weight) * self.scale
+        elif down_weight.size()[2:4] == (1, 1):
+            # conv2d 1x1
+            weight = (
+                multiplier
+                * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
+                * self.scale
+            )
+        else:
+            # conv2d 3x3
+            conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
+            weight = multiplier * conved * self.scale
+
+        return weight
+
+    def default_forward(self, x):
+        """Original forward output plus the scaled LoRA output (no dropout of any kind)."""
+        if self.split_dims is None:
+            lx = self.lora_down(x)
+            lx = self.lora_up(lx)
+            return self.org_forward(x) + lx * self.multiplier * self.scale
+        else:
+            lxs = [lora_down(x) for lora_down in self.lora_down]
+            lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)]
+            return self.org_forward(x) + torch.cat(lxs, dim=-1) * self.multiplier * self.scale
+
+    def forward(self, x):
+        """Run only the original forward when disabled, otherwise default_forward()."""
+        if not self.enabled:
+            return self.org_forward(x)
+        return self.default_forward(x)
+
+
+def create_arch_network(
+    multiplier: float,
+    network_dim: Optional[int],
+    network_alpha: Optional[float],
+    vae: nn.Module,
+    text_encoders: List[nn.Module],
+    unet: nn.Module,
+    neuron_dropout: Optional[float] = None,
+    **kwargs,
+):
+    """Create the LoRA network for HUNYUAN_TARGET_REPLACE_MODULES, adding the default exclude pattern to kwargs."""
+    exclude_patterns = kwargs.get("exclude_patterns", None)
+    if exclude_patterns is None:
+        exclude_patterns = []
+    elif isinstance(exclude_patterns, str):  # a string option holding a Python list literal; lists pass through
+        exclude_patterns = ast.literal_eval(exclude_patterns)
+
+    # exclude if 'img_mod', 'txt_mod' or 'modulation' in the name (new list: the caller's list is not mutated)
+    exclude_patterns = list(exclude_patterns) + [r".*(img_mod|txt_mod|modulation).*"]
+
+    kwargs["exclude_patterns"] = exclude_patterns
+
+    return create_network(
+        HUNYUAN_TARGET_REPLACE_MODULES,
+        "lora_unet",
+        multiplier,
+        network_dim,
+        network_alpha,
+        vae,
+        text_encoders,
+        unet,
+        neuron_dropout=neuron_dropout,
+        **kwargs,
+    )
+
+
+def create_network(
+    target_replace_modules: List[str],
+    prefix: str,
+    multiplier: float,
+    network_dim: Optional[int],
+    network_alpha: Optional[float],
+    vae: nn.Module,
+    text_encoders: List[nn.Module],
+    unet: nn.Module,
+    neuron_dropout: Optional[float] = None,
+    **kwargs,
+):
+    """Architecture independent network creation; kwargs carry optional settings that may arrive as strings (vae is unused)."""
+    if network_dim is None:
+        network_dim = 4  # default
+    if network_alpha is None:
+        network_alpha = 1.0
+
+    # extract dim/alpha for conv2d, and block dim
+    conv_dim = kwargs.get("conv_dim", None)
+    conv_alpha = kwargs.get("conv_alpha", None)
+    if conv_dim is not None:
+        conv_dim = int(conv_dim)
+        if conv_alpha is None:
+            conv_alpha = 1.0
+        else:
+            conv_alpha = float(conv_alpha)
+
+    # TODO generic rank/dim setting with regular expression
+
+    # rank/module dropout
+    rank_dropout = kwargs.get("rank_dropout", None)
+    if rank_dropout is not None:
+        rank_dropout = float(rank_dropout)
+    module_dropout = kwargs.get("module_dropout", None)
+    if module_dropout is not None:
+        module_dropout = float(module_dropout)
+
+    # verbose
+    verbose = kwargs.get("verbose", False)
+    if not isinstance(verbose, bool):  # a real bool is kept as is; anything else is parsed as text ("True"/"true")
+        verbose = str(verbose).lower() == "true"
+
+    # regular expression for module selection: exclude and include
+    exclude_patterns = kwargs.get("exclude_patterns", None)
+    if exclude_patterns is not None and isinstance(exclude_patterns, str):
+        exclude_patterns = ast.literal_eval(exclude_patterns)
+    include_patterns = kwargs.get("include_patterns", None)
+    if include_patterns is not None and isinstance(include_patterns, str):
+        include_patterns = ast.literal_eval(include_patterns)
+
+    # too many arguments ( ^ω^)･･･
+    network = LoRANetwork(
+        target_replace_modules,
+        prefix,
+        text_encoders,
+        unet,
+        multiplier=multiplier,
+        lora_dim=network_dim,
+        alpha=network_alpha,
+        dropout=neuron_dropout,
+        rank_dropout=rank_dropout,
+        module_dropout=module_dropout,
+        conv_lora_dim=conv_dim,
+        conv_alpha=conv_alpha,
+        exclude_patterns=exclude_patterns,
+        include_patterns=include_patterns,
+        verbose=verbose,
+    )
+
+    loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None)
+    # loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None)
+    # loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None)
+    loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None
+    # loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None
+    # loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None
+    if loraplus_lr_ratio is not None:  # or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None:
+        network.set_loraplus_lr_ratio(loraplus_lr_ratio)  # , loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio)
+
+    return network
+
+
+class LoRANetwork(torch.nn.Module):
+    # only supports U-Net (DiT), Text Encoders are not supported
+
+    def __init__(
+        self,
+        target_replace_modules: List[str],
+        prefix: str,
+        text_encoders: Union[List[CLIPTextModel], CLIPTextModel],
+        unet: nn.Module,
+        multiplier: float = 1.0,
+        lora_dim: int = 4,
+        alpha: float = 1,
+        dropout: Optional[float] = None,
+        rank_dropout: Optional[float] = None,
+        module_dropout: Optional[float] = None,
+        conv_lora_dim: Optional[int] = None,
+        conv_alpha: Optional[float] = None,
+        module_class: Type[object] = LoRAModule,  # class instantiated for every selected Linear/Conv2d
+        modules_dim: Optional[Dict[str, int]] = None,  # per-module rank keyed by lora_name; if given, only listed modules get a LoRA
+        modules_alpha: Optional[Dict[str, int]] = None,  # per-module alpha, looked up with the same keys as modules_dim
+        exclude_patterns: Optional[List[str]] = None,
+        include_patterns: Optional[List[str]] = None,
+        verbose: Optional[bool] = False,
+    ) -> None:  # text_encoders is accepted but never read here: LoRA modules are created for `unet` only
+        super().__init__()
+        self.multiplier = multiplier
+
+        self.lora_dim = lora_dim
+        self.alpha = alpha
+        self.conv_lora_dim = conv_lora_dim
+        self.conv_alpha = conv_alpha
+        self.dropout = dropout
+        self.rank_dropout = rank_dropout
+        self.module_dropout = module_dropout
+        self.target_replace_modules = target_replace_modules
+        self.prefix = prefix
+
+        self.loraplus_lr_ratio = None
+        # self.loraplus_unet_lr_ratio = None
+        # self.loraplus_text_encoder_lr_ratio = None
+
+        if modules_dim is not None:
+            logger.info(f"create LoRA network from weights")
+        else:
+            logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}")
+            logger.info(
+                f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}"
+            )
+            # if self.conv_lora_dim is not None:
+            #     logger.info(
+            #         f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}"
+            #     )
+        # if train_t5xxl:
+        #     logger.info(f"train T5XXL as well")
+
+        # compile regular expression if specified (invalid patterns are logged and skipped)
+        exclude_re_patterns = []
+        if exclude_patterns is not None:
+            for pattern in exclude_patterns:
+                try:
+                    re_pattern = re.compile(pattern)
+                except re.error as e:
+                    logger.error(f"Invalid exclude pattern '{pattern}': {e}")
+                    continue
+                exclude_re_patterns.append(re_pattern)
+
+        include_re_patterns = []
+        if include_patterns is not None:
+            for pattern in include_patterns:
+                try:
+                    re_pattern = re.compile(pattern)
+                except re.error as e:
+                    logger.error(f"Invalid include pattern '{pattern}': {e}")
+                    continue
+                include_re_patterns.append(re_pattern)
+
+        # create module instances
+        def create_modules(
+            is_unet: bool,
+            pfx: str,
+            root_module: torch.nn.Module,
+            target_replace_mods: Optional[List[str]] = None,
+            filter: Optional[str] = None,
+            default_dim: Optional[int] = None,
+        ) -> List[LoRAModule]:  # NOTE: the function actually returns the pair (loras, skipped names)
+            loras = []
+            skipped = []
+            for name, module in root_module.named_modules():
+                if target_replace_mods is None or module.__class__.__name__ in target_replace_mods:
+                    if target_replace_mods is None:  # dirty hack for all modules
+                        module = root_module  # search all modules
+
+                    for child_name, child_module in module.named_modules():
+                        is_linear = child_module.__class__.__name__ == "Linear"
+                        is_conv2d = child_module.__class__.__name__ == "Conv2d"
+                        is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1)
+
+                        if is_linear or is_conv2d:
+                            original_name = (name + "." if name else "") + child_name
+                            lora_name = f"{pfx}.{original_name}".replace(".", "_")
+
+                            # exclude/include filter (patterns are matched against the dotted original_name)
+                            excluded = False
+                            for pattern in exclude_re_patterns:
+                                if pattern.match(original_name):
+                                    excluded = True
+                                    break
+                            included = False
+                            for pattern in include_re_patterns:
+                                if pattern.match(original_name):
+                                    included = True
+                                    break
+                            if excluded and not included:  # an include match overrides an exclude match
+                                if verbose:
+                                    logger.info(f"exclude: {original_name}")
+                                continue
+
+                            # filter by name (not used in the current implementation)
+                            if filter is not None and not filter in lora_name:
+                                continue
+
+                            dim = None
+                            alpha = None
+
+                            if modules_dim is not None:
+                                # per-module dim/alpha were specified
+                                if lora_name in modules_dim:
+                                    dim = modules_dim[lora_name]
+                                    alpha = modules_alpha[lora_name]
+                            else:
+                                # normally, every matching module is targeted
+                                if is_linear or is_conv2d_1x1:
+                                    dim = default_dim if default_dim is not None else self.lora_dim
+                                    alpha = self.alpha
+                                elif self.conv_lora_dim is not None:
+                                    dim = self.conv_lora_dim
+                                    alpha = self.conv_alpha
+
+                            if dim is None or dim == 0:
+                                # record the skipped module so it can be reported
+                                if is_linear or is_conv2d_1x1 or (self.conv_lora_dim is not None):
+                                    skipped.append(lora_name)
+                                continue
+
+                            lora = module_class(
+                                lora_name,
+                                child_module,
+                                self.multiplier,
+                                dim,
+                                alpha,
+                                dropout=dropout,
+                                rank_dropout=rank_dropout,
+                                module_dropout=module_dropout,
+                            )
+                            loras.append(lora)
+
+                if target_replace_mods is None:
+                    break  # all modules are searched
+            return loras, skipped
+
+        # # create LoRA for text encoder
+        # # it is redundant to create LoRA modules even if they are not used
+
+        self.text_encoder_loras: List[Union[LoRAModule, LoRAInfModule]] = []
+        # skipped_te = []
+        # for i, text_encoder in enumerate(text_encoders):
+        #     index = i
+        #     if not train_t5xxl and index > 0:  # 0: CLIP, 1: T5XXL, so we skip T5XXL if train_t5xxl is False
+        #         break
+        #     logger.info(f"create LoRA for Text Encoder {index+1}:")
+        #     text_encoder_loras, skipped = create_modules(False, index, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE)
+        #     logger.info(f"create LoRA for Text Encoder {index+1}: {len(text_encoder_loras)} modules.")
+        #     self.text_encoder_loras.extend(text_encoder_loras)
+        #     skipped_te += skipped
+
+        # create LoRA for U-Net
+        self.unet_loras: List[Union[LoRAModule, LoRAInfModule]]
+        self.unet_loras, skipped_un = create_modules(True, prefix, unet, target_replace_modules)
+
+        logger.info(f"create LoRA for U-Net/DiT: {len(self.unet_loras)} modules.")
+        if verbose:
+            for lora in self.unet_loras:
+                logger.info(f"\t{lora.lora_name:50} {lora.lora_dim}, {lora.alpha}")
+
+        skipped = skipped_un
+        if verbose and len(skipped) > 0:
+            logger.warning(
+                f"because dim (rank) is 0, {len(skipped)} LoRA modules are skipped / dim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:"
+            )
+            for name in skipped:
+                logger.info(f"\t{name}")
+
+        # assertion: every LoRA module must have a unique lora_name
+        names = set()
+        for lora in self.text_encoder_loras + self.unet_loras:
+            assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}"
+            names.add(lora.lora_name)
+
+    def prepare_network(self, args):
+        """
+        Hook called after the network is created; this implementation ignores `args` and does nothing.
+        """
+        return None
+
+    def set_multiplier(self, multiplier):
+        self.multiplier = multiplier  # remember the network-wide value, then push it to every LoRA module
+        for module in [*self.text_encoder_loras, *self.unet_loras]:
+            module.multiplier = multiplier
+
+    def set_enabled(self, is_enabled):
+        for module in [*self.text_encoder_loras, *self.unet_loras]:  # set the `enabled` flag on every LoRA module
+            module.enabled = is_enabled
+
+    def load_weights(self, file):
+        """Load LoRA weights from `file` with strict=False and return load_state_dict()'s result."""
+        _, ext = os.path.splitext(file)
+        if ext == ".safetensors":
+            from safetensors.torch import load_file
+
+            weights_sd = load_file(file)
+        else:  # NOTE(review): torch.load unpickles the file - only use it with trusted checkpoints
+            weights_sd = torch.load(file, map_location="cpu")
+        return self.load_state_dict(weights_sd, False)
+
+    def apply_to(
+        self,
+        text_encoders: Optional[nn.Module],
+        unet: Optional[nn.Module],
+        apply_text_encoder: bool = True,
+        apply_unet: bool = True,
+    ):
+        """Apply the selected LoRA groups and register their modules here; a group not applied is emptied."""
+        if not apply_text_encoder:
+            self.text_encoder_loras = []
+        else:
+            logger.info(f"enable LoRA for text encoder: {len(self.text_encoder_loras)} modules")
+        if not apply_unet:
+            self.unet_loras = []
+        else:
+            logger.info(f"enable LoRA for U-Net: {len(self.unet_loras)} modules")
+
+        for lora_module in [*self.text_encoder_loras, *self.unet_loras]:
+            lora_module.apply_to()
+            self.add_module(lora_module.lora_name, lora_module)
+
+    # Returns whether this network can be merged (always True here)
+    def is_mergeable(self):
+        return True
+
+    # TODO refactor to common function with apply_to
+    def merge_to(self, text_encoders, unet, weights_sd, dtype=None, device=None, non_blocking=False):
+        """Merge the LoRA weights in `weights_sd` into the original modules (text_encoders / unet are not read)."""
+        from concurrent.futures import ThreadPoolExecutor
+
+        with ThreadPoolExecutor(max_workers=2) as executor:  # 2 workers is enough
+            futures = []
+            for lora in self.text_encoder_loras + self.unet_loras:
+                # match "<lora_name>." so a name that is a prefix of another name cannot pick up its keys
+                key_prefix = lora.lora_name + "."
+                sd_for_lora = {k[len(key_prefix) :]: v for k, v in weights_sd.items() if k.startswith(key_prefix)}
+                if len(sd_for_lora) == 0:
+                    logger.info(f"no weight for {lora.lora_name}")
+                    continue
+
+                futures.append(executor.submit(lora.merge_to, sd_for_lora, dtype, device, non_blocking))
+
+        # result() re-raises any exception raised inside a worker
+        for future in futures:
+            future.result()
+
+        logger.info(f"weights are merged")
+
+    def set_loraplus_lr_ratio(self, loraplus_lr_ratio):  # , loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio):
+        self.loraplus_lr_ratio = loraplus_lr_ratio  # read by prepare_optimizer_params to scale the LR of lora_up parameters
+
+        logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_lr_ratio}")
+        # logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}")
+
+    def prepare_optimizer_params(self, unet_lr: float = 1e-4, **kwargs):
+        """Build optimizer param groups for the U-Net LoRAs; returns (param_groups, descriptions)."""
+        self.requires_grad_(True)
+
+        def assemble_params(loras, lr, loraplus_ratio):
+            # "plus" receives the lora_up parameters when a LoRA+ ratio is set; everything else is "plain"
+            plain, plus = {}, {}
+            for lora in loras:
+                for name, param in lora.named_parameters():
+                    use_plus = loraplus_ratio is not None and "lora_up" in name
+                    bucket = plus if use_plus else plain
+                    bucket[f"{lora.lora_name}.{name}"] = param
+
+            groups = []
+            labels = []
+            for label, bucket, is_plus in (("", plain, False), ("plus", plus, True)):
+                if len(bucket) == 0:
+                    continue
+
+                group = {"params": bucket.values()}
+                if lr is not None:
+                    group["lr"] = lr * loraplus_ratio if is_plus else lr
+
+                # a group without a usable learning rate (missing or zero) is left out
+                group_lr = group.get("lr", None)
+                if group_lr == 0 or group_lr is None:
+                    logger.info("NO LR skipping!")
+                    continue
+
+                groups.append(group)
+                labels.append(label)
+
+            return groups, labels
+
+        # only self.unet_loras are collected; text-encoder LoRAs are not handled here
+        all_params = []
+        lr_descriptions = []
+        if self.unet_loras:
+            unet_groups, unet_labels = assemble_params(self.unet_loras, unet_lr, self.loraplus_lr_ratio)
+            all_params += unet_groups
+            for label in unet_labels:
+                lr_descriptions.append("unet " + label if label else "unet")
+
+        return all_params, lr_descriptions
+
+    def enable_gradient_checkpointing(self):
+        # not supported: intentionally a no-op
+        pass
+
+    def prepare_grad_etc(self, unet):
+        self.requires_grad_(True)  # enable gradients for this network's parameters; `unet` is not used
+
+    def on_epoch_start(self, unet):
+        self.train()  # put the network into training mode; `unet` is not used
+
+    def on_step_start(self):
+        pass  # per-step hook: nothing to do in this implementation
+
+    def get_trainable_params(self):
+        return self.parameters()  # every parameter registered on this network module
+
+    def save_weights(self, file, dtype, metadata):
+        """Write the state dict to `file` (.safetensors with hash metadata, otherwise torch.save)."""
+        # an empty metadata mapping is treated exactly like "no metadata"
+        if metadata is not None and len(metadata) == 0:
+            metadata = None
+
+        state_dict = self.state_dict()
+        if dtype is not None:
+            # store detached, cloned CPU copies converted to the requested dtype
+            for name, tensor in list(state_dict.items()):
+                state_dict[name] = tensor.detach().clone().to("cpu").to(dtype)
+
+        _, ext = os.path.splitext(file)
+        if ext != ".safetensors":
+            torch.save(state_dict, file)
+            return
+
+        from safetensors.torch import save_file
+        from utils import model_utils
+
+        # Precalculate model hashes to save time on indexing
+        hashed_metadata = {} if metadata is None else metadata
+        model_hash, legacy_hash = model_utils.precalculate_safetensors_hashes(state_dict, hashed_metadata)
+        hashed_metadata["sshs_model_hash"] = model_hash
+        hashed_metadata["sshs_legacy_hash"] = legacy_hash
+        save_file(state_dict, file, hashed_metadata)
+
+    def backup_weights(self):
+        # Back up the weight of every wrapped original module, once per module
+        for lora in [*self.text_encoder_loras, *self.unet_loras]:
+            target = lora.org_module_ref[0]
+            if hasattr(target, "_lora_org_weight"):
+                continue  # already backed up by an earlier call
+            pristine = target.state_dict()["weight"]
+            target._lora_org_weight = pristine.detach().clone()
+            target._lora_restored = True
+
+    def restore_weights(self):
+        # Restore the weight saved in _lora_org_weight for every module currently flagged as not restored
+        loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
+        for lora in loras:
+            org_module = lora.org_module_ref[0]
+            if not getattr(org_module, "_lora_restored", True):  # flag missing: never merged, nothing to undo
+                sd = org_module.state_dict()
+                sd["weight"] = org_module._lora_org_weight
+                org_module.load_state_dict(sd)
+                org_module._lora_restored = True
+
+    def pre_calculation(self):
+        # Pre-merge: add each LoRA's weight into its original module's "weight", then set lora.enabled = False
+        loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
+        for lora in loras:
+            org_module = lora.org_module_ref[0]
+            sd = org_module.state_dict()
+
+            org_weight = sd["weight"]
+            lora_weight = lora.get_weight().to(org_weight.device, dtype=org_weight.dtype)
+            sd["weight"] = org_weight + lora_weight
+            assert sd["weight"].shape == org_weight.shape
+            org_module.load_state_dict(sd)
+
+            org_module._lora_restored = False  # tells restore_weights that this module needs restoring
+            lora.enabled = False
+
+    def apply_max_norm_regularization(self, max_norm_value, device):
+        """Clamp the norm of each LoRA delta (up x down, scaled by alpha / dim) to max_norm_value.
+
+        Returns (keys_scaled, mean_norm, max_norm); both norms are 0.0 when no lora_down key exists."""
+        downkeys, upkeys, alphakeys, norms = [], [], [], []
+        keys_scaled = 0
+
+        state_dict = self.state_dict()
+        for key in state_dict.keys():
+            if "lora_down" in key and "weight" in key:
+                downkeys.append(key)
+                upkeys.append(key.replace("lora_down", "lora_up"))
+                alphakeys.append(key.replace("lora_down.weight", "alpha"))
+
+        for i in range(len(downkeys)):
+            down = state_dict[downkeys[i]].to(device)
+            up = state_dict[upkeys[i]].to(device)
+            alpha = state_dict[alphakeys[i]].to(device)
+            dim = down.shape[0]
+            scale = alpha / dim
+
+            if up.shape[2:] == (1, 1) and down.shape[2:] == (1, 1):
+                updown = (up.squeeze(2).squeeze(2) @ down.squeeze(2).squeeze(2)).unsqueeze(2).unsqueeze(3)
+            elif up.shape[2:] == (3, 3) or down.shape[2:] == (3, 3):
+                updown = torch.nn.functional.conv2d(down.permute(1, 0, 2, 3), up).permute(1, 0, 2, 3)
+            else:
+                updown = up @ down
+
+            updown *= scale
+
+            norm = updown.norm().clamp(min=max_norm_value / 2)
+            desired = torch.clamp(norm, max=max_norm_value)
+            ratio = desired.cpu() / norm.cpu()
+            sqrt_ratio = ratio**0.5  # split the correction evenly between the up and down tensors
+            if ratio != 1:
+                keys_scaled += 1
+                state_dict[upkeys[i]] *= sqrt_ratio
+                state_dict[downkeys[i]] *= sqrt_ratio
+            scalednorm = updown.norm() * ratio
+            norms.append(scalednorm.item())
+
+        return keys_scaled, (sum(norms) / len(norms)) if norms else 0.0, max(norms, default=0.0)
+
+
+def create_arch_network_from_weights(
+    multiplier: float,
+    weights_sd: Dict[str, torch.Tensor],
+    text_encoders: Optional[List[nn.Module]] = None,
+    unet: Optional[nn.Module] = None,
+    for_inference: bool = False,
+    **kwargs,
+) -> LoRANetwork:
+    return create_network_from_weights(
+        HUNYUAN_TARGET_REPLACE_MODULES, multiplier, weights_sd, text_encoders, unet, for_inference, **kwargs
+    )
+
+
+# Create network from weights for inference, weights are not loaded here (because can be merged)
+def create_network_from_weights(
+    target_replace_modules: List[str],
+    multiplier: float,
+    weights_sd: Dict[str, torch.Tensor],
+    text_encoders: Optional[List[nn.Module]] = None,
+    unet: Optional[nn.Module] = None,
+    for_inference: bool = False,
+    **kwargs,
+) -> LoRANetwork:
+    # Collect per-module dim and alpha from the state dict; the tensor values are not loaded here
+    modules_dim = {}
+    modules_alpha = {}
+    for key, value in weights_sd.items():
+        if "." not in key:  # skip keys that have no "<lora_name>." prefix
+            continue
+
+        lora_name = key.split(".")[0]  # everything before the first dot
+        if "alpha" in key:
+            modules_alpha[lora_name] = value
+        elif "lora_down" in key:
+            dim = value.shape[0]  # first dimension of the lora_down tensor is used as the dim
+            modules_dim[lora_name] = dim
+            # logger.info(lora_name, value.size(), dim)
+
+    module_class = LoRAInfModule if for_inference else LoRAModule
+
+    network = LoRANetwork(
+        target_replace_modules,
+        "lora_unet",
+        text_encoders,
+        unet,
+        multiplier=multiplier,
+        modules_dim=modules_dim,
+        modules_alpha=modules_alpha,
+        module_class=module_class,
+    )
+    return network  # NOTE: **kwargs is accepted but not forwarded to LoRANetwork
diff --git a/networks/lora_wan.py b/networks/lora_wan.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9b171a741d317a551f17d1f45046e7eed6b161e
--- /dev/null
+++ b/networks/lora_wan.py
@@ -0,0 +1,65 @@
+# LoRA module for Wan2.1
+
+import ast
+from typing import Dict, List, Optional
+import torch
+import torch.nn as nn
+
+import logging
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+import networks.lora as lora
+
+
+WAN_TARGET_REPLACE_MODULES = ["WanAttentionBlock"]
+
+
+def create_arch_network(
+    multiplier: float,
+    network_dim: Optional[int],
+    network_alpha: Optional[float],
+    vae: nn.Module,
+    text_encoders: List[nn.Module],
+    unet: nn.Module,
+    neuron_dropout: Optional[float] = None,
+    **kwargs,
+):
+    # start from the caller's exclude patterns (if any), then add the Wan default below
+    exclude_patterns = kwargs.get("exclude_patterns", None)
+    if exclude_patterns is None:
+        exclude_patterns = []
+    else:
+        exclude_patterns = ast.literal_eval(exclude_patterns)  # given as a string holding a Python list literal
+
+    # always add a pattern matching patch/text/time embedding, time_projection, norm and head names
+    exclude_patterns.append(r".*(patch_embedding|text_embedding|time_embedding|time_projection|norm|head).*")
+
+    kwargs["exclude_patterns"] = exclude_patterns
+
+    return lora.create_network(
+        WAN_TARGET_REPLACE_MODULES,
+        "lora_unet",
+        multiplier,
+        network_dim,
+        network_alpha,
+        vae,
+        text_encoders,
+        unet,
+        neuron_dropout=neuron_dropout,
+        **kwargs,
+    )
+
+
+def create_arch_network_from_weights(
+    multiplier: float,
+    weights_sd: Dict[str, torch.Tensor],
+    text_encoders: Optional[List[nn.Module]] = None,
+    unet: Optional[nn.Module] = None,
+    for_inference: bool = False,
+    **kwargs,
+) -> lora.LoRANetwork:
+    return lora.create_network_from_weights(
+        WAN_TARGET_REPLACE_MODULES, multiplier, weights_sd, text_encoders, unet, for_inference, **kwargs
+    )
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..66c812e35d41eaf2438419b38abfc3dc9824b8fe
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,40 @@
+[project]
+name = "musubi-tuner"
+version = "0.1.0"
+description = "Musubi Tuner by kohya_ss"
+readme = "README.md"
+requires-python = ">=3.10, <3.11"
+dependencies = [
+    "accelerate>=1.0.0",
+    "ascii-magic==2.3.0",
+    "av==14.0.1",
+    "bitsandbytes>=0.45.0",
+    "diffusers>=0.32.1",
+    "einops>=0.7.0",
+    "huggingface-hub>=0.26.5",
+    "matplotlib>=3.10.0",
+    "opencv-python>=4.10.0.84",
+    "pillow>=10.2.0",
+    "safetensors>=0.4.5",
+    "sageattention>=1.0.6",
+    "tensorboard>=2.18.0",
+    "toml>=0.10.2",
+    "torch>=2.5.1",
+    "torchvision>=0.20.1",
+    "tqdm>=4.66.5",
+    "transformers>=4.46.3",
+    "voluptuous>=0.15.2",
+]
+
+[tool.uv.sources]
+torch = [
+  { index = "pytorch-cu124" },
+]
+torchvision = [
+  { index = "pytorch-cu124" },
+]
+
+[[tool.uv.index]]
+name = "pytorch-cu124"
+url = "https://download.pytorch.org/whl/cu124"
+explicit = true
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..22d39e2f06d373e07adfa95ea186fa17ced50565
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,24 @@
+torch
+torchvision
+accelerate==1.2.1
+av==14.0.1
+bitsandbytes==0.45.0
+diffusers==0.32.1
+einops==0.7.0
+huggingface-hub==0.26.5
+opencv-python==4.10.0.84
+pillow==10.2.0
+safetensors==0.4.5
+toml==0.10.2
+tqdm==4.67.1
+transformers==4.46.3
+voluptuous==0.15.2
+
+# Wan2.1
+ftfy==6.3.1
+easydict==1.13
+
+# optional dependencies
+# ascii-magic==2.3.0
+# matplotlib==3.10.0
+# tensorboard
\ No newline at end of file
diff --git a/testsettings.json b/testsettings.json
new file mode 100644
index 0000000000000000000000000000000000000000..832fffa5b2f85c6a88d94e4b06a222f0e25233a6
--- /dev/null
+++ b/testsettings.json
@@ -0,0 +1,44 @@
+{
+    "DATASET_CONFIG": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner-wan-gui/dataset/testtoml.toml",
+    "VAE_MODEL": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner/Wan2.1-I2V-14B-720P/Wan2.1_VAE.pth",
+    "CLIP_MODEL": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner/Wan2.1-I2V-14B-720P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth",
+    "T5_MODEL": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner/Wan2.1-I2V-14B-720P/models_t5_umt5-xxl-enc-bf16.pth",
+    "DIT_MODEL": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner/Wan2.1-I2V-14B-720P/wan2.1_i2v_720p_14B_fp8_e4m3fn.safetensors",
+    "LORA_OUTPUT_DIR": "C:/AI/WAN Tuner/QWEN-tuner/musubi-tuner-wan-gui/Output_LoRAs",
+    "LORA_NAME": "My_Test_Lora_v1",
+    "MODEL_TYPE": "i2v-14B",
+    "FLOW_SHIFT": "3.0",
+    "LEARNING_RATE": "2e-5",
+    "LORA_LR_RATIO": "4",
+    "NETWORK_DIM": "32",
+    "NETWORK_ALPHA": "4",
+    "MAX_TRAIN_EPOCHS": "70",
+    "SAVE_EVERY_N_EPOCHS": "10",
+    "SEED": "1234",
+    "BLOCKS_SWAP": "16",
+    "RESUME_TRAINING": "",
+    "OPTIMIZER_TYPE": "adamw8bit",
+    "OPTIMIZER_ARGS": "",
+    "ATTENTION_MECHANISM": "none",
+    "LOGGING_DIR": "",
+    "LOG_WITH": "none",
+    "LOG_PREFIX": "",
+    "IMG_IN_TXT_IN_OFFLOADING": false,
+    "LR_SCHEDULER": "constant",
+    "LR_WARMUP_STEPS": "",
+    "LR_DECAY_STEPS": "",
+    "TIMESTEP_SAMPLING": "shift",
+    "DISCRETE_FLOW_SHIFT": "3.0",
+    "WEIGHTING_SCHEME": "none",
+    "METADATA_TITLE": "",
+    "METADATA_AUTHOR": "",
+    "METADATA_DESCRIPTION": "",
+    "METADATA_LICENSE": "",
+    "METADATA_TAGS": "",
+    "INPUT_LORA": "",
+    "OUTPUT_DIR": "",
+    "CONVERTED_LORA_NAME": "",
+    "FP8": true,
+    "SCALED": false,
+    "ENABLE_CACHE": true
+}
\ No newline at end of file
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/utils/device_utils.py b/utils/device_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b14803e499d7b92acebf8d8bddc3426d178695c4
--- /dev/null
+++ b/utils/device_utils.py
@@ -0,0 +1,19 @@
+import torch
+
+
+def clean_memory_on_device(device):  # release cached allocator memory on the given torch.device
+    if device.type == "cuda":
+        torch.cuda.empty_cache()
+    elif device.type == "xpu":  # same backend synchronize_device handles; CPU and others need no action
+        torch.xpu.empty_cache()
+    elif device.type == "mps":  # not tested
+        torch.mps.empty_cache()
+
+
+def synchronize_device(device: torch.device):
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+    elif device.type == "xpu":
+        torch.xpu.synchronize()
+    elif device.type == "mps":
+        torch.mps.synchronize()
diff --git a/utils/huggingface_utils.py b/utils/huggingface_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dc7bd7dbb2ef70e0b6244b9db686aae00f46408
--- /dev/null
+++ b/utils/huggingface_utils.py
@@ -0,0 +1,89 @@
+import threading
+from typing import Union, BinaryIO
+from huggingface_hub import HfApi
+from pathlib import Path
+import argparse
+import os
+import logging
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def fire_in_thread(f, *args, **kwargs):
+    threading.Thread(target=f, args=args, kwargs=kwargs).start()
+
+
+def exists_repo(repo_id: str, repo_type: str, revision: str = "main", token: str = None):
+    """Return True if repo_info succeeds for the given repo and revision, False on any error."""
+    api = HfApi(token=token)
+
+    try:
+        api.repo_info(repo_id=repo_id, revision=revision, repo_type=repo_type)
+        return True
+    except Exception:  # not a bare except: KeyboardInterrupt / SystemExit must still propagate
+        return False
+
+
+def upload(
+    args: argparse.Namespace,
+    src: Union[str, Path, bytes, BinaryIO],
+    dest_suffix: str = "",
+    force_sync_upload: bool = False,
+):
+    repo_id = args.huggingface_repo_id
+    repo_type = args.huggingface_repo_type
+    token = args.huggingface_token
+    path_in_repo = args.huggingface_path_in_repo + dest_suffix if args.huggingface_path_in_repo is not None else None
+    private = args.huggingface_repo_visibility is None or args.huggingface_repo_visibility != "public"
+    api = HfApi(token=token)
+    if not exists_repo(repo_id=repo_id, repo_type=repo_type, token=token):
+        try:
+            api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private)
+        except Exception as e:  # best effort: the failure is logged and the upload is still attempted
+            logger.error("===========================================")
+            logger.error(f"failed to create HuggingFace repo / HuggingFaceのリポジトリの作成に失敗しました : {e}")
+            logger.error("===========================================")
+
+    is_folder = (isinstance(src, str) and os.path.isdir(src)) or (isinstance(src, Path) and src.is_dir())
+
+    def uploader():
+        try:
+            if is_folder:
+                api.upload_folder(
+                    repo_id=repo_id,
+                    repo_type=repo_type,
+                    folder_path=src,
+                    path_in_repo=path_in_repo,
+                )
+            else:
+                api.upload_file(
+                    repo_id=repo_id,
+                    repo_type=repo_type,
+                    path_or_fileobj=src,
+                    path_in_repo=path_in_repo,
+                )
+        except Exception as e:  # upload errors are logged, never raised to the caller
+            logger.error("===========================================")
+            logger.error(f"failed to upload to HuggingFace / HuggingFaceへのアップロードに失敗しました : {e}")
+            logger.error("===========================================")
+
+    if args.async_upload and not force_sync_upload:
+        fire_in_thread(uploader)
+    else:
+        uploader()
+
+
+def list_dir(
+    repo_id: str,
+    subfolder: str,
+    repo_type: str,
+    revision: str = "main",
+    token: str = None,
+):
+    api = HfApi(
+        token=token,
+    )
+    repo_info = api.repo_info(repo_id=repo_id, revision=revision, repo_type=repo_type)
+    file_list = [file for file in repo_info.siblings if file.rfilename.startswith(subfolder)]
+    return file_list
diff --git a/utils/model_utils.py b/utils/model_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5beed8ec4e09f433ba2e84556a6c8f342a2903f5
--- /dev/null
+++ b/utils/model_utils.py
@@ -0,0 +1,151 @@
+import hashlib
+from io import BytesIO
+from typing import Optional
+
+import safetensors.torch
+import torch
+
+
+def model_hash(filename):
+    """Old model hash used by stable-diffusion-webui"""
+    try:
+        with open(filename, "rb") as file:
+            m = hashlib.sha256()
+
+            file.seek(0x100000)
+            m.update(file.read(0x10000))
+            return m.hexdigest()[0:8]
+    except FileNotFoundError:
+        return "NOFILE"
+    except IsADirectoryError:  # Linux?
+        return "IsADirectory"
+    except PermissionError:  # Windows
+        return "IsADirectory"
+
+
+def calculate_sha256(filename):
+    """New model hash used by stable-diffusion-webui"""
+    try:
+        hash_sha256 = hashlib.sha256()
+        blksize = 1024 * 1024
+
+        with open(filename, "rb") as f:
+            for chunk in iter(lambda: f.read(blksize), b""):
+                hash_sha256.update(chunk)
+
+        return hash_sha256.hexdigest()
+    except FileNotFoundError:
+        return "NOFILE"
+    except IsADirectoryError:  # Linux?
+        return "IsADirectory"
+    except PermissionError:  # Windows
+        return "IsADirectory"
+
+
+def addnet_hash_legacy(b):
+    """Old model hash used by sd-webui-additional-networks for .safetensors format files"""
+    m = hashlib.sha256()
+
+    b.seek(0x100000)
+    m.update(b.read(0x10000))
+    return m.hexdigest()[0:8]
+
+
+def addnet_hash_safetensors(b):
+    """New model hash used by sd-webui-additional-networks for .safetensors format files"""
+    hash_sha256 = hashlib.sha256()
+    blksize = 1024 * 1024
+
+    b.seek(0)
+    header = b.read(8)
+    n = int.from_bytes(header, "little")
+
+    offset = n + 8
+    b.seek(offset)
+    for chunk in iter(lambda: b.read(blksize), b""):
+        hash_sha256.update(chunk)
+
+    return hash_sha256.hexdigest()
+
+
+def precalculate_safetensors_hashes(tensors, metadata):
+    """Precalculate the model hashes needed by sd-webui-additional-networks to
+    save time on indexing the model later."""
+
+    # Because writing user metadata to the file can change the result of
+    # sd_models.model_hash(), only retain the training metadata for purposes of
+    # calculating the hash, as they are meant to be immutable
+    metadata = {k: v for k, v in metadata.items() if k.startswith("ss_")}
+
+    serialized = safetensors.torch.save(tensors, metadata)  # renamed so the builtin `bytes` is not shadowed
+    b = BytesIO(serialized)
+
+    model_hash = addnet_hash_safetensors(b)
+    legacy_hash = addnet_hash_legacy(b)
+    return model_hash, legacy_hash
+
+
+def dtype_to_str(dtype: torch.dtype) -> str:
+    # get name of the dtype
+    dtype_name = str(dtype).split(".")[-1]
+    return dtype_name
+
+
+def str_to_dtype(s: Optional[str], default_dtype: Optional[torch.dtype] = None) -> torch.dtype:
+    """
+    Convert a string to a torch.dtype
+
+    Args:
+        s: string representation of the dtype
+        default_dtype: default dtype to return if s is None
+
+    Returns:
+        torch.dtype: the corresponding torch.dtype
+
+    Raises:
+        ValueError: if the dtype is not supported
+
+    Examples:
+        >>> str_to_dtype("float32")
+        torch.float32
+        >>> str_to_dtype("fp32")
+        torch.float32
+        >>> str_to_dtype("float16")
+        torch.float16
+        >>> str_to_dtype("fp16")
+        torch.float16
+        >>> str_to_dtype("bfloat16")
+        torch.bfloat16
+        >>> str_to_dtype("bf16")
+        torch.bfloat16
+        >>> str_to_dtype("fp8")
+        torch.float8_e4m3fn
+        >>> str_to_dtype("fp8_e4m3fn")
+        torch.float8_e4m3fn
+        >>> str_to_dtype("fp8_e4m3fnuz")
+        torch.float8_e4m3fnuz
+        >>> str_to_dtype("fp8_e5m2")
+        torch.float8_e5m2
+        >>> str_to_dtype("fp8_e5m2fnuz")
+        torch.float8_e5m2fnuz
+    """
+    if s is None:
+        return default_dtype
+    if s in ["bf16", "bfloat16"]:
+        return torch.bfloat16
+    elif s in ["fp16", "float16"]:
+        return torch.float16
+    elif s in ["fp32", "float32", "float"]:
+        return torch.float32
+    elif s in ["fp8_e4m3fn", "e4m3fn", "float8_e4m3fn"]:
+        return torch.float8_e4m3fn
+    elif s in ["fp8_e4m3fnuz", "e4m3fnuz", "float8_e4m3fnuz"]:
+        return torch.float8_e4m3fnuz
+    elif s in ["fp8_e5m2", "e5m2", "float8_e5m2"]:
+        return torch.float8_e5m2
+    elif s in ["fp8_e5m2fnuz", "e5m2fnuz", "float8_e5m2fnuz"]:
+        return torch.float8_e5m2fnuz
+    elif s in ["fp8", "float8"]:
+        return torch.float8_e4m3fn  # default fp8
+    else:
+        raise ValueError(f"Unsupported dtype: {s}")
diff --git a/utils/safetensors_utils.py b/utils/safetensors_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b746bc0b7924513616d5dc0c6f9a8ad3f8c37bf9
--- /dev/null
+++ b/utils/safetensors_utils.py
@@ -0,0 +1,191 @@
+import torch
+import json
+import struct
+from typing import Dict, Any, Union, Optional
+
+from safetensors.torch import load_file
+
+
+def mem_eff_save_file(tensors: Dict[str, torch.Tensor], filename: str, metadata: Dict[str, Any] = None):
+    """
+    memory efficient save file
+    """
+
+    _TYPES = {
+        torch.float64: "F64",
+        torch.float32: "F32",
+        torch.float16: "F16",
+        torch.bfloat16: "BF16",
+        torch.int64: "I64",
+        torch.int32: "I32",
+        torch.int16: "I16",
+        torch.int8: "I8",
+        torch.uint8: "U8",
+        torch.bool: "BOOL",
+        getattr(torch, "float8_e5m2", None): "F8_E5M2",
+        getattr(torch, "float8_e4m3fn", None): "F8_E4M3",
+    }
+    _ALIGN = 256
+
+    def validate_metadata(metadata: Dict[str, Any]) -> Dict[str, str]:
+        validated = {}
+        for key, value in metadata.items():
+            if not isinstance(key, str):
+                raise ValueError(f"Metadata key must be a string, got {type(key)}")
+            if not isinstance(value, str):
+                print(f"Warning: Metadata value for key '{key}' is not a string. Converting to string.")
+                validated[key] = str(value)
+            else:
+                validated[key] = value
+        return validated
+
+    # print(f"Using memory efficient save file: {filename}")
+
+    header = {}
+    offset = 0
+    if metadata:
+        header["__metadata__"] = validate_metadata(metadata)
+    for k, v in tensors.items():
+        if v.numel() == 0:  # empty tensor
+            header[k] = {"dtype": _TYPES[v.dtype], "shape": list(v.shape), "data_offsets": [offset, offset]}
+        else:
+            size = v.numel() * v.element_size()
+            header[k] = {"dtype": _TYPES[v.dtype], "shape": list(v.shape), "data_offsets": [offset, offset + size]}
+            offset += size
+
+    hjson = json.dumps(header).encode("utf-8")
+    hjson += b" " * (-(len(hjson) + 8) % _ALIGN)
+
+    with open(filename, "wb") as f:
+        f.write(struct.pack("<Q", len(hjson)))
+        f.write(hjson)
+
+        for k, v in tensors.items():
+            if v.numel() == 0:
+                continue
+            if v.is_cuda:
+                # Direct GPU to disk save
+                with torch.cuda.device(v.device):
+                    if v.dim() == 0:  # if scalar, need to add a dimension to work with view
+                        v = v.unsqueeze(0)
+                    tensor_bytes = v.contiguous().view(torch.uint8)
+                    tensor_bytes.cpu().numpy().tofile(f)
+            else:
+                # CPU tensor save
+                if v.dim() == 0:  # if scalar, need to add a dimension to work with view
+                    v = v.unsqueeze(0)
+                v.contiguous().view(torch.uint8).numpy().tofile(f)
+
+
+class MemoryEfficientSafeOpen:
+    # Minimal safetensors reader: tensors are read one at a time; metadata() returns the header's __metadata__ entry
+    def __init__(self, filename):
+        self.filename = filename
+        self.file = open(filename, "rb")
+        self.header, self.header_size = self._read_header()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def keys(self):
+        return [k for k in self.header.keys() if k != "__metadata__"]
+
+    def metadata(self) -> Dict[str, str]:
+        return self.header.get("__metadata__", {})
+
+    def get_tensor(self, key):
+        if key not in self.header:
+            raise KeyError(f"Tensor '{key}' not found in the file")
+
+        metadata = self.header[key]
+        offset_start, offset_end = metadata["data_offsets"]
+
+        if offset_start == offset_end:
+            tensor_bytes = None
+        else:
+            # adjust offset by header size
+            self.file.seek(self.header_size + 8 + offset_start)
+            tensor_bytes = self.file.read(offset_end - offset_start)
+
+        return self._deserialize_tensor(tensor_bytes, metadata)
+
+    def _read_header(self):
+        header_size = struct.unpack("<Q", self.file.read(8))[0]
+        header_json = self.file.read(header_size).decode("utf-8")
+        return json.loads(header_json), header_size
+
+    def _deserialize_tensor(self, tensor_bytes, metadata):
+        dtype = self._get_torch_dtype(metadata["dtype"])
+        shape = metadata["shape"]
+
+        if tensor_bytes is None:
+            byte_tensor = torch.empty(0, dtype=torch.uint8)
+        else:
+            tensor_bytes = bytearray(tensor_bytes)  # make it writable
+            byte_tensor = torch.frombuffer(tensor_bytes, dtype=torch.uint8)
+
+        # process float8 types
+        if metadata["dtype"] in ["F8_E5M2", "F8_E4M3"]:
+            return self._convert_float8(byte_tensor, metadata["dtype"], shape)
+
+        # convert to the target dtype and reshape
+        return byte_tensor.view(dtype).reshape(shape)
+
+    @staticmethod
+    def _get_torch_dtype(dtype_str):
+        dtype_map = {
+            "F64": torch.float64,
+            "F32": torch.float32,
+            "F16": torch.float16,
+            "BF16": torch.bfloat16,
+            "I64": torch.int64,
+            "I32": torch.int32,
+            "I16": torch.int16,
+            "I8": torch.int8,
+            "U8": torch.uint8,
+            "BOOL": torch.bool,
+        }
+        # add float8 types if available
+        if hasattr(torch, "float8_e5m2"):
+            dtype_map["F8_E5M2"] = torch.float8_e5m2
+        if hasattr(torch, "float8_e4m3fn"):
+            dtype_map["F8_E4M3"] = torch.float8_e4m3fn
+        return dtype_map.get(dtype_str)
+
+    @staticmethod
+    def _convert_float8(byte_tensor, dtype_str, shape):
+        if dtype_str == "F8_E5M2" and hasattr(torch, "float8_e5m2"):
+            return byte_tensor.view(torch.float8_e5m2).reshape(shape)
+        elif dtype_str == "F8_E4M3" and hasattr(torch, "float8_e4m3fn"):
+            return byte_tensor.view(torch.float8_e4m3fn).reshape(shape)
+        else:
+            # # convert to float16 if float8 is not supported
+            # print(f"Warning: {dtype_str} is not supported in this PyTorch version. Converting to float16.")
+            # return byte_tensor.view(torch.uint8).to(torch.float16).reshape(shape)
+            raise ValueError(f"Unsupported float8 type: {dtype_str} (upgrade PyTorch to support float8 types)")
+
+
+def load_safetensors(
+    path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: Optional[torch.dtype] = torch.float32
+) -> dict[str, torch.Tensor]:
+    """Load a safetensors file into a dict of tensors, casting them to `dtype` unless it is None."""
+    if disable_mmap:
+        # read tensors one by one with MemoryEfficientSafeOpen instead of the
+        # safetensors load_file path (experimental)
+        state_dict = {}
+        with MemoryEfficientSafeOpen(path) as f:
+            for key in f.keys():
+                state_dict[key] = f.get_tensor(key).to(device, dtype=dtype)
+        return state_dict
+    else:
+        try:
+            state_dict = load_file(path, device=device)
+        except Exception:  # not a bare except: KeyboardInterrupt / SystemExit must still propagate
+            state_dict = load_file(path)  # prevent device invalid Error
+        if dtype is not None:
+            for key in state_dict.keys():
+                state_dict[key] = state_dict[key].to(dtype=dtype)
+        return state_dict
diff --git a/utils/sai_model_spec.py b/utils/sai_model_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..70f70684cf60276ed9cca0e4d18fc06b6844af91
--- /dev/null
+++ b/utils/sai_model_spec.py
@@ -0,0 +1,278 @@
+# based on https://github.com/Stability-AI/ModelSpec
+import datetime
+import hashlib
+from io import BytesIO
+import os
+from typing import List, Optional, Tuple, Union
+import safetensors
+import logging
+
+from dataset.image_video_dataset import ARCHITECTURE_HUNYUAN_VIDEO, ARCHITECTURE_WAN
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+r"""
+# Metadata Example
+metadata = {
+    # === Must ===
+    "modelspec.sai_model_spec": "1.0.0", # Required version ID for the spec
+    "modelspec.architecture": "stable-diffusion-xl-v1-base", # Architecture, reference the ID of the original model of the arch to match the ID
+    "modelspec.implementation": "sgm",
+    "modelspec.title": "Example Model Version 1.0", # Clean, human-readable title. May use your own phrasing/language/etc
+    # === Should ===
+    "modelspec.author": "Example Corp", # Your name or company name
+    "modelspec.description": "This is my example model to show you how to do it!", # Describe the model in your own words/language/etc. Focus on what users need to know
+    "modelspec.date": "2023-07-20", # ISO-8601 compliant date of when the model was created
+    # === Can ===
+    "modelspec.license": "ExampleLicense-1.0", # eg CreativeML Open RAIL, etc.
+    "modelspec.usage_hint": "Use keyword 'example'" # In your own language, very short hints about how the user should use the model
+}
+"""
+
+BASE_METADATA = {
+    # === Must ===
+    "modelspec.sai_model_spec": "1.0.0",  # Required version ID for the spec
+    "modelspec.architecture": None,
+    "modelspec.implementation": None,
+    "modelspec.title": None,
+    "modelspec.resolution": None,
+    # === Should ===
+    "modelspec.description": None,
+    "modelspec.author": None,
+    "modelspec.date": None,
+    # === Can ===
+    "modelspec.license": None,
+    "modelspec.tags": None,
+    "modelspec.merged_from": None,
+    "modelspec.prediction_type": None,
+    "modelspec.timestep_range": None,
+    "modelspec.encoder_layer": None,
+}
+
+# Define only the keys that are referenced individually elsewhere
+MODELSPEC_TITLE = "modelspec.title"
+
+ARCH_HUNYUAN_VIDEO = "hunyuan-video"
+
+# Official Wan2.1 weights does not have sai_model_spec, so we use this as an architecture name
+ARCH_WAN = "wan2.1"
+
+ADAPTER_LORA = "lora"
+
+IMPL_HUNYUAN_VIDEO = "https://github.com/Tencent/HunyuanVideo"
+
+PRED_TYPE_EPSILON = "epsilon"
+# PRED_TYPE_V = "v"
+
+
+def load_bytes_in_safetensors(tensors):
+    """Serialize `tensors` to safetensors bytes and return only the bytes that follow the header."""
+    import safetensors.torch  # the module-level `import safetensors` does not load the torch submodule
+
+    b = BytesIO(safetensors.torch.save(tensors))
+    header = b.read(8)  # first 8 bytes: little-endian length of the JSON header
+    n = int.from_bytes(header, "little")
+
+    offset = n + 8
+    b.seek(offset)
+
+    return b.read()
+
+
+def precalculate_safetensors_hashes(state_dict):
+    # calculate each tensor one by one to reduce memory usage
+    hash_sha256 = hashlib.sha256()
+    for tensor in state_dict.values():
+        single_tensor_sd = {"tensor": tensor}
+        bytes_for_tensor = load_bytes_in_safetensors(single_tensor_sd)
+        hash_sha256.update(bytes_for_tensor)
+
+    return f"0x{hash_sha256.hexdigest()}"
+
+
+def update_hash_sha256(metadata: dict, state_dict: dict):
+    raise NotImplementedError
+
+
+def build_metadata(
+    state_dict: Optional[dict],
+    architecture: str,
+    timestamp: float,
+    title: Optional[str] = None,
+    reso: Optional[Union[int, Tuple[int, int]]] = None,
+    author: Optional[str] = None,
+    description: Optional[str] = None,
+    license: Optional[str] = None,
+    tags: Optional[str] = None,
+    merged_from: Optional[str] = None,
+    timesteps: Optional[Tuple[int, int]] = None,
+    is_lora: bool = True,
+):
+    """Build a modelspec metadata dict; optional fields whose argument is None are removed from it."""
+    metadata = dict(BASE_METADATA)
+
+    # TODO implement if we can calculate hash without loading all tensors
+    # if state_dict is not None:
+    # hash = precalculate_safetensors_hashes(state_dict)
+    # metadata["modelspec.hash_sha256"] = hash
+
+    # map the dataset architecture constant to the modelspec architecture name
+    if architecture == ARCHITECTURE_HUNYUAN_VIDEO:
+        arch = ARCH_HUNYUAN_VIDEO
+    elif architecture == ARCHITECTURE_WAN:
+        arch = ARCH_WAN
+    else:
+        raise ValueError(f"Unknown architecture: {architecture}")
+
+    if is_lora:
+        arch += f"/{ADAPTER_LORA}"
+    metadata["modelspec.architecture"] = arch
+
+    impl = IMPL_HUNYUAN_VIDEO
+    metadata["modelspec.implementation"] = impl
+
+    if title is None:
+        title = "LoRA" if is_lora else "Hunyuan-Video"
+        title += f"@{timestamp}"
+    metadata[MODELSPEC_TITLE] = title
+
+    if author is not None:
+        metadata["modelspec.author"] = author
+    else:
+        del metadata["modelspec.author"]
+
+    if description is not None:
+        metadata["modelspec.description"] = description
+    else:
+        del metadata["modelspec.description"]
+
+    if merged_from is not None:
+        metadata["modelspec.merged_from"] = merged_from
+    else:
+        del metadata["modelspec.merged_from"]
+
+    if license is not None:
+        metadata["modelspec.license"] = license
+    else:
+        del metadata["modelspec.license"]
+
+    if tags is not None:
+        metadata["modelspec.tags"] = tags
+    else:
+        del metadata["modelspec.tags"]
+
+    # remove microsecond from time
+    int_ts = int(timestamp)
+
+    # time to iso-8601 compliant date
+    date = datetime.datetime.fromtimestamp(int_ts).isoformat()
+    metadata["modelspec.date"] = date
+
+    if reso is not None:
+        # accept "W,H" / "N" strings, a bare int, or a 1- or 2-element sequence
+        if isinstance(reso, str):
+            reso = tuple(map(int, reso.split(",")))
+        if isinstance(reso, int):  # must be checked before len(): len() of an int raises TypeError
+            reso = (reso, reso)
+        elif len(reso) == 1:
+            reso = (reso[0], reso[0])
+    else:
+        # resolution is defined in dataset, so use default
+        reso = (1280, 720)
+
+    metadata["modelspec.resolution"] = f"{reso[0]}x{reso[1]}"
+
+    # metadata["modelspec.prediction_type"] = PRED_TYPE_EPSILON
+    del metadata["modelspec.prediction_type"]
+
+    if timesteps is not None:
+        if isinstance(timesteps, str) or isinstance(timesteps, int):
+            timesteps = (timesteps, timesteps)
+        if len(timesteps) == 1:
+            timesteps = (timesteps[0], timesteps[0])
+        metadata["modelspec.timestep_range"] = f"{timesteps[0]},{timesteps[1]}"
+    else:
+        del metadata["modelspec.timestep_range"]
+
+    # if clip_skip is not None:
+    #     metadata["modelspec.encoder_layer"] = f"{clip_skip}"
+    # else:
+    del metadata["modelspec.encoder_layer"]
+
+    # # assert all values are filled
+    # assert all([v is not None for v in metadata.values()]), metadata
+    if not all([v is not None for v in metadata.values()]):
+        logger.error(f"Internal error: some metadata values are None: {metadata}")
+
+    return metadata
+
+
+# region utils
+
+
+def get_title(metadata: dict) -> Optional[str]:
+    return metadata.get(MODELSPEC_TITLE, None)
+
+
+def load_metadata_from_safetensors(model: str) -> dict:
+    if not model.endswith(".safetensors"):
+        return {}
+
+    with safetensors.safe_open(model, framework="pt") as f:
+        metadata = f.metadata()
+    if metadata is None:
+        metadata = {}
+    return metadata
+
+
+def build_merged_from(models: List[str]) -> str:
+    def get_title(model: str):
+        metadata = load_metadata_from_safetensors(model)
+        title = metadata.get(MODELSPEC_TITLE, None)
+        if title is None:
+            title = os.path.splitext(os.path.basename(model))[0]  # use filename
+        return title
+
+    titles = [get_title(model) for model in models]
+    return ", ".join(titles)
+
+
+# endregion
+
+
+r"""
+if __name__ == "__main__":
+    import argparse
+    import torch
+    from safetensors.torch import load_file
+    from library import train_util
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ckpt", type=str, required=True)
+    args = parser.parse_args()
+
+    print(f"Loading {args.ckpt}")
+    state_dict = load_file(args.ckpt)
+
+    print(f"Calculating metadata")
+    metadata = get(state_dict, False, False, False, False, "sgm", False, False, "title", "date", 256, 1000, 0)
+    print(metadata)
+    del state_dict
+
+    # by reference implementation
+    with open(args.ckpt, mode="rb") as file_data:
+        file_hash = hashlib.sha256()
+        head_len = struct.unpack("Q", file_data.read(8))  # int64 header length prefix
+        header = json.loads(file_data.read(head_len[0]))  # header itself, json string
+        content = (
+            file_data.read()
+        )  # All other content is tightly packed tensors. Copy to RAM for simplicity, but you can avoid this read with a more careful FS-dependent impl.
+        file_hash.update(content)
+        # ===== Update the hash for modelspec =====
+        by_ref = f"0x{file_hash.hexdigest()}"
+    print(by_ref)
+    print("is same?", by_ref == metadata["modelspec.hash_sha256"])
+
+"""
diff --git a/utils/train_utils.py b/utils/train_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d6af9ae69c5f4748406ae96577f373ae6df5da1
--- /dev/null
+++ b/utils/train_utils.py
@@ -0,0 +1,177 @@
+import argparse
+import logging
+import os
+import shutil
+
+import accelerate
+import torch
+
+from utils import huggingface_utils
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+# checkpoint file names
+EPOCH_STATE_NAME = "{}-{:06d}-state"
+EPOCH_FILE_NAME = "{}-{:06d}"
+EPOCH_DIFFUSERS_DIR_NAME = "{}-{:06d}"
+LAST_STATE_NAME = "{}-state"
+STEP_STATE_NAME = "{}-step{:08d}-state"
+STEP_FILE_NAME = "{}-step{:08d}"
+STEP_DIFFUSERS_DIR_NAME = "{}-step{:08d}"
+
+
+def get_sanitized_config_or_none(args: argparse.Namespace):
+    """Return a copy of ``args`` that is safe to log, or None when ``--log_config`` is off.
+
+    API keys/tokens and path-like arguments are dropped. Values that are None, bool,
+    str, float or int are kept unchanged; every other value (lists, arbitrary objects)
+    is replaced by its string form, because Accelerate only accepts the simple types.
+    Even when wandb is not enabled and the log stays private, the sensitive values are
+    filtered out to be safe.
+    """
+    if not args.log_config:
+        return None
+
+    excluded = {
+        # credentials
+        "wandb_api_key",
+        "huggingface_token",
+        # paths
+        "dit",
+        "vae",
+        "text_encoder1",
+        "text_encoder2",
+        "base_weights",
+        "network_weights",
+        "output_dir",
+        "logging_dir",
+    }
+    passthrough_types = (bool, str, float, int)
+
+    sanitized = {}
+    for name, value in vars(args).items():
+        if name in excluded:
+            continue
+        if value is None or isinstance(value, passthrough_types):
+            sanitized[name] = value
+        else:
+            # Accelerate supports neither lists nor objects: log their string form
+            sanitized[name] = f"{value}"
+
+    return sanitized
+
+
+class LossRecorder:
+    """Keep the latest loss of every step and expose their running mean.
+
+    During epoch 0 each loss is appended. In later epochs the value stored for
+    ``step`` is replaced, so the mean always covers the most recent loss of each step.
+    """
+
+    def __init__(self):
+        # most recent loss per step index
+        self.loss_list: list[float] = []
+        # sum of the values currently held in loss_list
+        self.loss_total: float = 0.0
+
+    def add(self, *, epoch: int, step: int, loss: float) -> None:
+        """Record ``loss`` for ``step``.
+
+        Args:
+            epoch: epoch index; 0 appends, any other value overwrites slot ``step``.
+            step: step index inside the epoch (only used when ``epoch`` is not 0).
+            loss: loss value to record.
+        """
+        if epoch == 0:
+            self.loss_list.append(loss)
+        else:
+            # pad with zeros when this step index has not been recorded yet
+            while len(self.loss_list) <= step:
+                self.loss_list.append(0.0)
+            self.loss_total -= self.loss_list[step]
+            self.loss_list[step] = loss
+        self.loss_total += loss
+
+    @property
+    def moving_average(self) -> float:
+        """Mean of the recorded losses; 0.0 when nothing has been recorded yet."""
+        if not self.loss_list:
+            # avoid ZeroDivisionError when queried before the first add()
+            return 0.0
+        return self.loss_total / len(self.loss_list)
+
+
+def get_epoch_ckpt_name(model_name, epoch_no: int):
+    """Return the ``.safetensors`` file name of the checkpoint saved at ``epoch_no``."""
+    stem = EPOCH_FILE_NAME.format(model_name, epoch_no)
+    return f"{stem}.safetensors"
+
+
+def get_step_ckpt_name(model_name, step_no: int):
+    """Return the ``.safetensors`` file name of the checkpoint saved at ``step_no``."""
+    stem = STEP_FILE_NAME.format(model_name, step_no)
+    return f"{stem}.safetensors"
+
+
+def get_last_ckpt_name(model_name):
+    """Return the ``.safetensors`` file name of the final checkpoint."""
+    extension = ".safetensors"
+    return model_name + extension
+
+
+def get_remove_epoch_no(args: argparse.Namespace, epoch_no: int):
+    """Return the epoch number whose checkpoint should be removed, or None.
+
+    None is returned when ``args.save_last_n_epochs`` is unset, or when the computed
+    epoch number would be negative.
+    """
+    keep_count = args.save_last_n_epochs
+    if keep_count is None:
+        return None
+
+    candidate = epoch_no - args.save_every_n_epochs * keep_count
+    return candidate if candidate >= 0 else None
+
+
+def get_remove_step_no(args: argparse.Namespace, step_no: int):
+    """Return the step number whose checkpoint should be removed, or None.
+
+    The step lying ``save_last_n_steps`` before ``step_no`` is rounded down to a
+    multiple of ``save_every_n_steps``. For example, with save_every_n_steps=10 and
+    save_last_n_steps=30, at step 50 the last 30 steps are kept and step 10 is removed.
+    None is returned when ``args.save_last_n_steps`` is unset or the result is negative.
+    """
+    keep_steps = args.save_last_n_steps
+    if keep_steps is None:
+        return None
+
+    candidate = step_no - keep_steps - 1
+    # snap down to the closest multiple of the save interval
+    candidate -= candidate % args.save_every_n_steps
+    return None if candidate < 0 else candidate
+
+
+def save_and_remove_state_on_epoch_end(args: argparse.Namespace, accelerator: accelerate.Accelerator, epoch_no: int):
+    """Save the accelerator state for ``epoch_no`` and remove the state that is no longer retained.
+
+    The state is written to ``args.output_dir`` under a name built from
+    ``EPOCH_STATE_NAME`` and is uploaded through ``huggingface_utils.upload`` when
+    ``args.save_state_to_huggingface`` is set. The number of retained states comes from
+    ``args.save_last_n_epochs_state``, falling back to ``args.save_last_n_epochs``.
+    """
+    output_name = args.output_name
+    state_name = EPOCH_STATE_NAME.format(output_name, epoch_no)
+
+    logger.info("")
+    logger.info(f"saving state at epoch {epoch_no}")
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    state_dir = os.path.join(args.output_dir, state_name)
+    accelerator.save_state(state_dir)
+    if args.save_state_to_huggingface:
+        logger.info("uploading state to huggingface.")
+        huggingface_utils.upload(args, state_dir, "/" + state_name)
+
+    # the state-specific setting wins; otherwise reuse the checkpoint setting
+    keep_count = args.save_last_n_epochs_state or args.save_last_n_epochs
+    if keep_count is None:
+        return
+
+    expired_epoch = epoch_no - args.save_every_n_epochs * keep_count
+    expired_dir = os.path.join(args.output_dir, EPOCH_STATE_NAME.format(output_name, expired_epoch))
+    if os.path.exists(expired_dir):
+        logger.info(f"removing old state: {expired_dir}")
+        shutil.rmtree(expired_dir)
+
+
+def save_and_remove_state_stepwise(args: argparse.Namespace, accelerator: accelerate.Accelerator, step_no: int):
+    """Save the accelerator state for ``step_no`` and remove the state that is no longer retained.
+
+    The state is written to ``args.output_dir`` under a name built from
+    ``STEP_STATE_NAME`` and is uploaded through ``huggingface_utils.upload`` when
+    ``args.save_state_to_huggingface`` is set. The retention window comes from
+    ``args.save_last_n_steps_state``, falling back to ``args.save_last_n_steps``.
+    """
+    output_name = args.output_name
+    state_name = STEP_STATE_NAME.format(output_name, step_no)
+
+    logger.info("")
+    logger.info(f"saving state at step {step_no}")
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    state_dir = os.path.join(args.output_dir, state_name)
+    accelerator.save_state(state_dir)
+    if args.save_state_to_huggingface:
+        logger.info("uploading state to huggingface.")
+        huggingface_utils.upload(args, state_dir, "/" + state_name)
+
+    # the state-specific setting wins; otherwise reuse the checkpoint setting
+    keep_steps = args.save_last_n_steps_state or args.save_last_n_steps
+    if keep_steps is None:
+        return
+
+    # take the step keep_steps back and round it down to a multiple of save_every_n_steps
+    expired_step = step_no - keep_steps - 1
+    expired_step -= expired_step % args.save_every_n_steps
+    if expired_step <= 0:
+        return
+
+    expired_dir = os.path.join(args.output_dir, STEP_STATE_NAME.format(output_name, expired_step))
+    if os.path.exists(expired_dir):
+        logger.info(f"removing old state: {expired_dir}")
+        shutil.rmtree(expired_dir)
+
+
+def save_state_on_train_end(args: argparse.Namespace, accelerator: accelerate.Accelerator):
+    """Save the final accelerator state and upload it when requested.
+
+    The state is written to ``args.output_dir`` under a name built from
+    ``LAST_STATE_NAME`` and is uploaded through ``huggingface_utils.upload`` when
+    ``args.save_state_to_huggingface`` is set.
+    """
+    state_name = LAST_STATE_NAME.format(args.output_name)
+
+    logger.info("")
+    logger.info("saving last state.")
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    state_dir = os.path.join(args.output_dir, state_name)
+    accelerator.save_state(state_dir)
+
+    if not args.save_state_to_huggingface:
+        return
+    logger.info("uploading last state to huggingface.")
+    huggingface_utils.upload(args, state_dir, "/" + state_name)
+
diff --git a/wan/__init__.py b/wan/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f7ed4df10cae220744639f079b4e11d985f9d05
--- /dev/null
+++ b/wan/__init__.py
@@ -0,0 +1 @@
+# from . import configs, distributed, modules
diff --git a/wan/configs/__init__.py b/wan/configs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c72d2d01be834882d659701fc0dc67beb152383f
--- /dev/null
+++ b/wan/configs/__init__.py
@@ -0,0 +1,42 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import copy
+import os
+
+# Set at import time, before the config modules below are imported.
+# NOTE(review): presumably read by the Hugging Face `tokenizers` library to turn off its parallelism - confirm.
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+from .wan_i2v_14B import i2v_14B
+from .wan_t2v_1_3B import t2v_1_3B
+from .wan_t2v_14B import t2v_14B
+
+# the config of t2i_14B is the same as t2v_14B; a deep copy is taken so that
+# renaming it below does not modify t2v_14B
+t2i_14B = copy.deepcopy(t2v_14B)
+t2i_14B.__name__ = 'Config: Wan T2I 14B'
+
+# task name -> model config
+WAN_CONFIGS = {
+    't2v-14B': t2v_14B,
+    't2v-1.3B': t2v_1_3B,
+    'i2v-14B': i2v_14B,
+    't2i-14B': t2i_14B,
+}
+
+# size string "A*B" -> (A, B) integer tuple
+SIZE_CONFIGS = {
+    '720*1280': (720, 1280),
+    '1280*720': (1280, 720),
+    '480*832': (480, 832),
+    '832*480': (832, 480),
+    '1024*1024': (1024, 1024),
+}
+
+# size string "A*B" -> the product A * B
+# NOTE(review): there is no '1024*1024' entry here although SIZE_CONFIGS has one - confirm this is intended.
+MAX_AREA_CONFIGS = {
+    '720*1280': 720 * 1280,
+    '1280*720': 1280 * 720,
+    '480*832': 480 * 832,
+    '832*480': 832 * 480,
+}
+
+# task name -> size strings accepted for that task; t2i-14B accepts every key of SIZE_CONFIGS
+SUPPORTED_SIZES = {
+    't2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
+    't2v-1.3B': ('480*832', '832*480'),
+    'i2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
+    't2i-14B': tuple(SIZE_CONFIGS.keys()),
+}
diff --git a/wan/configs/shared_config.py b/wan/configs/shared_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..04a9f454218fc1ce958b628e71ad5738222e2aa4
--- /dev/null
+++ b/wan/configs/shared_config.py
@@ -0,0 +1,19 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import torch
+from easydict import EasyDict
+
+#------------------------ Wan shared config ------------------------#
+# Settings common to all Wan model configs; the per-model config modules copy
+# these entries in with `.update(wan_shared_cfg)`.
+wan_shared_cfg = EasyDict()
+
+# t5
+wan_shared_cfg.t5_model = 'umt5_xxl'
+wan_shared_cfg.t5_dtype = torch.bfloat16
+# NOTE(review): presumably the maximum text length in tokens - confirm against the text encoder code
+wan_shared_cfg.text_len = 512
+
+# transformer
+wan_shared_cfg.param_dtype = torch.bfloat16
+
+# inference
+wan_shared_cfg.num_train_timesteps = 1000
+wan_shared_cfg.sample_fps = 16
+# default negative prompt for sampling (Chinese text)
+wan_shared_cfg.sample_neg_prompt = '色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走'
diff --git a/wan/configs/wan_i2v_14B.py b/wan/configs/wan_i2v_14B.py
new file mode 100644
index 0000000000000000000000000000000000000000..12e8e205bffb343a6e27d2828fb573db1d6349f8
--- /dev/null
+++ b/wan/configs/wan_i2v_14B.py
@@ -0,0 +1,35 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import torch
+from easydict import EasyDict
+
+from .shared_config import wan_shared_cfg
+
+#------------------------ Wan I2V 14B ------------------------#
+
+# Start from the shared settings, then add the I2V 14B specific entries below.
+i2v_14B = EasyDict(__name__='Config: Wan I2V 14B')
+i2v_14B.update(wan_shared_cfg)
+
+# t5: checkpoint file name and tokenizer identifier
+i2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
+i2v_14B.t5_tokenizer = 'google/umt5-xxl'
+
+# clip (this config is the only one of the three model configs that defines clip_* entries)
+i2v_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
+i2v_14B.clip_dtype = torch.float16
+i2v_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
+i2v_14B.clip_tokenizer = 'xlm-roberta-large'
+
+# vae
+i2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
+# NOTE(review): presumably (temporal, height, width) downsampling factors - confirm against the VAE code
+i2v_14B.vae_stride = (4, 8, 8)
+
+# transformer
+# NOTE(review): patch_size presumably uses the same (temporal, height, width) order - confirm
+i2v_14B.patch_size = (1, 2, 2)
+i2v_14B.dim = 5120
+i2v_14B.ffn_dim = 13824
+i2v_14B.freq_dim = 256
+i2v_14B.num_heads = 40
+i2v_14B.num_layers = 40
+# NOTE(review): (-1, -1) presumably means no sliding-window local attention - confirm against the model code
+i2v_14B.window_size = (-1, -1)
+i2v_14B.qk_norm = True
+i2v_14B.cross_attn_norm = True
+i2v_14B.eps = 1e-6
diff --git a/wan/configs/wan_t2v_14B.py b/wan/configs/wan_t2v_14B.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d0ee69dea796bfd6eccdedf4ec04835086227a6
--- /dev/null
+++ b/wan/configs/wan_t2v_14B.py
@@ -0,0 +1,29 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+from easydict import EasyDict
+
+from .shared_config import wan_shared_cfg
+
+#------------------------ Wan T2V 14B ------------------------#
+
+# Start from the shared settings, then add the T2V 14B specific entries below.
+t2v_14B = EasyDict(__name__='Config: Wan T2V 14B')
+t2v_14B.update(wan_shared_cfg)
+
+# t5: checkpoint file name and tokenizer identifier
+t2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
+t2v_14B.t5_tokenizer = 'google/umt5-xxl'
+
+# vae
+t2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
+# NOTE(review): presumably (temporal, height, width) downsampling factors - confirm against the VAE code
+t2v_14B.vae_stride = (4, 8, 8)
+
+# transformer
+# NOTE(review): patch_size presumably uses the same (temporal, height, width) order - confirm
+t2v_14B.patch_size = (1, 2, 2)
+t2v_14B.dim = 5120
+t2v_14B.ffn_dim = 13824
+t2v_14B.freq_dim = 256
+t2v_14B.num_heads = 40
+t2v_14B.num_layers = 40
+# NOTE(review): (-1, -1) presumably means no sliding-window local attention - confirm against the model code
+t2v_14B.window_size = (-1, -1)
+t2v_14B.qk_norm = True
+t2v_14B.cross_attn_norm = True
+t2v_14B.eps = 1e-6
diff --git a/wan/configs/wan_t2v_1_3B.py b/wan/configs/wan_t2v_1_3B.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea9502b0df685b5d22f9091cc8cdf5c6a7880c4b
--- /dev/null
+++ b/wan/configs/wan_t2v_1_3B.py
@@ -0,0 +1,29 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+from easydict import EasyDict
+
+from .shared_config import wan_shared_cfg
+
+#------------------------ Wan T2V 1.3B ------------------------#
+
+# Start from the shared settings, then add the T2V 1.3B specific entries below.
+t2v_1_3B = EasyDict(__name__='Config: Wan T2V 1.3B')
+t2v_1_3B.update(wan_shared_cfg)
+
+# t5: checkpoint file name and tokenizer identifier
+t2v_1_3B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
+t2v_1_3B.t5_tokenizer = 'google/umt5-xxl'
+
+# vae
+t2v_1_3B.vae_checkpoint = 'Wan2.1_VAE.pth'
+# NOTE(review): presumably (temporal, height, width) downsampling factors - confirm against the VAE code
+t2v_1_3B.vae_stride = (4, 8, 8)
+
+# transformer (smaller dim / ffn_dim / num_heads / num_layers than the 14B configs)
+# NOTE(review): patch_size presumably uses the same (temporal, height, width) order - confirm
+t2v_1_3B.patch_size = (1, 2, 2)
+t2v_1_3B.dim = 1536
+t2v_1_3B.ffn_dim = 8960
+t2v_1_3B.freq_dim = 256
+t2v_1_3B.num_heads = 12
+t2v_1_3B.num_layers = 30
+# NOTE(review): (-1, -1) presumably means no sliding-window local attention - confirm against the model code
+t2v_1_3B.window_size = (-1, -1)
+t2v_1_3B.qk_norm = True
+t2v_1_3B.cross_attn_norm = True
+t2v_1_3B.eps = 1e-6
diff --git a/wan/modules/__init__.py b/wan/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8935bbb45ab4e3f349d203b673102f7cfc07553
--- /dev/null
+++ b/wan/modules/__init__.py
@@ -0,0 +1,16 @@
+from .attention import flash_attention
+from .model import WanModel
+from .t5 import T5Decoder, T5Encoder, T5EncoderModel, T5Model
+from .tokenizers import HuggingfaceTokenizer
+from .vae import WanVAE
+
+__all__ = [
+    'WanVAE',
+    'WanModel',
+    'T5Model',
+    'T5Encoder',
+    'T5Decoder',
+    'T5EncoderModel',
+    'HuggingfaceTokenizer',
+    'flash_attention',
+]
diff --git a/wan/modules/attention.py b/wan/modules/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..7653f7c7c1ceee172f6fd32686fa038dff3472dc
--- /dev/null
+++ b/wan/modules/attention.py
@@ -0,0 +1,312 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+from typing import Optional
+import torch
+
+try:
+    import flash_attn_interface
+
+    FLASH_ATTN_3_AVAILABLE = True
+except ModuleNotFoundError:
+    FLASH_ATTN_3_AVAILABLE = False
+
+try:
+    import flash_attn
+
+    FLASH_ATTN_2_AVAILABLE = True
+except ModuleNotFoundError:
+    FLASH_ATTN_2_AVAILABLE = False
+
+try:
+    import sageattention
+
+    SAGE_ATTN_AVAILABLE = True
+except ModuleNotFoundError:
+    SAGE_ATTN_AVAILABLE = False
+
+try:
+    import xformers.ops as xops
+
+    XFORMERS_AVAILABLE = True
+except ImportError:
+    XFORMERS_AVAILABLE = False
+
+
+import warnings
+
+__all__ = [
+    "flash_attention",
+    "attention",
+]
+
+
+def flash_attention(
+    qkv,
+    q_lens=None,
+    k_lens=None,
+    dropout_p=0.0,
+    softmax_scale=None,
+    q_scale=None,
+    causal=False,
+    window_size=(-1, -1),
+    deterministic=False,
+    dtype=torch.bfloat16,
+    version=None,
+    attn_mode: Optional[str] = "torch",
+    split_attn: bool = False,
+):
+    """
+    Compute attention with the backend selected by ``attn_mode``.
+
+    qkv:            list [q, k, v]. The list is emptied by this call (``qkv.clear()``), so the
+                    caller's list no longer references the tensors afterwards.
+        q:          [B, Lq, Nq, C1].
+        k:          [B, Lk, Nk, C1].
+        v:          [B, Lk, Nk, C2]. Nq must be divisible by Nk.
+    q_lens:         [B]. Must be None unless attn_mode is "flash3" or "sageattn".
+    k_lens:         [B]. Unless attn_mode is "flash3" or "sageattn", must be None or all equal to Lk.
+    dropout_p:      float. Dropout probability. Not passed to the "flash3" / "sageattn" backends.
+    softmax_scale:  float. The scaling of QK^T before applying softmax.
+    q_scale:        optional factor multiplied into q before attention.
+    causal:         bool. Whether to apply causal attention mask. Rejected for "xformers" and "sageattn".
+    window_size:    (left right). If not (-1, -1), apply sliding window local attention.
+                    Only passed to the "flash" / "flash2" backend.
+    deterministic:  bool. If True, slightly slower and uses more memory. Rejected for SDPA and xformers.
+    dtype:          torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16.
+    version:        not read by the active code of this function.
+    attn_mode:      "torch" / "sdpa", "flash" / "flash2", "xformers", "flash3" or "sageattn".
+                    Any other value raises ValueError.
+    split_attn:     bool. If True, run the backend one batch element at a time.
+                    Not supported for "flash3" / "sageattn".
+
+    Returns the attention output cast back to the original dtype of q.
+    """
+    q, k, v = qkv
+    # drop the references held by the caller's list
+    qkv.clear()
+
+    half_dtypes = (torch.float16, torch.bfloat16)
+    assert dtype in half_dtypes
+    # assert q.device.type == "cuda" and q.size(-1) <= 256
+
+    # params
+    b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
+
+    def half(x):
+        # leave fp16/bf16 tensors untouched, cast everything else to `dtype`
+        return x if x.dtype in half_dtypes else x.to(dtype)
+
+    # We cannot test Flash attention 3 in musubi tuner, so keep the original code.
+    # The customized code paths (everything except flash attention 3 / sage attention) do not support q_lens and k_lens.
+    if attn_mode != "flash3" and attn_mode != "sageattn":
+        assert q_lens is None, "q_lens is not supported except for flash attention 3."
+        assert k_lens is None or (
+            min(k_lens) == max(k_lens) and k_lens[0] == lk
+        ), "k_lens is not supported except for flash attention 3."
+
+    # SDPA
+    if attn_mode == "torch" or attn_mode == "sdpa":
+        assert not deterministic, "deterministic is not supported in scaled_dot_product_attention."
+        if q_scale is not None:
+            q = q * q_scale
+        # swap the sequence and head dimensions (dims 1 and 2) before calling SDPA
+        q = half(q.transpose(1, 2))
+        k = half(k.transpose(1, 2))
+        v = half(v.transpose(1, 2))
+
+        if not split_attn:
+            q = torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, is_causal=causal, dropout_p=dropout_p, scale=softmax_scale
+            )
+            x = q
+        else:
+            # process one batch element at a time
+            x = torch.empty_like(q)
+            for i in range(q.size(0)):
+                x[i : i + 1] = torch.nn.functional.scaled_dot_product_attention(
+                    q[i : i + 1], k[i : i + 1], v[i : i + 1], is_causal=causal, dropout_p=dropout_p, scale=softmax_scale
+                )
+
+        del q, k, v
+        # swap dims 1 and 2 back to the input layout
+        x = x.transpose(1, 2).contiguous()
+        return x.type(out_dtype)
+
+    # flash attention 2
+    if attn_mode == "flash" or attn_mode == "flash2":
+        if q_scale is not None:
+            q = q * q_scale
+        q = half(q)
+        k = half(k)
+        v = half(v)
+
+        if not split_attn:
+            q = flash_attn.flash_attn_func(q, k, v, dropout_p, softmax_scale, causal, window_size, deterministic=deterministic)
+            x = q
+        else:
+            # process one batch element at a time
+            x = torch.empty_like(q)
+            for i in range(q.size(0)):
+                x[i : i + 1] = flash_attn.flash_attn_func(
+                    q[i : i + 1],
+                    k[i : i + 1],
+                    v[i : i + 1],
+                    dropout_p,
+                    softmax_scale,
+                    causal,
+                    window_size,
+                    deterministic=deterministic,
+                )
+        del q, k, v
+        return x.type(out_dtype)
+
+    # xformers
+    if attn_mode == "xformers":
+        assert not deterministic, "deterministic is not supported in xformers."
+        assert not causal, "causal is not supported in xformers."
+        if q_scale is not None:
+            q = q * q_scale
+        q = half(q)
+        k = half(k)
+        v = half(v)
+
+        if not split_attn:
+            q = xops.memory_efficient_attention(q, k, v, p=dropout_p, scale=softmax_scale)
+            x = q
+        else:
+            # process one batch element at a time
+            x = torch.empty_like(q)
+            for i in range(q.size(0)):
+                x[i : i + 1] = xops.memory_efficient_attention(
+                    q[i : i + 1], k[i : i + 1], v[i : i + 1], p=dropout_p, scale=softmax_scale
+                )
+
+        del q, k, v
+        return x.type(out_dtype)
+
+    # sage attention with fixed length seems to cause NaN in I2V inference.
+    # # sage attention
+    # if attn_mode == "sageattn":
+    #     print("Using sage attention")
+    #     assert not deterministic, "deterministic is not supported in sage attention."
+    #     if q_scale is not None:
+    #         q = q * q_scale
+    #     q, k, v = half(q), half(k), half(v)
+    #     x = sageattention.sageattn(q, k, v, "NHD", is_causal=causal, sm_scale=softmax_scale)
+    #     del q, k, v
+    #     return x.type(out_dtype)
+
+    # From here on only the variable-length backends ("flash3", "sageattn") remain.
+    assert not split_attn, "split_attn is not supported in flash attention 3 or sage attention."
+
+    # preprocess query: in Wan 2.1, q_lens is always None.
+    if q_lens is None:
+        # merge the batch and sequence dimensions; every sequence has full length lq
+        q = half(q.flatten(0, 1))
+        q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(device=q.device, non_blocking=True)
+    else:
+        # keep only the first q_lens[i] positions of each sequence and concatenate them
+        q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)]))
+
+    # preprocess key, value
+    if k_lens is None:
+        k = half(k.flatten(0, 1))
+        v = half(v.flatten(0, 1))
+        k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(device=k.device, non_blocking=True)
+    else:
+        # Note: in Wan 2.1, all k_lens are same if we have same image size in the batch.
+        if min(k_lens) == max(k_lens) and k.shape[1] == k_lens[0]:
+            # B, L, N, C -> B*L, N, C (flatten merges dims 0 and 1)
+            k = half(k.flatten(0, 1))
+            v = half(v.flatten(0, 1))
+        else:
+            # keep only the first k_lens[i] positions of each sequence and concatenate them
+            k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)]))
+            v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)]))
+
+    # make q and k use the same dtype as v
+    q = q.to(v.dtype)
+    k = k.to(v.dtype)
+
+    if q_scale is not None:
+        q = q * q_scale
+
+    # if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
+    #     warnings.warn("Flash attention 3 is not available, use flash attention 2 instead.")
+
+    # apply attention
+    # if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
+    if attn_mode == "flash3":
+        # Not tested yet in musubi tuner.
+        # Note: dropout_p, window_size are not supported in FA3 now.
+        # cu_seqlens_* are the cumulative sequence lengths with a leading 0
+        x = flash_attn_interface.flash_attn_varlen_func(
+            q=q,
+            k=k,
+            v=v,
+            cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32).to(q.device, non_blocking=True),
+            cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(0, dtype=torch.int32).to(q.device, non_blocking=True),
+            seqused_q=None,
+            seqused_k=None,
+            max_seqlen_q=lq,
+            max_seqlen_k=lk,
+            softmax_scale=softmax_scale,
+            causal=causal,
+            deterministic=deterministic,
+        )[0].unflatten(0, (b, lq))
+    # elif (version is None or version == 2) and FLASH_ATTN_2_AVAILABLE:
+    #     # assert FLASH_ATTN_2_AVAILABLE
+    #     x = flash_attn.flash_attn_varlen_func(
+    #         q=q,
+    #         k=k,
+    #         v=v,
+    #         cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32).to(q.device, non_blocking=True),
+    #         cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(0, dtype=torch.int32).to(q.device, non_blocking=True),
+    #         max_seqlen_q=lq,
+    #         max_seqlen_k=lk,
+    #         dropout_p=dropout_p,
+    #         softmax_scale=softmax_scale,
+    #         causal=causal,
+    #         window_size=window_size,
+    #         deterministic=deterministic,
+    #     ).unflatten(0, (b, lq))
+    # elif version is None and SAGE_ATTN_AVAILABLE:
+    elif attn_mode == "sageattn":
+        # print("Using sage attention")
+        assert not causal, "SAGE attention does not support causal attention."
+        x = sageattention.sageattn_varlen(
+            q=q,
+            k=k,
+            v=v,
+            cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32).to(q.device, non_blocking=True),
+            cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(0, dtype=torch.int32).to(q.device, non_blocking=True),
+            max_seqlen_q=lq,
+            max_seqlen_k=lk,
+            sm_scale=softmax_scale,
+        ).unflatten(0, (b, lq))
+    else:
+        raise ValueError(f"Unknown attention mode: {attn_mode}")
+
+    # output
+    return x.type(out_dtype)
+
+
+def attention(
+    q,
+    k,
+    v,
+    q_lens=None,
+    k_lens=None,
+    dropout_p=0.0,
+    softmax_scale=None,
+    q_scale=None,
+    causal=False,
+    window_size=(-1, -1),
+    deterministic=False,
+    dtype=torch.bfloat16,
+    fa_version=None,
+):
+    """
+    Attention entry point that uses flash attention when it is installed and
+    ``torch.nn.functional.scaled_dot_product_attention`` otherwise.
+
+    q:              [B, Lq, Nq, C1].
+    k:              [B, Lk, Nk, C1].
+    v:              [B, Lk, Nk, C2].
+    q_lens:         [B] or None. Only honoured by the flash attention 3 backend.
+    k_lens:         [B] or None. Only honoured by the flash attention 3 backend.
+    dropout_p:      float. Dropout probability.
+    softmax_scale:  float. The scaling of QK^T before applying softmax.
+    q_scale:        optional factor multiplied into q before attention.
+    causal:         bool. Whether to apply causal attention mask.
+    window_size:    forwarded to flash_attention().
+    deterministic:  forwarded to flash_attention().
+    dtype:          torch.dtype used for the computation when falling back to SDPA;
+                    forwarded to flash_attention() otherwise.
+    fa_version:     None, 2 or 3. Preferred flash attention version.
+
+    Backend selection:
+        * flash attention 3 when it is installed and fa_version is None or 3
+          (or flash attention 2 is not installed);
+        * flash attention 2 when it is installed and no q_lens / k_lens are given,
+          because flash_attention() rejects sequence lengths for that backend;
+        * scaled_dot_product_attention otherwise (sequence lengths are ignored, with a warning).
+    """
+    # flash_attention() takes the tensors as ONE list argument and picks its backend via
+    # attn_mode; calling it with q=/k=/v= keywords raises TypeError.
+    if FLASH_ATTN_3_AVAILABLE and (fa_version is None or fa_version == 3 or not FLASH_ATTN_2_AVAILABLE):
+        attn_mode = "flash3"
+    elif FLASH_ATTN_2_AVAILABLE and q_lens is None and k_lens is None:
+        attn_mode = "flash"
+    else:
+        attn_mode = None
+
+    if attn_mode is not None:
+        return flash_attention(
+            [q, k, v],
+            q_lens=q_lens,
+            k_lens=k_lens,
+            dropout_p=dropout_p,
+            softmax_scale=softmax_scale,
+            q_scale=q_scale,
+            causal=causal,
+            window_size=window_size,
+            deterministic=deterministic,
+            dtype=dtype,
+            version=fa_version,
+            attn_mode=attn_mode,
+        )
+
+    if q_lens is not None or k_lens is not None:
+        warnings.warn(
+            "Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance."
+        )
+    attn_mask = None
+
+    # honour q_scale here as well, as the flash attention path does
+    if q_scale is not None:
+        q = q * q_scale
+
+    # swap the sequence and head dimensions (dims 1 and 2) before calling SDPA
+    q = q.transpose(1, 2).to(dtype)
+    k = k.transpose(1, 2).to(dtype)
+    v = v.transpose(1, 2).to(dtype)
+
+    # scale=None keeps SDPA's default scaling, so passing softmax_scale through is backward-compatible
+    out = torch.nn.functional.scaled_dot_product_attention(
+        q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p, scale=softmax_scale
+    )
+
+    out = out.transpose(1, 2).contiguous()
+    return out
diff --git a/wan/modules/clip.py b/wan/modules/clip.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fbd867678e1d75d402583c91ea97bba74194c52
--- /dev/null
+++ b/wan/modules/clip.py
@@ -0,0 +1,546 @@
+# Modified from ``https://github.com/openai/CLIP'' and ``https://github.com/mlfoundations/open_clip''
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import logging
+import math
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.transforms as T
+from accelerate import init_empty_weights
+
+from .attention import flash_attention
+from .tokenizers import HuggingfaceTokenizer
+from .xlm_roberta import XLMRoberta
+
+from utils.safetensors_utils import load_safetensors
+
+__all__ = [
+    "XLMRobertaCLIP",
+    "clip_xlm_roberta_vit_h_14",
+    "CLIPModel",
+]
+
+
+def pos_interpolate(pos, seq_len):
+    """Resize a positional embedding table for a sequence of ``seq_len`` tokens.
+
+    ``pos`` is indexed as [1, L, C]. When L already equals ``seq_len`` it is returned
+    unchanged. Otherwise the trailing square grid of L is resized with bicubic
+    interpolation to ``int(sqrt(seq_len))`` per side, while the leading tokens that do
+    not belong to the grid are passed through as they are.
+    """
+    cur_len = pos.size(1)
+    if cur_len == seq_len:
+        return pos
+
+    src_grid = int(math.sqrt(cur_len))
+    tar_grid = int(math.sqrt(seq_len))
+    # number of leading tokens that are not part of the square grid
+    num_extra = cur_len - src_grid * src_grid
+
+    extra_tokens = pos[:, :num_extra]
+    # [1, S*S, C] -> [1, C, S, S] so that the grid can be resized like an image
+    grid = pos[:, num_extra:].float().reshape(1, src_grid, src_grid, -1).permute(0, 3, 1, 2)
+    resized = F.interpolate(grid, size=(tar_grid, tar_grid), mode="bicubic", align_corners=False)
+    # [1, C, T, T] -> [1, T*T, C]
+    resized = resized.flatten(2).transpose(1, 2)
+    return torch.cat([extra_tokens, resized], dim=1)
+
+
+class QuickGELU(nn.Module):
+    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""
+
+    def forward(self, x):
+        gate = torch.sigmoid(1.702 * x)
+        return x * gate
+
+
+class LayerNorm(nn.LayerNorm):
+    """``nn.LayerNorm`` that normalizes in float32 and returns the input's type."""
+
+    def forward(self, x):
+        # normalize on a float32 copy, then cast the result back to match x
+        upcast = x.float()
+        normalized = super().forward(upcast)
+        return normalized.type_as(x)
+
+
+class SelfAttention(nn.Module):
+    """Multi-head self-attention over a [B, L, C] input, computed with PyTorch SDPA."""
+
+    def __init__(self, dim, num_heads, causal=False, attn_dropout=0.0, proj_dropout=0.0):
+        assert dim % num_heads == 0
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.causal = causal
+        self.attn_dropout = attn_dropout
+        self.proj_dropout = proj_dropout
+
+        # fused q/k/v projection followed by the output projection
+        self.to_qkv = nn.Linear(dim, dim * 3)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(self, x):
+        """
+        x:   [B, L, C].
+
+        Returns a tensor of the same [B, L, C] shape.
+        """
+        batch, seq_len, channels = x.size()
+
+        # [B, L, 3*C] -> [B, L, 3, N, D] -> three [B, N, L, D] tensors
+        qkv = self.to_qkv(x).view(batch, seq_len, 3, self.num_heads, self.head_dim)
+        q, k, v = (t.transpose(1, 2) for t in qkv.unbind(2))
+
+        # attention dropout is only active in training mode
+        drop = self.attn_dropout if self.training else 0.0
+        out = F.scaled_dot_product_attention(q, k, v, dropout_p=drop, is_causal=self.causal)
+
+        # [B, N, L, D] -> [B, L, C]
+        out = out.transpose(1, 2).contiguous().reshape(batch, seq_len, channels)
+
+        out = self.proj(out)
+        return F.dropout(out, self.proj_dropout, self.training)
+
+
+class SwiGLU(nn.Module):
+    """Gated feed-forward layer: ``fc3(silu(fc1(x)) * fc2(x))``."""
+
+    def __init__(self, dim, mid_dim):
+        super().__init__()
+        self.dim = dim
+        self.mid_dim = mid_dim
+
+        # fc1 feeds the SiLU gate, fc2 the gated value, fc3 projects back to dim
+        self.fc1 = nn.Linear(dim, mid_dim)
+        self.fc2 = nn.Linear(dim, mid_dim)
+        self.fc3 = nn.Linear(mid_dim, dim)
+
+    def forward(self, x):
+        gate = F.silu(self.fc1(x))
+        value = self.fc2(x)
+        return self.fc3(gate * value)
+
+
+class AttentionBlock(nn.Module):
+    """Transformer block: self-attention and an MLP, each wrapped in a residual connection.
+
+    With ``post_norm=False`` the layer norm is applied to the input of each sub-layer;
+    with ``post_norm=True`` it is applied to the output of each sub-layer.
+    """
+
+    def __init__(
+        self,
+        dim,
+        mlp_ratio,
+        num_heads,
+        post_norm=False,
+        causal=False,
+        activation="quick_gelu",
+        attn_dropout=0.0,
+        proj_dropout=0.0,
+        norm_eps=1e-5,
+    ):
+        assert activation in ["quick_gelu", "gelu", "swi_glu"]
+        super().__init__()
+        self.dim = dim
+        self.mlp_ratio = mlp_ratio
+        self.num_heads = num_heads
+        self.post_norm = post_norm
+        self.causal = causal
+        self.norm_eps = norm_eps
+
+        hidden_dim = int(dim * mlp_ratio)
+
+        # layers
+        self.norm1 = LayerNorm(dim, eps=norm_eps)
+        self.attn = SelfAttention(dim, num_heads, causal, attn_dropout, proj_dropout)
+        self.norm2 = LayerNorm(dim, eps=norm_eps)
+        if activation == "swi_glu":
+            self.mlp = SwiGLU(dim, hidden_dim)
+        else:
+            act = QuickGELU() if activation == "quick_gelu" else nn.GELU()
+            self.mlp = nn.Sequential(
+                nn.Linear(dim, hidden_dim),
+                act,
+                nn.Linear(hidden_dim, dim),
+                nn.Dropout(proj_dropout),
+            )
+
+    def forward(self, x):
+        if self.post_norm:
+            # normalize the output of each sub-layer
+            x = x + self.norm1(self.attn(x))
+            return x + self.norm2(self.mlp(x))
+
+        # normalize the input of each sub-layer
+        x = x + self.attn(self.norm1(x))
+        return x + self.mlp(self.norm2(x))
+
+
+class AttentionPool(nn.Module):
+    """Pool a [B, L, C] sequence into a [B, C] vector with a learned query token.
+
+    A single learned embedding (``cls_embedding``) is projected to the query and attends
+    over the keys/values projected from the input; the result goes through an output
+    projection and a residual MLP.
+    """
+
+    def __init__(self, dim, mlp_ratio, num_heads, activation="gelu", proj_dropout=0.0, norm_eps=1e-5):
+        assert dim % num_heads == 0
+        super().__init__()
+        self.dim = dim
+        self.mlp_ratio = mlp_ratio
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.proj_dropout = proj_dropout
+        self.norm_eps = norm_eps
+
+        # layers
+        gain = 1.0 / math.sqrt(dim)
+        self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
+        self.to_q = nn.Linear(dim, dim)
+        self.to_kv = nn.Linear(dim, dim * 2)
+        self.proj = nn.Linear(dim, dim)
+        self.norm = LayerNorm(dim, eps=norm_eps)
+        self.mlp = nn.Sequential(
+            nn.Linear(dim, int(dim * mlp_ratio)),
+            QuickGELU() if activation == "quick_gelu" else nn.GELU(),
+            nn.Linear(int(dim * mlp_ratio), dim),
+            nn.Dropout(proj_dropout),
+        )
+
+    def forward(self, x):
+        """
+        x:  [B, L, C].
+
+        Returns a [B, C] tensor.
+        """
+        b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
+
+        # compute query, key, value
+        # the single learned query is shared by every batch element
+        q = self.to_q(self.cls_embedding).view(1, 1, n, d).expand(b, -1, -1, -1)
+        k, v = self.to_kv(x).view(b, s, 2, n, d).unbind(2)
+
+        # compute attention
+        # this line is never used because pool_type="token" in Wan2.1
+        # flash_attention() takes the tensors as one list; passing q, k, v positionally
+        # would bind k and v to its q_lens / k_lens parameters.
+        x = flash_attention([q, k, v], version=2)
+        x = x.reshape(b, 1, c)
+
+        # output
+        x = self.proj(x)
+        x = F.dropout(x, self.proj_dropout, self.training)
+
+        # mlp
+        x = x + self.mlp(self.norm(x))
+        return x[:, 0]
+
+
+class VisionTransformer(nn.Module):
+    """ViT image encoder; forward() returns the transformer token sequence (post_norm and head are not applied)."""
+    def __init__(
+        self,
+        image_size=224,
+        patch_size=16,
+        dim=768,
+        mlp_ratio=4,
+        out_dim=512,
+        num_heads=12,
+        num_layers=12,
+        pool_type="token",
+        pre_norm=True,
+        post_norm=False,
+        activation="quick_gelu",
+        attn_dropout=0.0,
+        proj_dropout=0.0,
+        embedding_dropout=0.0,
+        norm_eps=1e-5,
+    ):
+        if image_size % patch_size != 0:
+            print("[WARNING] image_size is not divisible by patch_size", flush=True)
+        assert pool_type in ("token", "token_fc", "attn_pool")
+        out_dim = out_dim or dim  # a falsy out_dim (None/0) falls back to dim
+        super().__init__()
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_patches = (image_size // patch_size) ** 2  # floor division: a remainder is ignored (see warning above)
+        self.dim = dim
+        self.mlp_ratio = mlp_ratio
+        self.out_dim = out_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.pool_type = pool_type
+        self.post_norm = post_norm  # NOTE(review): replaced by a LayerNorm module further down in __init__
+        self.norm_eps = norm_eps
+
+        # embeddings: patch projection, optional class token, learned positions (random init scaled by 1/sqrt(dim))
+        gain = 1.0 / math.sqrt(dim)
+        self.patch_embedding = nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size, bias=not pre_norm)
+        if pool_type in ("token", "token_fc"):
+            self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
+        self.pos_embedding = nn.Parameter(
+            gain * torch.randn(1, self.num_patches + (1 if pool_type in ("token", "token_fc") else 0), dim)
+        )
+        self.dropout = nn.Dropout(embedding_dropout)
+
+        # transformer: optional pre-norm, num_layers attention blocks, and a post-norm module
+        self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
+        self.transformer = nn.Sequential(
+            *[
+                AttentionBlock(dim, mlp_ratio, num_heads, post_norm, False, activation, attn_dropout, proj_dropout, norm_eps)
+                for _ in range(num_layers)
+            ]
+        )
+        self.post_norm = LayerNorm(dim, eps=norm_eps)  # always created, whatever the post_norm flag; not called in forward()
+
+        # head: built according to pool_type; not called in forward() below
+        if pool_type == "token":
+            self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
+        elif pool_type == "token_fc":
+            self.head = nn.Linear(dim, out_dim)
+        elif pool_type == "attn_pool":
+            self.head = AttentionPool(dim, mlp_ratio, num_heads, activation, proj_dropout, norm_eps)
+
+    def forward(self, x, interpolation=False, use_31_block=False):
+        b = x.size(0)
+
+        # embeddings: patchify to [B, N, dim], prepend the class token (if any), add positions
+        x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
+        if self.pool_type in ("token", "token_fc"):
+            x = torch.cat([self.cls_embedding.expand(b, -1, -1), x], dim=1)
+        if interpolation:
+            e = pos_interpolate(self.pos_embedding, x.size(1))
+        else:
+            e = self.pos_embedding
+        x = self.dropout(x + e)
+        if self.pre_norm is not None:
+            x = self.pre_norm(x)
+
+        # transformer: use_31_block skips the final block (self.transformer[:-1])
+        if use_31_block:
+            x = self.transformer[:-1](x)
+            return x
+        else:
+            x = self.transformer(x)
+            return x
+
+
+class XLMRobertaWithHead(XLMRoberta):
+    """XLM-RoBERTa encoder followed by masked mean pooling and a two-layer projection head."""
+    def __init__(self, **kwargs):
+        # "out_dim" is consumed here and removed before the base class receives kwargs
+        self.out_dim = kwargs.pop("out_dim")
+        super().__init__(**kwargs)
+
+        hidden = (self.dim + self.out_dim) // 2
+        layers = [nn.Linear(self.dim, hidden, bias=False), nn.GELU(), nn.Linear(hidden, self.out_dim, bias=False)]
+        self.head = nn.Sequential(*layers)
+
+    def forward(self, ids):
+        """Encode token ids and return one head-projected vector per sequence."""
+        tokens = super().forward(ids)
+
+        # mean over the positions whose id differs from pad_id
+        keep = ids.ne(self.pad_id).unsqueeze(-1).to(tokens)
+        summed = (tokens * keep).sum(dim=1)
+        pooled = summed / keep.sum(dim=1)
+
+        return self.head(pooled)
+
+
+class XLMRobertaCLIP(nn.Module):
+    """CLIP-style pair of a VisionTransformer image tower and an XLMRobertaWithHead text tower."""
+    def __init__(
+        self,
+        embed_dim=1024,
+        image_size=224,
+        patch_size=14,
+        vision_dim=1280,
+        vision_mlp_ratio=4,
+        vision_heads=16,
+        vision_layers=32,
+        vision_pool="token",
+        vision_pre_norm=True,
+        vision_post_norm=False,
+        activation="gelu",
+        vocab_size=250002,
+        max_text_len=514,
+        type_size=1,
+        pad_id=1,
+        text_dim=1024,
+        text_heads=16,
+        text_layers=24,
+        text_post_norm=True,
+        text_dropout=0.1,
+        attn_dropout=0.0,
+        proj_dropout=0.0,
+        embedding_dropout=0.0,
+        norm_eps=1e-5,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.vision_dim = vision_dim
+        self.vision_mlp_ratio = vision_mlp_ratio
+        self.vision_heads = vision_heads
+        self.vision_layers = vision_layers
+        self.vision_pre_norm = vision_pre_norm
+        self.vision_post_norm = vision_post_norm
+        self.activation = activation
+        self.vocab_size = vocab_size
+        self.max_text_len = max_text_len
+        self.type_size = type_size
+        self.pad_id = pad_id
+        self.text_dim = text_dim
+        self.text_heads = text_heads
+        self.text_layers = text_layers
+        self.text_post_norm = text_post_norm
+        self.norm_eps = norm_eps
+
+        # models: both towers are built with out_dim=embed_dim
+        self.visual = VisionTransformer(
+            image_size=image_size,
+            patch_size=patch_size,
+            dim=vision_dim,
+            mlp_ratio=vision_mlp_ratio,
+            out_dim=embed_dim,
+            num_heads=vision_heads,
+            num_layers=vision_layers,
+            pool_type=vision_pool,
+            pre_norm=vision_pre_norm,
+            post_norm=vision_post_norm,
+            activation=activation,
+            attn_dropout=attn_dropout,
+            proj_dropout=proj_dropout,
+            embedding_dropout=embedding_dropout,
+            norm_eps=norm_eps,
+        )
+        self.textual = XLMRobertaWithHead(
+            vocab_size=vocab_size,
+            max_seq_len=max_text_len,
+            type_size=type_size,
+            pad_id=pad_id,
+            dim=text_dim,
+            out_dim=embed_dim,
+            num_heads=text_heads,
+            num_layers=text_layers,
+            post_norm=text_post_norm,
+            dropout=text_dropout,
+        )
+        self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))  # scalar parameter initialised to log(1 / 0.07)
+
+    def forward(self, imgs, txt_ids):
+        """
+        imgs:       [B, 3, H, W] of torch.float32, normalized with
+                    mean [0.48145466, 0.4578275, 0.40821073] and
+                    std  [0.26862954, 0.26130258, 0.27577711].
+        txt_ids:    [B, L] of torch.long. Encoded by data.CLIPTokenizer.
+        Returns the pair (self.visual(imgs), self.textual(txt_ids)).
+        """
+        xi = self.visual(imgs)
+        xt = self.textual(txt_ids)
+        return xi, xt
+
+    def param_groups(self):
+        groups = [  # group 1: names containing "norm" or ending in "bias" (weight_decay 0.0); group 2: all other parameters
+            {"params": [p for n, p in self.named_parameters() if "norm" in n or n.endswith("bias")], "weight_decay": 0.0},
+            {"params": [p for n, p in self.named_parameters() if not ("norm" in n or n.endswith("bias"))]},
+        ]
+        return groups
+
+
+def _clip(
+    pretrained=False,
+    pretrained_name=None,
+    model_cls=XLMRobertaCLIP,
+    return_transforms=False,
+    return_tokenizer=False,
+    tokenizer_padding="eos",
+    dtype=torch.float32,
+    device="cpu",
+    **kwargs,
+):
+    """Build ``model_cls(**kwargs)`` and, optionally, its image preprocessing pipeline.
+
+    ``pretrained``, ``return_tokenizer``, ``tokenizer_padding``, ``dtype`` and ``device``
+    are accepted but not used here; the model is not moved or cast by this function.
+
+    Returns the model alone, or ``(model, transforms)`` when ``return_transforms`` is set.
+    """
+    model = model_cls(**kwargs)
+    if not return_transforms:
+        return model
+
+    # pretrained_name defaults to None, so guard it before looking for the "siglip" marker
+    if "siglip" in (pretrained_name or "").lower():
+        mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
+    else:
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+
+    # resize to the model's square input, convert to a tensor, then normalize per channel
+    transforms = T.Compose(
+        [
+            T.Resize((model.image_size, model.image_size), interpolation=T.InterpolationMode.BICUBIC),
+            T.ToTensor(),
+            T.Normalize(mean=mean, std=std),
+        ]
+    )
+    return model, transforms
+
+
+def clip_xlm_roberta_vit_h_14(pretrained=False, pretrained_name="open-clip-xlm-roberta-large-vit-huge-14", **kwargs):
+    """Build an XLMRobertaCLIP through ``_clip``; ``kwargs`` override the default configuration below."""
+    defaults = {
+        "embed_dim": 1024,
+        "image_size": 224,
+        "patch_size": 14,
+        "vision_dim": 1280,
+        "vision_mlp_ratio": 4,
+        "vision_heads": 16,
+        "vision_layers": 32,
+        "vision_pool": "token",
+        "activation": "gelu",
+        "vocab_size": 250002,
+        "max_text_len": 514,
+        "type_size": 1,
+        "pad_id": 1,
+        "text_dim": 1024,
+        "text_heads": 16,
+        "text_layers": 24,
+        "text_post_norm": True,
+        "text_dropout": 0.1,
+        "attn_dropout": 0.0,
+        "proj_dropout": 0.0,
+        "embedding_dropout": 0.0,
+    }
+    return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **{**defaults, **kwargs})
+
+
+class CLIPModel:
+    """Inference wrapper: builds the CLIP model, loads its weights from weight_path, and exposes visual()."""
+    def __init__(self, dtype, device, checkpoint_path=None, tokenizer_path=None, weight_path=None):
+        self.dtype = dtype
+        self.device = device
+        self.checkpoint_path = checkpoint_path  # stored only; not read anywhere else in this class
+        self.tokenizer_path = tokenizer_path
+        self.weight_path = weight_path
+
+        # init model under init_empty_weights(); real tensors are assigned by load_state_dict(assign=True) below
+        with init_empty_weights():
+            self.model, self.transforms = clip_xlm_roberta_vit_h_14(
+                pretrained=False, return_transforms=True, return_tokenizer=False, dtype=dtype, device=device
+            )
+        self.model = self.model.eval().requires_grad_(False)
+
+        logging.info(f"loading {weight_path}")
+        if os.path.splitext(weight_path)[-1] == ".safetensors":  # NOTE(review): weight_path defaults to None, which splitext rejects
+            sd = load_safetensors(weight_path, device=device, disable_mmap=True, dtype=dtype)
+        else:
+            sd = torch.load(weight_path, map_location=device, weights_only=True)
+        info = self.model.load_state_dict(sd, strict=True, assign=True)
+        self.model = self.model.to(dtype=dtype, device=device)
+        logging.info(f"weights loaded from {weight_path}: {info}")
+
+        # init tokenizer: without an explicit path, use the "xlm-roberta-large" subfolder of the Wan2.1 I2V repo id
+        if tokenizer_path is None:
+            tokenizer_path = "Wan-AI/Wan2.1-I2V-14B-720P"
+            subfolder = "xlm-roberta-large"
+        else:
+            subfolder = None
+
+        self.tokenizer = HuggingfaceTokenizer(
+            name=tokenizer_path, seq_len=self.model.max_text_len - 2, clean="whitespace", subfolder=subfolder
+        )
+
+    def visual(self, videos):
+        # preprocess: resize each item to the model input size (assumes each u is [C, F, H, W] — TODO confirm),
+        size = (self.model.image_size,) * 2
+        videos = torch.cat([F.interpolate(u.transpose(0, 1), size=size, mode="bicubic", align_corners=False) for u in videos])
+        videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))  # x * 0.5 + 0.5, then the last transform (T.Normalize)
+
+        # forward: use_31_block=True returns the tokens produced by all but the last transformer block
+        # with torch.cuda.amp.autocast(dtype=self.dtype):
+        out = self.model.visual(videos, use_31_block=True)
+        return out
diff --git a/wan/modules/model.py b/wan/modules/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e727980e0579481f2dc26028f153fb64e108054a
--- /dev/null
+++ b/wan/modules/model.py
@@ -0,0 +1,930 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import math
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from accelerate import init_empty_weights
+
+import logging
+
+from utils.safetensors_utils import MemoryEfficientSafeOpen, load_safetensors
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+from utils.device_utils import clean_memory_on_device
+
+from .attention import flash_attention
+from utils.device_utils import clean_memory_on_device
+from modules.custom_offloading_utils import ModelOffloader
+from modules.fp8_optimization_utils import apply_fp8_monkey_patch, optimize_state_dict_with_fp8
+
+__all__ = ["WanModel"]
+
+
+def sinusoidal_embedding_1d(dim, position):
+    """Return float64 ``[cos | sin]`` sinusoidal features of width ``dim`` for a 1-D ``position`` tensor."""
+    assert dim % 2 == 0
+    half = dim // 2
+    pos64 = position.type(torch.float64)
+
+    # inverse frequencies 10000 ** (-k / half) for k in [0, half)
+    exponents = -torch.arange(half).to(pos64).div(half)
+    angles = torch.outer(pos64, torch.pow(10000, exponents))
+    return torch.cat([torch.cos(angles), torch.sin(angles)], dim=1)
+
+
+# @amp.autocast(enabled=False)
+# no autocast is needed for rope_apply, because it is already in float64
+def rope_params(max_seq_len, dim, theta=10000):
+    """Return complex rotary factors exp(i * pos * theta**(-2k/dim)) with shape [max_seq_len, dim // 2]."""
+    assert dim % 2 == 0
+    angles = torch.outer(torch.arange(max_seq_len), 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float64).div(dim)))
+    return torch.polar(torch.ones_like(angles), angles)
+
+
+# @amp.autocast(enabled=False)
+def rope_apply(x, grid_sizes, freqs):
+    """Rotate the first f*h*w tokens of every sample with 3-axis RoPE; returns a new float32 tensor.
+
+    ``x`` is indexed as [batch, length, heads, head_dim]; ``grid_sizes`` holds one (f, h, w) per sample;
+    the columns of ``freqs`` are split into a frame part, a height part and a width part.
+    """
+    with torch.amp.autocast(device_type=x.device.type, enabled=False):
+        heads, half = x.size(2), x.size(3) // 2
+        third = half // 3
+        freqs_f, freqs_h, freqs_w = freqs.split([half - 2 * third, third, third], dim=1)
+
+        rotated = []
+        for idx, (f, h, w) in enumerate(grid_sizes.tolist()):
+            seq_len = f * h * w
+            sample = x[idx]
+
+            # view adjacent channel pairs of the covered tokens as complex numbers (float64)
+            pairs = torch.view_as_complex(sample[:seq_len].to(torch.float64).reshape(seq_len, heads, -1, 2))
+
+            # per-token multipliers: each axis table is broadcast over the other two axes
+            parts = [
+                freqs_f[:f].view(f, 1, 1, -1).expand(f, h, w, -1),
+                freqs_h[:h].view(1, h, 1, -1).expand(f, h, w, -1),
+                freqs_w[:w].view(1, 1, w, -1).expand(f, h, w, -1),
+            ]
+            multipliers = torch.cat(parts, dim=-1).reshape(seq_len, 1, -1)
+
+            # rotate, flatten back to real channels, and re-attach the untouched tail tokens
+            turned = torch.view_as_real(pairs * multipliers).flatten(2)
+            rotated.append(torch.cat([turned, sample[seq_len:]]))
+        return torch.stack(rotated).float()
+
+
+def calculate_freqs_i(fhw, c, freqs):
+    """Build the [f*h*w, 1, c] RoPE multipliers for one (f, h, w) grid from the ``freqs`` table."""
+    f, h, w = fhw
+    third = c // 3
+    freqs_f, freqs_h, freqs_w = freqs.split([c - 2 * third, third, third], dim=1)
+    # each axis table is broadcast over the other two axes, then all three are joined channel-wise
+    per_axis = (
+        freqs_f[:f].view(f, 1, 1, -1).expand(f, h, w, -1),
+        freqs_h[:h].view(1, h, 1, -1).expand(f, h, w, -1),
+        freqs_w[:w].view(1, 1, w, -1).expand(f, h, w, -1),
+    )
+    return torch.cat(per_axis, dim=-1).reshape(f * h * w, 1, -1)
+
+
+# inplace version of rope_apply
+def rope_apply_inplace_cached(x, grid_sizes, freqs_list):
+    """Apply precomputed RoPE multipliers to ``x`` in place and return ``x``.
+
+    For sample ``i`` only the first ``f * h * w`` tokens (from ``grid_sizes[i]``) are rotated by
+    ``freqs_list[i]``; any remaining tokens are left untouched.  ``x`` is indexed as
+    [batch, length, heads, head_dim] and keeps its dtype.
+    """
+    rope_dtype = torch.float64  # float32 does not reduce memory usage significantly
+    num_heads = x.size(2)
+
+    for i, (f, h, w) in enumerate(grid_sizes.tolist()):
+        seq_len = f * h * w
+        multipliers = freqs_list[i]
+
+        # view adjacent channel pairs as complex numbers so the rotation is a single multiply
+        pairs = torch.view_as_complex(x[i, :seq_len].to(rope_dtype).reshape(seq_len, num_heads, -1, 2))
+        rotated = torch.view_as_real(pairs * multipliers).flatten(2)
+
+        # write back into the caller's tensor, casting to its original dtype
+        x[i, :seq_len] = rotated.to(x.dtype)
+
+    return x
+
+
+class WanRMSNorm(nn.Module):
+    """Root-mean-square normalization over the last dimension.
+
+    The statistics are computed in float32; the result is cast back to the input dtype and
+    multiplied by a learned per-channel ``weight``.
+    """
+
+    def __init__(self, dim, eps=1e-5):
+        super().__init__()
+        self.dim = dim
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def forward(self, x):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L, C]
+
+        Returns:
+            Tensor with the shape and dtype of ``x``.
+        """
+        # normalize in float32, return to the input dtype, then scale
+        normed = self._norm(x.float()).type_as(x)
+        # the weight is cast to x.dtype as well (fp8 support)
+        scale = self.weight.to(x.dtype)
+        return normed * scale
+
+    def _norm(self, x):
+        """Multiply ``x`` by rsqrt(mean(x**2) + eps), the mean taken over the last dimension."""
+        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
+        inv_rms = torch.rsqrt(mean_square + self.eps)
+        return x * inv_rms
+
+    # NOTE: a fully in-place variant of forward (also fp8-compatible) was tried and showed no
+    # significant performance improvement, so the out-of-place version above is kept.
+
+
+class WanLayerNorm(nn.LayerNorm):
+    """LayerNorm that computes in float32 and returns the input dtype; affine parameters are off by default."""
+
+    def __init__(self, dim, eps=1e-6, elementwise_affine=False):
+        super().__init__(dim, eps=eps, elementwise_affine=elementwise_affine)
+
+    def forward(self, x):
+        r"""Normalize ``x`` (Shape [B, L, C]) over its last dimension."""
+        x32 = x.float()
+        normalized = super().forward(x32)
+        return normalized.type_as(x)
+
+
+class WanSelfAttention(nn.Module):
+    """Multi-head self-attention with optional RMS normalization of q/k and rotary position embedding."""
+    def __init__(self, dim, num_heads, window_size=(-1, -1), qk_norm=True, eps=1e-6, attn_mode="torch", split_attn=False):
+        assert dim % num_heads == 0
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.window_size = window_size
+        self.qk_norm = qk_norm
+        self.eps = eps
+        self.attn_mode = attn_mode
+        self.split_attn = split_attn
+
+        # layers: q/k/v/output projections; q and k get an RMS norm unless qk_norm is False
+        self.q = nn.Linear(dim, dim)
+        self.k = nn.Linear(dim, dim)
+        self.v = nn.Linear(dim, dim)
+        self.o = nn.Linear(dim, dim)
+        self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
+        self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
+
+    def forward(self, x, seq_lens, grid_sizes, freqs):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L, C]
+            seq_lens(Tensor): Shape [B]
+            grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
+            freqs: per-sample rope multipliers, indexed as freqs[i] by rope_apply_inplace_cached
+        """
+        b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
+
+        # earlier helper-based version, kept for reference:
+        # def qkv_fn(x):
+        #     q = self.norm_q(self.q(x)).view(b, s, n, d)
+        #     k = self.norm_k(self.k(x)).view(b, s, n, d)
+        #     v = self.v(x).view(b, s, n, d)
+        #     return q, k, v
+        # q, k, v = qkv_fn(x)
+        # del x
+        # query, key, value: project, drop the local reference to x, normalize, then split into heads
+
+        q = self.q(x)
+        k = self.k(x)
+        v = self.v(x)
+        del x
+        q = self.norm_q(q)
+        k = self.norm_k(k)
+        q = q.view(b, s, n, d)
+        k = k.view(b, s, n, d)
+        v = v.view(b, s, n, d)
+
+        rope_apply_inplace_cached(q, grid_sizes, freqs)  # modifies q in place
+        rope_apply_inplace_cached(k, grid_sizes, freqs)  # modifies k in place
+        qkv = [q, k, v]  # handed over as a list with the local names deleted — presumably so the callee can free them; TODO confirm
+        del q, k, v
+        x = flash_attention(
+            qkv, k_lens=seq_lens, window_size=self.window_size, attn_mode=self.attn_mode, split_attn=self.split_attn
+        )
+
+        # output: merge the head dimensions back together and apply the output projection
+        x = x.flatten(2)
+        x = self.o(x)
+        return x
+
+
+class WanT2VCrossAttention(WanSelfAttention):
+    """Cross-attention: queries come from x, keys and values from context (no rotary embedding is applied)."""
+    def forward(self, x, context, context_lens):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L1, C]
+            context(Tensor): Shape [B, L2, C]
+            context_lens(Tensor): Shape [B]
+        """
+        b, n, d = x.size(0), self.num_heads, self.head_dim
+
+        # compute query, key, value (the one-line forms below are the earlier version, kept for reference)
+        # q = self.norm_q(self.q(x)).view(b, -1, n, d)
+        # k = self.norm_k(self.k(context)).view(b, -1, n, d)
+        # v = self.v(context).view(b, -1, n, d)
+        q = self.q(x)
+        del x
+        k = self.k(context)
+        v = self.v(context)
+        del context
+        q = self.norm_q(q)
+        k = self.norm_k(k)
+        q = q.view(b, -1, n, d)
+        k = k.view(b, -1, n, d)
+        v = v.view(b, -1, n, d)
+
+        # compute attention; context_lens is passed as the key lengths (k_lens)
+        qkv = [q, k, v]
+        del q, k, v
+        x = flash_attention(qkv, k_lens=context_lens, attn_mode=self.attn_mode, split_attn=self.split_attn)
+
+        # output: merge the head dimensions back together and apply the output projection
+        x = x.flatten(2)
+        x = self.o(x)
+        return x
+
+
+class WanI2VCrossAttention(WanSelfAttention):
+    """Cross-attention over two context parts (first 257 tokens vs. the rest), using separate k/v projections for the first part."""
+    def __init__(self, dim, num_heads, window_size=(-1, -1), qk_norm=True, eps=1e-6, attn_mode="torch", split_attn=False):
+        super().__init__(dim, num_heads, window_size, qk_norm, eps, attn_mode, split_attn)
+
+        self.k_img = nn.Linear(dim, dim)
+        self.v_img = nn.Linear(dim, dim)
+        # self.alpha = nn.Parameter(torch.zeros((1, )))
+        self.norm_k_img = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
+
+    def forward(self, x, context, context_lens):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L1, C]
+            context(Tensor): Shape [B, L2, C]
+            context_lens(Tensor): Shape [B]
+        """
+        context_img = context[:, :257]  # the first 257 context tokens form the image part
+        context = context[:, 257:]  # NOTE(review): context_lens is used as-is for this remainder — confirm it excludes the 257 tokens
+        b, n, d = x.size(0), self.num_heads, self.head_dim
+
+        # compute query, key, value for the non-image part of the context
+        q = self.q(x)
+        del x
+        q = self.norm_q(q)
+        q = q.view(b, -1, n, d)
+        k = self.k(context)
+        k = self.norm_k(k).view(b, -1, n, d)
+        v = self.v(context).view(b, -1, n, d)
+        del context
+
+        # compute attention
+        qkv = [q, k, v]
+        del k, v  # q is not deleted here: it is reused for the image branch below
+        x = flash_attention(qkv, k_lens=context_lens, attn_mode=self.attn_mode, split_attn=self.split_attn)
+
+        # image branch: keys/values from context_img through the dedicated projections, same queries
+        k_img = self.norm_k_img(self.k_img(context_img)).view(b, -1, n, d)
+        v_img = self.v_img(context_img).view(b, -1, n, d)
+        del context_img
+
+        # compute attention over the image part (k_lens=None)
+        qkv = [q, k_img, v_img]
+        del q, k_img, v_img
+        img_x = flash_attention(qkv, k_lens=None, attn_mode=self.attn_mode, split_attn=self.split_attn)
+
+        # output: sum of both attention results, then the output projection
+        x = x.flatten(2)
+        img_x = img_x.flatten(2)
+        if self.training:
+            x = x + img_x  # avoid inplace
+        else:
+            x += img_x
+        del img_x
+
+        x = self.o(x)
+        return x
+
+
+WAN_CROSSATTENTION_CLASSES = {  # cross-attention class per cross_attn_type key (looked up in WanAttentionBlock.__init__)
+    "t2v_cross_attn": WanT2VCrossAttention,
+    "i2v_cross_attn": WanI2VCrossAttention,
+}
+
+
+class WanAttentionBlock(nn.Module):
+    """Transformer block: modulated self-attention, cross-attention and feed-forward, each added residually."""
+    def __init__(
+        self,
+        cross_attn_type,
+        dim,
+        ffn_dim,
+        num_heads,
+        window_size=(-1, -1),
+        qk_norm=True,
+        cross_attn_norm=False,
+        eps=1e-6,
+        attn_mode="torch",
+        split_attn=False,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.ffn_dim = ffn_dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.qk_norm = qk_norm
+        self.cross_attn_norm = cross_attn_norm
+        self.eps = eps
+
+        # layers: norm3 is an affine LayerNorm only when cross_attn_norm is set; cross-attention always uses window (-1, -1)
+        self.norm1 = WanLayerNorm(dim, eps)
+        self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm, eps, attn_mode, split_attn)
+        self.norm3 = WanLayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
+        self.cross_attn = WAN_CROSSATTENTION_CLASSES[cross_attn_type](dim, num_heads, (-1, -1), qk_norm, eps, attn_mode, split_attn)
+        self.norm2 = WanLayerNorm(dim, eps)
+        self.ffn = nn.Sequential(nn.Linear(dim, ffn_dim), nn.GELU(approximate="tanh"), nn.Linear(ffn_dim, dim))
+
+        # modulation: learned [1, 6, dim] table that is added to the embedding e in _forward
+        self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
+
+        self.gradient_checkpointing = False
+
+    def enable_gradient_checkpointing(self):
+        self.gradient_checkpointing = True
+
+    def disable_gradient_checkpointing(self):
+        self.gradient_checkpointing = False
+
+    def _forward(self, x, e, seq_lens, grid_sizes, freqs, context, context_lens):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L, C]
+            e(Tensor): Shape [B, 6, C], float32 (asserted below)
+            seq_lens(Tensor): Shape [B], length of each sequence in batch
+            grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
+            freqs, context, context_lens: forwarded unchanged to self_attn / cross_attn
+        """
+        assert e.dtype == torch.float32
+        # with amp.autocast(dtype=torch.float32):
+        #     e = (self.modulation + e).chunk(6, dim=1)
+        # support fp8
+        e = self.modulation.to(torch.float32) + e
+        e = e.chunk(6, dim=1)
+        assert e[0].dtype == torch.float32
+
+        # self-attention: e[0] is added as shift, e[1] as scale, e[2] multiplies the attention output
+        y = self.self_attn(self.norm1(x).float() * (1 + e[1]) + e[0], seq_lens, grid_sizes, freqs)
+        # with amp.autocast(dtype=torch.float32):
+        #     x = x + y * e[2]
+        x = x + y.to(torch.float32) * e[2]
+        del y
+
+        # cross-attention & ffn (the closure-based version below is the earlier form, kept for reference)
+        # def cross_attn_ffn(x, context, context_lens, e):
+        #     x += self.cross_attn(self.norm3(x), context, context_lens)
+        #     y = self.ffn(self.norm2(x).float() * (1 + e[4]) + e[3])
+        #     # with amp.autocast(dtype=torch.float32):
+        #     #     x = x + y * e[5]
+        #     x += y.to(torch.float32) * e[5]
+        #     return x
+        # x = cross_attn_ffn(x, context, context_lens, e)
+
+        # x += self.cross_attn(self.norm3(x), context, context_lens) # backward error
+        x = x + self.cross_attn(self.norm3(x), context, context_lens)
+        del context
+        y = self.ffn(self.norm2(x).float() * (1 + e[4]) + e[3])
+        x = x + y.to(torch.float32) * e[5]  # e[3] shift and e[4] scale were applied above; e[5] multiplies the ffn output
+        del y
+        return x
+
+    def forward(self, x, e, seq_lens, grid_sizes, freqs, context, context_lens):
+        if self.training and self.gradient_checkpointing:  # checkpointing is only used while training
+            return checkpoint(self._forward, x, e, seq_lens, grid_sizes, freqs, context, context_lens, use_reentrant=False)
+        return self._forward(x, e, seq_lens, grid_sizes, freqs, context, context_lens)
+
+
+class Head(nn.Module):
+    """Output head: modulated layer norm followed by a linear projection to patch values."""
+
+    def __init__(self, dim, out_dim, patch_size, eps=1e-6):
+        super().__init__()
+        self.dim, self.out_dim = dim, out_dim
+        self.patch_size, self.eps = patch_size, eps
+
+        # every token is projected to prod(patch_size) * out_dim values
+        projected_dim = math.prod(patch_size) * out_dim
+        self.norm = WanLayerNorm(dim, eps)
+        self.head = nn.Linear(dim, projected_dim)
+
+        # learned [1, 2, dim] table that is added to the embedding e in forward()
+        self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
+
+    def forward(self, x, e):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L1, C]
+            e(Tensor): Shape [B, C], must be float32
+
+        Returns:
+            Tensor: Shape [B, L1, prod(patch_size) * out_dim]
+        """
+        assert e.dtype == torch.float32
+        # the modulation is cast to float32 explicitly (fp8 support)
+        modulated = self.modulation.to(torch.float32) + e.unsqueeze(1)
+        shift, scale = modulated.chunk(2, dim=1)
+        hidden = self.norm(x) * (1 + scale) + shift
+        return self.head(hidden)
+
+
+class MLPProj(torch.nn.Module):
+    """Map embeddings of width ``in_dim`` to ``out_dim``: LayerNorm, Linear, GELU, Linear, LayerNorm."""
+
+    def __init__(self, in_dim, out_dim):
+        super().__init__()
+        stages = [
+            torch.nn.LayerNorm(in_dim),
+            torch.nn.Linear(in_dim, in_dim),
+            torch.nn.GELU(),
+            torch.nn.Linear(in_dim, out_dim),
+            torch.nn.LayerNorm(out_dim),
+        ]
+        self.proj = torch.nn.Sequential(*stages)
+
+    def forward(self, image_embeds):
+        return self.proj(image_embeds)
+
+
+class WanModel(nn.Module):  # ModelMixin, ConfigMixin):
+    r"""
+    Wan diffusion backbone supporting both text-to-video and image-to-video.
+    """
+
+    ignore_for_config = ["patch_size", "cross_attn_norm", "qk_norm", "text_dim", "window_size"]
+    _no_split_modules = ["WanAttentionBlock"]
+
+    # @register_to_config
+    def __init__(
+        self,
+        model_type="t2v",
+        patch_size=(1, 2, 2),
+        text_len=512,
+        in_dim=16,
+        dim=2048,
+        ffn_dim=8192,
+        freq_dim=256,
+        text_dim=4096,
+        out_dim=16,
+        num_heads=16,
+        num_layers=32,
+        window_size=(-1, -1),
+        qk_norm=True,
+        cross_attn_norm=True,
+        eps=1e-6,
+        attn_mode=None,
+        split_attn=False,
+    ):
+        r"""
+        Initialize the diffusion model backbone.
+
+        Args:
+            model_type (`str`, *optional*, defaults to 't2v'):
+                Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
+            patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
+                3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
+            text_len (`int`, *optional*, defaults to 512):
+                Fixed length for text embeddings
+            in_dim (`int`, *optional*, defaults to 16):
+                Input video channels (C_in)
+            dim (`int`, *optional*, defaults to 2048):
+                Hidden dimension of the transformer
+            ffn_dim (`int`, *optional*, defaults to 8192):
+                Intermediate dimension in feed-forward network
+            freq_dim (`int`, *optional*, defaults to 256):
+                Dimension for sinusoidal time embeddings
+            text_dim (`int`, *optional*, defaults to 4096):
+                Input dimension for text embeddings
+            out_dim (`int`, *optional*, defaults to 16):
+                Output video channels (C_out)
+            num_heads (`int`, *optional*, defaults to 16):
+                Number of attention heads
+            num_layers (`int`, *optional*, defaults to 32):
+                Number of transformer blocks
+            window_size (`tuple`, *optional*, defaults to (-1, -1)):
+                Window size for local attention (-1 indicates global attention)
+            qk_norm (`bool`, *optional*, defaults to True):
+                Enable query/key normalization
+            cross_attn_norm (`bool`, *optional*, defaults to True):
+                Enable cross-attention normalization
+            eps (`float`, *optional*, defaults to 1e-6):
+                Epsilon value for normalization layers
+            attn_mode (`str`, *optional*, defaults to None):
+                Attention backend name passed to every block; None is treated as "torch"
+            split_attn (`bool`, *optional*, defaults to False):
+                Flag forwarded to every attention layer
+        """
+
+        super().__init__()
+
+        assert model_type in ["t2v", "i2v"]
+        self.model_type = model_type
+
+        self.patch_size = patch_size
+        self.text_len = text_len
+        self.in_dim = in_dim
+        self.dim = dim
+        self.ffn_dim = ffn_dim
+        self.freq_dim = freq_dim
+        self.text_dim = text_dim
+        self.out_dim = out_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.window_size = window_size
+        self.qk_norm = qk_norm
+        self.cross_attn_norm = cross_attn_norm
+        self.eps = eps
+        self.attn_mode = attn_mode if attn_mode is not None else "torch"
+        self.split_attn = split_attn
+
+        # embeddings
+        self.patch_embedding = nn.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size)
+        self.text_embedding = nn.Sequential(nn.Linear(text_dim, dim), nn.GELU(approximate="tanh"), nn.Linear(dim, dim))
+
+        self.time_embedding = nn.Sequential(nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
+        self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
+
+        # blocks
+        cross_attn_type = "t2v_cross_attn" if model_type == "t2v" else "i2v_cross_attn"
+        # forward the normalized self.attn_mode (not the raw argument) so that attn_mode=None
+        # reaches the blocks as "torch", consistent with the attribute stored above
+        block_args = (cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps, self.attn_mode, split_attn)
+        self.blocks = nn.ModuleList([WanAttentionBlock(*block_args) for _ in range(num_layers)])
+
+        # head
+        self.head = Head(dim, out_dim, patch_size, eps)
+
+        # buffers (don't use register_buffer otherwise dtype will be changed in to())
+        assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
+        d = dim // num_heads
+        self.freqs = torch.cat(
+            [rope_params(1024, d - 4 * (d // 6)), rope_params(1024, 2 * (d // 6)), rope_params(1024, 2 * (d // 6))], dim=1
+        )
+        self.freqs_fhw = {}
+
+        if model_type == "i2v":
+            self.img_emb = MLPProj(1280, dim)
+
+        # initialize weights
+        self.init_weights()
+
+        self.gradient_checkpointing = False
+
+        # offloading
+        self.blocks_to_swap = None
+        self.offloader = None
+
+    @property
+    def dtype(self) -> torch.dtype:  # dtype of the first parameter yielded by self.parameters()
+        return next(self.parameters()).dtype
+
+    @property
+    def device(self) -> torch.device:  # device of the first parameter yielded by self.parameters()
+        return next(self.parameters()).device
+
+    def fp8_optimization(
+        self, state_dict: dict[str, torch.Tensor], device: torch.device, move_to_device: bool, use_scaled_mm: bool = False
+    ) -> dict[str, torch.Tensor]:
+        """
+        Optimize the model state_dict with fp8 and monkey patch this model accordingly.
+
+        Args:
+            state_dict: The state_dict of the model.
+            device: The device to calculate the weight.
+            move_to_device: Whether to move the weight to the device after optimization.
+            use_scaled_mm: Forwarded to apply_fp8_monkey_patch.
+        Returns:
+            The state_dict returned by optimize_state_dict_with_fp8.
+        """
+        TARGET_KEYS = ["blocks"]
+        EXCLUDE_KEYS = [
+            "norm",
+            "patch_embedding",
+            "text_embedding",
+            "time_embedding",
+            "time_projection",
+            "head",
+            "modulation",
+            "img_emb",
+        ]
+
+        # inplace optimization
+        state_dict = optimize_state_dict_with_fp8(state_dict, device, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=move_to_device)
+
+        # apply monkey patching
+        apply_fp8_monkey_patch(self, state_dict, use_scaled_mm=use_scaled_mm)
+
+        return state_dict
+
+    def enable_gradient_checkpointing(self):
+        self.gradient_checkpointing = True
+
+        for block in self.blocks:
+            block.enable_gradient_checkpointing()
+
+        print(f"WanModel: Gradient checkpointing enabled.")
+
+    def disable_gradient_checkpointing(self):
+        self.gradient_checkpointing = False
+
+        for block in self.blocks:
+            block.disable_gradient_checkpointing()
+
+        print(f"WanModel: Gradient checkpointing disabled.")
+
+    def enable_block_swap(self, blocks_to_swap: int, device: torch.device, supports_backward: bool):
+        self.blocks_to_swap = blocks_to_swap
+        self.num_blocks = len(self.blocks)
+
+        assert (
+            self.blocks_to_swap <= self.num_blocks - 1
+        ), f"Cannot swap more than {self.num_blocks - 1} blocks. Requested {self.blocks_to_swap} blocks to swap."
+
+        self.offloader = ModelOffloader(
+            "wan_attn_block", self.blocks, self.num_blocks, self.blocks_to_swap, supports_backward, device  # , debug=True
+        )
+        print(
+            f"WanModel: Block swap enabled. Swapping {self.blocks_to_swap} blocks out of {self.num_blocks} blocks. Supports backward: {supports_backward}"
+        )
+
+    def switch_block_swap_for_inference(self):
+        if self.blocks_to_swap:
+            self.offloader.set_forward_only(True)
+            self.prepare_block_swap_before_forward()
+            print(f"WanModel: Block swap set to forward only.")
+
+    def switch_block_swap_for_training(self):
+        if self.blocks_to_swap:
+            self.offloader.set_forward_only(False)
+            self.prepare_block_swap_before_forward()
+            print(f"WanModel: Block swap set to forward and backward.")
+
+    def move_to_device_except_swap_blocks(self, device: torch.device):
+        # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
+        if self.blocks_to_swap:
+            save_blocks = self.blocks
+            self.blocks = None
+
+        self.to(device)
+
+        if self.blocks_to_swap:
+            self.blocks = save_blocks
+
+    def prepare_block_swap_before_forward(self):
+        if self.blocks_to_swap is None or self.blocks_to_swap == 0:
+            return
+        self.offloader.prepare_block_devices_before_forward(self.blocks)
+
+    def forward(self, x, t, context, seq_len, clip_fea=None, y=None):
+        r"""
+        Forward pass through the diffusion model
+
+        Args:
+            x (List[Tensor]):
+                List of input video tensors, each with shape [C_in, F, H, W]
+            t (Tensor):
+                Diffusion timesteps tensor of shape [B]
+            context (List[Tensor]):
+                List of text embeddings each with shape [L, C]. A list is zero-padded
+                to ``self.text_len`` and stacked; any other value is passed to
+                ``self.text_embedding`` as-is.
+            seq_len (`int`):
+                Maximum sequence length for positional encoding. Every sample's token
+                sequence is zero-padded to this length.
+            clip_fea (Tensor, *optional*):
+                CLIP image features for image-to-video mode
+            y (List[Tensor], *optional*):
+                Conditional video inputs for image-to-video mode, same shape as x
+
+        Returns:
+            List[Tensor]:
+                List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8],
+                each cast with ``.float()``.
+
+        Raises:
+            AssertionError:
+                If the model type is "i2v" and `clip_fea` or `y` is missing, or if any
+                sample produces more tokens than `seq_len`.
+        """
+        if self.model_type == "i2v":
+            assert clip_fea is not None and y is not None
+        # params
+        device = self.patch_embedding.weight.device
+        # self.freqs is a plain attribute, so it is moved to the working device here on demand
+        if self.freqs.device != device:
+            self.freqs = self.freqs.to(device)
+
+        # conditioning inputs are concatenated to each sample along dim 0
+        if y is not None:
+            x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
+            y = None
+
+        # embeddings
+        # each sample is embedded separately with a leading batch dimension of 1
+        x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
+        # per-sample grid size, read from the dimensions after the first two of the embedded tensor
+        grid_sizes = torch.stack([torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
+
+        # rotary frequencies are computed once per distinct grid size and cached in self.freqs_fhw
+        freqs_list = []
+        for fhw in grid_sizes:
+            fhw = tuple(fhw.tolist())
+            if fhw not in self.freqs_fhw:
+                c = self.dim // self.num_heads // 2
+                self.freqs_fhw[fhw] = calculate_freqs_i(fhw, c, self.freqs)
+            freqs_list.append(self.freqs_fhw[fhw])
+
+        # flatten the grid dimensions into one token axis, then zero-pad every sample to seq_len
+        x = [u.flatten(2).transpose(1, 2) for u in x]
+        seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
+        assert seq_lens.max() <= seq_len, f"Sequence length exceeds maximum allowed length {seq_len}. Got {seq_lens.max()}"
+        x = torch.cat([torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1) for u in x])
+
+        # time embeddings
+        # with amp.autocast(dtype=torch.float32):
+        with torch.amp.autocast(device_type=device.type, dtype=torch.float32):
+            e = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, t).float())
+            # e0 holds six modulation vectors per sample and is what the blocks receive
+            e0 = self.time_projection(e).unflatten(1, (6, self.dim))
+            assert e.dtype == torch.float32 and e0.dtype == torch.float32
+
+        # context
+        # context_lens is always passed to the blocks as None
+        context_lens = None
+        if type(context) is list:
+            context = torch.stack([torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))]) for u in context])
+        context = self.text_embedding(context)
+
+        if clip_fea is not None:
+            # projected image tokens are placed in front of the text tokens
+            context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
+            context = torch.concat([context_clip, context], dim=1)
+            # drop the local references; presumably so the tensors can be freed early
+            clip_fea = None
+            context_clip = None
+
+        # arguments
+        kwargs = dict(e=e0, seq_lens=seq_lens, grid_sizes=grid_sizes, freqs=freqs_list, context=context, context_lens=context_lens)
+
+        if self.blocks_to_swap:
+            clean_memory_on_device(device)
+
+        # print(f"x: {x.shape}, e: {e0.shape}, context: {context.shape}, seq_lens: {seq_lens}")
+        for block_idx, block in enumerate(self.blocks):
+            # with block swap on, wait for this block before running it and submit the
+            # offloader's next move right after it has run
+            if self.blocks_to_swap:
+                self.offloader.wait_for_block(block_idx)
+
+            x = block(x, **kwargs)
+
+            if self.blocks_to_swap:
+                self.offloader.submit_move_blocks_forward(self.blocks, block_idx)
+
+        # head
+        # the head receives the un-projected time embedding e, not e0
+        x = self.head(x, e)
+
+        # unpatchify
+        x = self.unpatchify(x, grid_sizes)
+        return [u.float() for u in x]
+
+    def unpatchify(self, x, grid_sizes):
+        r"""
+        Reconstruct video tensors from patch embeddings.
+
+        Args:
+            x (List[Tensor]):
+                List of patchified features, each with shape [L, C_out * prod(patch_size)]
+            grid_sizes (Tensor):
+                Original spatial-temporal grid dimensions before patching,
+                    shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
+
+        Returns:
+            List[Tensor]:
+                Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
+        """
+
+        c = self.out_dim
+        out = []
+        for u, v in zip(x, grid_sizes.tolist()):
+            u = u[: math.prod(v)].view(*v, *self.patch_size, c)
+            u = torch.einsum("fhwpqrc->cfphqwr", u)
+            u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
+            out.append(u)
+        return out
+
+    def init_weights(self):
+        r"""
+        Initialize model parameters using Xavier initialization.
+        """
+
+        # basic init
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+        # init embeddings
+        nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
+        for m in self.text_embedding.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, std=0.02)
+        for m in self.time_embedding.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, std=0.02)
+
+        # init output layer
+        nn.init.zeros_(self.head.head.weight)
+
+
+def detect_wan_sd_dtype(path: str) -> torch.dtype:
+    # get dtype from model weights
+    with MemoryEfficientSafeOpen(path) as f:
+        keys = set(f.keys())
+        key1 = "model.diffusion_model.blocks.0.cross_attn.k.weight"  # 1.3B
+        key2 = "blocks.0.cross_attn.k.weight"  # 14B
+        if key1 in keys:
+            dit_dtype = f.get_tensor(key1).dtype
+        elif key2 in keys:
+            dit_dtype = f.get_tensor(key2).dtype
+        else:
+            raise ValueError(f"Could not find the dtype in the model weights: {path}")
+    logger.info(f"Detected DiT dtype: {dit_dtype}")
+    return dit_dtype
+
+
+def load_wan_model(
+    config: any,
+    i2v: bool,
+    device: Union[str, torch.device],
+    dit_path: str,
+    attn_mode: str,
+    split_attn: bool,
+    loading_device: Union[str, torch.device],
+    dit_weight_dtype: Optional[torch.dtype],
+    fp8_scaled: bool = False,
+) -> WanModel:
+    # dit_weight_dtype is None for fp8_scaled
+    assert (not fp8_scaled and dit_weight_dtype is not None) or (fp8_scaled and dit_weight_dtype is None)
+
+    device = torch.device(device)
+    loading_device = torch.device(loading_device)
+
+    with init_empty_weights():
+        logger.info(f"Creating WanModel")
+        model = WanModel(
+            model_type="i2v" if i2v else "t2v",
+            dim=config.dim,
+            eps=config.eps,
+            ffn_dim=config.ffn_dim,
+            freq_dim=config.freq_dim,
+            in_dim=36 if i2v else 16,  # 36 for I2V, 16 for T2V
+            num_heads=config.num_heads,
+            num_layers=config.num_layers,
+            out_dim=16,
+            text_len=512,
+            attn_mode=attn_mode,
+            split_attn=split_attn,
+        )
+        if dit_weight_dtype is not None:
+            model.to(dit_weight_dtype)
+
+    # if fp8_scaled, load model weights to CPU to reduce VRAM usage. Otherwise, load to the specified device (CPU for block swap or CUDA for others)
+    wan_loading_device = torch.device("cpu") if fp8_scaled else loading_device
+    logger.info(f"Loading DiT model from {dit_path}, device={wan_loading_device}, dtype={dit_weight_dtype}")
+
+    # load model weights with the specified dtype or as is
+    sd = load_safetensors(dit_path, wan_loading_device, disable_mmap=True, dtype=dit_weight_dtype)
+
+    # remove "model.diffusion_model." prefix: 1.3B model has this prefix
+    for key in list(sd.keys()):
+        if key.startswith("model.diffusion_model."):
+            sd[key[22:]] = sd.pop(key)
+
+    if fp8_scaled:
+        # fp8 optimization: calculate on CUDA, move back to CPU if loading_device is CPU (block swap)
+        logger.info(f"Optimizing model weights to fp8. This may take a while.")
+        sd = model.fp8_optimization(sd, device, move_to_device=loading_device.type == "cpu")
+
+        if loading_device.type != "cpu":
+            # make sure all the model weights are on the loading_device
+            logger.info(f"Moving weights to {loading_device}")
+            for key in sd.keys():
+                sd[key] = sd[key].to(loading_device)
+
+    info = model.load_state_dict(sd, strict=True, assign=True)
+    logger.info(f"Loaded DiT model from {dit_path}, info={info}")
+
+    return model
diff --git a/wan/modules/t5.py b/wan/modules/t5.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbc89c8342ae9c799fc4674e51fa5661131e38b4
--- /dev/null
+++ b/wan/modules/t5.py
@@ -0,0 +1,514 @@
+# Modified from transformers.models.t5.modeling_t5
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+# import logging
+import math
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .tokenizers import HuggingfaceTokenizer
+from accelerate import init_empty_weights
+from safetensors.torch import load_file
+
+import logging
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+__all__ = [
+    "T5Model",
+    "T5Encoder",
+    "T5Decoder",
+    "T5EncoderModel",
+]
+
+
+def fp16_clamp(x):
+    if x.dtype == torch.float16 and torch.isinf(x).any():
+        clamp = torch.finfo(x.dtype).max - 1000
+        x = torch.clamp(x, min=-clamp, max=clamp)
+    return x
+
+
+def init_weights(m):
+    if isinstance(m, T5LayerNorm):
+        nn.init.ones_(m.weight)
+    elif isinstance(m, T5Model):
+        nn.init.normal_(m.token_embedding.weight, std=1.0)
+    elif isinstance(m, T5FeedForward):
+        nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
+        nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
+        nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
+    elif isinstance(m, T5Attention):
+        nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn) ** -0.5)
+        nn.init.normal_(m.k.weight, std=m.dim**-0.5)
+        nn.init.normal_(m.v.weight, std=m.dim**-0.5)
+        nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn) ** -0.5)
+    elif isinstance(m, T5RelativeEmbedding):
+        nn.init.normal_(m.embedding.weight, std=(2 * m.num_buckets * m.num_heads) ** -0.5)
+
+
+class GELU(nn.Module):
+
+    def forward(self, x):
+        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
+
+
+class T5LayerNorm(nn.Module):
+
+    def __init__(self, dim, eps=1e-6):
+        super(T5LayerNorm, self).__init__()
+        self.dim = dim
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def forward(self, x):
+        x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) + self.eps)
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            x = x.type_as(self.weight)
+        return self.weight * x
+
+
+class T5Attention(nn.Module):
+    """Multi-head attention without score scaling, with optional additive position bias and mask."""
+
+    def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
+        # dim_attn is split evenly across the heads
+        assert dim_attn % num_heads == 0
+        super(T5Attention, self).__init__()
+        self.dim = dim
+        self.dim_attn = dim_attn
+        self.num_heads = num_heads
+        self.head_dim = dim_attn // num_heads
+
+        # layers
+        self.q = nn.Linear(dim, dim_attn, bias=False)
+        self.k = nn.Linear(dim, dim_attn, bias=False)
+        self.v = nn.Linear(dim, dim_attn, bias=False)
+        self.o = nn.Linear(dim_attn, dim, bias=False)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x, context=None, mask=None, pos_bias=None):
+        """
+        x:          [B, L1, C].
+        context:    [B, L2, C] or None.
+        mask:       [B, L2] or [B, L1, L2] or None.
+        pos_bias:   added to the attention logits when given, or None.
+
+        Keys and values are computed from `context`, or from `x` itself when
+        `context` is None. Positions where `mask == 0` are filled with the most
+        negative finite value of x's dtype before the softmax.
+        """
+        # check inputs
+        context = x if context is None else context
+        b, n, c = x.size(0), self.num_heads, self.head_dim
+
+        # compute query, key, value
+        q = self.q(x).view(b, -1, n, c)
+        k = self.k(context).view(b, -1, n, c)
+        v = self.v(context).view(b, -1, n, c)
+
+        # attention bias
+        # starts as zeros of shape [B, N, L1, L2] and accumulates position bias and mask in place
+        attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
+        if pos_bias is not None:
+            attn_bias += pos_bias
+        if mask is not None:
+            assert mask.ndim in [2, 3]
+            # reshape so the mask broadcasts over heads (and over queries for a 2-D mask)
+            mask = mask.view(b, 1, 1, -1) if mask.ndim == 2 else mask.unsqueeze(1)
+            attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)
+
+        # compute attention (T5 does not use scaling)
+        attn = torch.einsum("binc,bjnc->bnij", q, k) + attn_bias
+        # softmax is evaluated in float32 and cast back to the logits' dtype
+        attn = F.softmax(attn.float(), dim=-1).type_as(attn)
+        x = torch.einsum("bnij,bjnc->binc", attn, v)
+
+        # output
+        x = x.reshape(b, -1, n * c)
+        x = self.o(x)
+        x = self.dropout(x)
+        return x
+
+
+class T5FeedForward(nn.Module):
+
+    def __init__(self, dim, dim_ffn, dropout=0.1):
+        super(T5FeedForward, self).__init__()
+        self.dim = dim
+        self.dim_ffn = dim_ffn
+
+        # layers
+        self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
+        self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
+        self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        x = self.fc1(x) * self.gate(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        x = self.dropout(x)
+        return x
+
+
+class T5SelfAttention(nn.Module):
+
+    def __init__(self, dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos=True, dropout=0.1):
+        super(T5SelfAttention, self).__init__()
+        self.dim = dim
+        self.dim_attn = dim_attn
+        self.dim_ffn = dim_ffn
+        self.num_heads = num_heads
+        self.num_buckets = num_buckets
+        self.shared_pos = shared_pos
+
+        # layers
+        self.norm1 = T5LayerNorm(dim)
+        self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
+        self.norm2 = T5LayerNorm(dim)
+        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
+        self.pos_embedding = None if shared_pos else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)
+
+    def forward(self, x, mask=None, pos_bias=None):
+        e = pos_bias if self.shared_pos else self.pos_embedding(x.size(1), x.size(1))
+        x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
+        x = fp16_clamp(x + self.ffn(self.norm2(x)))
+        return x
+
+
+class T5CrossAttention(nn.Module):
+
+    def __init__(self, dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos=True, dropout=0.1):
+        super(T5CrossAttention, self).__init__()
+        self.dim = dim
+        self.dim_attn = dim_attn
+        self.dim_ffn = dim_ffn
+        self.num_heads = num_heads
+        self.num_buckets = num_buckets
+        self.shared_pos = shared_pos
+
+        # layers
+        self.norm1 = T5LayerNorm(dim)
+        self.self_attn = T5Attention(dim, dim_attn, num_heads, dropout)
+        self.norm2 = T5LayerNorm(dim)
+        self.cross_attn = T5Attention(dim, dim_attn, num_heads, dropout)
+        self.norm3 = T5LayerNorm(dim)
+        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
+        self.pos_embedding = None if shared_pos else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=False)
+
+    def forward(self, x, mask=None, encoder_states=None, encoder_mask=None, pos_bias=None):
+        e = pos_bias if self.shared_pos else self.pos_embedding(x.size(1), x.size(1))
+        x = fp16_clamp(x + self.self_attn(self.norm1(x), mask=mask, pos_bias=e))
+        x = fp16_clamp(x + self.cross_attn(self.norm2(x), context=encoder_states, mask=encoder_mask))
+        x = fp16_clamp(x + self.ffn(self.norm3(x)))
+        return x
+
+
+class T5RelativeEmbedding(nn.Module):
+    """Learned per-head attention bias indexed by a bucketed relative position."""
+
+    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
+        super(T5RelativeEmbedding, self).__init__()
+        self.num_buckets = num_buckets
+        self.num_heads = num_heads
+        self.bidirectional = bidirectional
+        self.max_dist = max_dist
+
+        # layers
+        # one row per bucket, one column per head
+        self.embedding = nn.Embedding(num_buckets, num_heads)
+
+    def forward(self, lq, lk):
+        """Return the bias for `lq` query positions and `lk` key positions, shape [1, N, Lq, Lk]."""
+        device = self.embedding.weight.device
+        # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
+        #     torch.arange(lq).unsqueeze(1).to(device)
+        # rel_pos[i, j] = j - i (key index minus query index)
+        rel_pos = torch.arange(lk, device=device).unsqueeze(0) - torch.arange(lq, device=device).unsqueeze(1)
+        rel_pos = self._relative_position_bucket(rel_pos)
+        rel_pos_embeds = self.embedding(rel_pos)
+        rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(0)  # [1, N, Lq, Lk]
+        return rel_pos_embeds.contiguous()
+
+    def _relative_position_bucket(self, rel_pos):
+        """Map signed relative positions to bucket indices in [0, self.num_buckets)."""
+        # preprocess
+        if self.bidirectional:
+            # half of the buckets serve each direction; positive offsets use the upper half
+            num_buckets = self.num_buckets // 2
+            rel_buckets = (rel_pos > 0).long() * num_buckets
+            rel_pos = torch.abs(rel_pos)
+        else:
+            # positive offsets collapse to distance 0; negative offsets become their magnitude
+            num_buckets = self.num_buckets
+            rel_buckets = 0
+            rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))
+
+        # embeddings for small and large positions
+        # distances below max_exact each get their own bucket; larger distances are
+        # binned logarithmically up to max_dist and capped at the last bucket
+        max_exact = num_buckets // 2
+        rel_pos_large = (
+            max_exact
+            + (torch.log(rel_pos.float() / max_exact) / math.log(self.max_dist / max_exact) * (num_buckets - max_exact)).long()
+        )
+        rel_pos_large = torch.min(rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1))
+        # rel_pos_large is only selected where rel_pos >= max_exact
+        rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
+        return rel_buckets
+
+
+class T5Encoder(nn.Module):
+    """Stack of T5SelfAttention blocks over token embeddings, followed by a final T5LayerNorm."""
+
+    def __init__(self, vocab, dim, dim_attn, dim_ffn, num_heads, num_layers, num_buckets, shared_pos=True, dropout=0.1):
+        super(T5Encoder, self).__init__()
+        self.dim = dim
+        self.dim_attn = dim_attn
+        self.dim_ffn = dim_ffn
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.num_buckets = num_buckets
+        self.shared_pos = shared_pos
+
+        # layers
+        # `vocab` is either an existing nn.Embedding to reuse or a vocabulary size
+        self.token_embedding = vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
+        # with shared_pos the encoder owns one position embedding; otherwise each block owns its own
+        self.pos_embedding = T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True) if shared_pos else None
+        self.dropout = nn.Dropout(dropout)
+        self.blocks = nn.ModuleList(
+            [T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout) for _ in range(num_layers)]
+        )
+        self.norm = T5LayerNorm(dim)
+
+        # initialize weights
+        self.apply(init_weights)
+
+    def prepare_fp8(self, target_dtype=torch.bfloat16):
+        """Cast T5LayerNorm and Embedding submodules to `target_dtype`.
+
+        Modules are matched by class name. NOTE(review): no class named
+        "T5DenseGatedActDense" is defined in this file, and T5FeedForward has no
+        `act`/`wi_0`/`wi_1`/`wo` attributes, so the forward-replacement branch
+        looks unreachable for models built here -- confirm before relying on it.
+        """
+
+        def forward_hook(module):
+            # replacement forward for a gated dense module exposing act / wi_0 / wi_1 / dropout / wo
+            def forward(hidden_states):
+                hidden_gelu = module.act(module.wi_0(hidden_states))
+                hidden_linear = module.wi_1(hidden_states)
+                hidden_states = hidden_gelu * hidden_linear
+                hidden_states = module.dropout(hidden_states)
+
+                hidden_states = module.wo(hidden_states)
+                return hidden_states
+
+            return forward
+
+        for module in self.modules():
+            if module.__class__.__name__ in ["T5LayerNorm", "Embedding"]:
+                # print("set", module.__class__.__name__, "to", target_dtype)
+                module.to(target_dtype)
+            if module.__class__.__name__ in ["T5DenseGatedActDense"]:
+                # print("set", module.__class__.__name__, "hooks")
+                module.forward = forward_hook(module)
+
+    def forward(self, ids, mask=None):
+        """Encode token `ids`; `mask` is handed to every block unchanged."""
+        x = self.token_embedding(ids)
+        x = self.dropout(x)
+        # shared position bias is computed once per call and reused by every block
+        e = self.pos_embedding(x.size(1), x.size(1)) if self.shared_pos else None
+        for block in self.blocks:
+            x = block(x, mask, pos_bias=e)
+        x = self.norm(x)
+        x = self.dropout(x)
+        return x
+
+
+class T5Decoder(nn.Module):
+
+    def __init__(self, vocab, dim, dim_attn, dim_ffn, num_heads, num_layers, num_buckets, shared_pos=True, dropout=0.1):
+        super(T5Decoder, self).__init__()
+        self.dim = dim
+        self.dim_attn = dim_attn
+        self.dim_ffn = dim_ffn
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.num_buckets = num_buckets
+        self.shared_pos = shared_pos
+
+        # layers
+        self.token_embedding = vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
+        self.pos_embedding = T5RelativeEmbedding(num_buckets, num_heads, bidirectional=False) if shared_pos else None
+        self.dropout = nn.Dropout(dropout)
+        self.blocks = nn.ModuleList(
+            [T5CrossAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout) for _ in range(num_layers)]
+        )
+        self.norm = T5LayerNorm(dim)
+
+        # initialize weights
+        self.apply(init_weights)
+
+    def forward(self, ids, mask=None, encoder_states=None, encoder_mask=None):
+        b, s = ids.size()
+
+        # causal mask
+        if mask is None:
+            mask = torch.tril(torch.ones(1, s, s).to(ids.device))
+        elif mask.ndim == 2:
+            mask = torch.tril(mask.unsqueeze(1).expand(-1, s, -1))
+
+        # layers
+        x = self.token_embedding(ids)
+        x = self.dropout(x)
+        e = self.pos_embedding(x.size(1), x.size(1)) if self.shared_pos else None
+        for block in self.blocks:
+            x = block(x, mask, encoder_states, encoder_mask, pos_bias=e)
+        x = self.norm(x)
+        x = self.dropout(x)
+        return x
+
+
+class T5Model(nn.Module):
+
+    def __init__(
+        self,
+        vocab_size,
+        dim,
+        dim_attn,
+        dim_ffn,
+        num_heads,
+        encoder_layers,
+        decoder_layers,
+        num_buckets,
+        shared_pos=True,
+        dropout=0.1,
+    ):
+        super(T5Model, self).__init__()
+        self.vocab_size = vocab_size
+        self.dim = dim
+        self.dim_attn = dim_attn
+        self.dim_ffn = dim_ffn
+        self.num_heads = num_heads
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+        self.num_buckets = num_buckets
+
+        # layers
+        self.token_embedding = nn.Embedding(vocab_size, dim)
+        self.encoder = T5Encoder(
+            self.token_embedding, dim, dim_attn, dim_ffn, num_heads, encoder_layers, num_buckets, shared_pos, dropout
+        )
+        self.decoder = T5Decoder(
+            self.token_embedding, dim, dim_attn, dim_ffn, num_heads, decoder_layers, num_buckets, shared_pos, dropout
+        )
+        self.head = nn.Linear(dim, vocab_size, bias=False)
+
+        # initialize weights
+        self.apply(init_weights)
+
+    def forward(self, encoder_ids, encoder_mask, decoder_ids, decoder_mask):
+        x = self.encoder(encoder_ids, encoder_mask)
+        x = self.decoder(decoder_ids, decoder_mask, x, encoder_mask)
+        x = self.head(x)
+        return x
+
+
+def _t5(
+    name,
+    encoder_only=False,
+    decoder_only=False,
+    return_tokenizer=False,
+    tokenizer_kwargs={},
+    **kwargs,
+):
+    # dtype=torch.float32,
+    # device="cpu",
+    # sanity check
+    assert not (encoder_only and decoder_only)
+
+    # params
+    if encoder_only:
+        model_cls = T5Encoder
+        kwargs["vocab"] = kwargs.pop("vocab_size")
+        kwargs["num_layers"] = kwargs.pop("encoder_layers")
+        _ = kwargs.pop("decoder_layers")
+    elif decoder_only:
+        model_cls = T5Decoder
+        kwargs["vocab"] = kwargs.pop("vocab_size")
+        kwargs["num_layers"] = kwargs.pop("decoder_layers")
+        _ = kwargs.pop("encoder_layers")
+    else:
+        model_cls = T5Model
+
+    # # init model
+    # with torch.device(device):
+    model = model_cls(**kwargs)
+
+    # # set device
+    # model = model.to(dtype=dtype, device=device)
+
+    # init tokenizer
+    if return_tokenizer:
+        from .tokenizers import HuggingfaceTokenizer
+
+        tokenizer = HuggingfaceTokenizer(f"google/{name}", **tokenizer_kwargs)
+        return model, tokenizer
+    else:
+        return model
+
+
+def umt5_xxl(**kwargs):
+    cfg = dict(
+        vocab_size=256384,
+        dim=4096,
+        dim_attn=4096,
+        dim_ffn=10240,
+        num_heads=64,
+        encoder_layers=24,
+        decoder_layers=24,
+        num_buckets=32,
+        shared_pos=False,
+        dropout=0.1,
+    )
+    cfg.update(**kwargs)
+    return _t5("umt5-xxl", **cfg)
+
+
+class T5EncoderModel:
+
+    def __init__(
+        self,
+        text_len,
+        dtype=torch.bfloat16,
+        device=torch.cuda.current_device(),
+        checkpoint_path=None,
+        tokenizer_path=None,
+        shard_fn=None,
+        weight_path=None,
+        fp8=False,
+    ):
+        self.text_len = text_len
+        self.dtype = dtype if not fp8 else torch.float8_e4m3fn
+        self.device = device
+        self.checkpoint_path = checkpoint_path
+        self.tokenizer_path = tokenizer_path
+
+        # init model
+        with init_empty_weights():
+            model = umt5_xxl(encoder_only=True, return_tokenizer=False)
+
+        model = model.eval().requires_grad_(False)
+        if checkpoint_path is not None:
+            logger.info(f"loading {checkpoint_path}")
+            model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
+        else:
+            logger.info(f"loading weights from {weight_path}")
+            if os.path.splitext(weight_path)[1] == ".safetensors":
+                sd = load_file(weight_path)
+            else:
+                sd = torch.load(weight_path, map_location="cpu", weights_only=True)
+            # remove prefix "encoder." from the state dict
+            sd = {k.replace("encoder.", ""): v for k, v in sd.items()}
+            model.load_state_dict(sd, strict=True, assign=True)
+
+        logger.info(f"moving model to {device} and casting to {self.dtype}")
+        model = model.to(device, dtype=self.dtype)
+
+        if fp8:
+            logger.info("preparing model for fp8")
+            model.prepare_fp8(dtype)
+
+        self.model = model
+        # if shard_fn is not None:
+        #     self.model = shard_fn(self.model, sync_module_states=False)
+        # else:
+        #     self.model.to(self.device)
+        # init tokenizer
+        if tokenizer_path is None:
+            tokenizer_path = "Wan-AI/Wan2.1-T2V-14B"
+            subfolder = "google/umt5-xxl"
+        else:
+            subfolder = None
+        self.tokenizer = HuggingfaceTokenizer(name=tokenizer_path, seq_len=text_len, clean="whitespace", subfolder=subfolder)
+
+    def __call__(self, texts, device):
+        ids, mask = self.tokenizer(texts, return_mask=True, add_special_tokens=True)
+        ids = ids.to(device)
+        mask = mask.to(device)
+        seq_lens = mask.gt(0).sum(dim=1).long()
+        context = self.model(ids, mask)
+        return [u[:v] for u, v in zip(context, seq_lens)]
diff --git a/wan/modules/tokenizers.py b/wan/modules/tokenizers.py
new file mode 100644
index 0000000000000000000000000000000000000000..121e591c48f82f82daa51a6ce38ae9a27beea8d2
--- /dev/null
+++ b/wan/modules/tokenizers.py
@@ -0,0 +1,82 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import html
+import string
+
+import ftfy
+import regex as re
+from transformers import AutoTokenizer
+
+__all__ = ['HuggingfaceTokenizer']
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text
+
+
+def canonicalize(text, keep_punctuation_exact_string=None):
+    text = text.replace('_', ' ')
+    if keep_punctuation_exact_string:
+        text = keep_punctuation_exact_string.join(
+            part.translate(str.maketrans('', '', string.punctuation))
+            for part in text.split(keep_punctuation_exact_string))
+    else:
+        text = text.translate(str.maketrans('', '', string.punctuation))
+    text = text.lower()
+    text = re.sub(r'\s+', ' ', text)
+    return text.strip()
+
+
+class HuggingfaceTokenizer:
+
+    def __init__(self, name, seq_len=None, clean=None, **kwargs):
+        assert clean in (None, 'whitespace', 'lower', 'canonicalize')
+        self.name = name
+        self.seq_len = seq_len
+        self.clean = clean
+
+        # init tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
+        self.vocab_size = self.tokenizer.vocab_size
+
+    def __call__(self, sequence, **kwargs):
+        return_mask = kwargs.pop('return_mask', False)
+
+        # arguments
+        _kwargs = {'return_tensors': 'pt'}
+        if self.seq_len is not None:
+            _kwargs.update({
+                'padding': 'max_length',
+                'truncation': True,
+                'max_length': self.seq_len
+            })
+        _kwargs.update(**kwargs)
+
+        # tokenization
+        if isinstance(sequence, str):
+            sequence = [sequence]
+        if self.clean:
+            sequence = [self._clean(u) for u in sequence]
+        ids = self.tokenizer(sequence, **_kwargs)
+
+        # output
+        if return_mask:
+            return ids.input_ids, ids.attention_mask
+        else:
+            return ids.input_ids
+
+    def _clean(self, text):
+        if self.clean == 'whitespace':
+            text = whitespace_clean(basic_clean(text))
+        elif self.clean == 'lower':
+            text = whitespace_clean(basic_clean(text)).lower()
+        elif self.clean == 'canonicalize':
+            text = canonicalize(basic_clean(text))
+        return text
diff --git a/wan/modules/vae.py b/wan/modules/vae.py
new file mode 100644
index 0000000000000000000000000000000000000000..4580e716ce492fa4fbe713cda1c8d9759f3381ef
--- /dev/null
+++ b/wan/modules/vae.py
@@ -0,0 +1,752 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import logging
+import os
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+from safetensors.torch import load_file
+
+__all__ = [
+    "WanVAE",
+]
+
+CACHE_T = 2  # number of trailing frames sliced off (x[:, :, -CACHE_T:]) whenever a conv input is cached
+
+
+class CausalConv3d(nn.Conv3d):
+    """
+    3D convolution whose temporal padding is applied only in front of the first frame.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        sym = self.padding
+        # F.pad order is (w_left, w_right, h_top, h_bottom, t_front, t_back); all time padding goes in front.
+        self._padding = (sym[2], sym[2], sym[1], sym[1], 2 * sym[0], 0)
+        self.padding = (0, 0, 0)
+
+    def forward(self, x, cache_x=None):
+        pad = list(self._padding)
+        if cache_x is not None and pad[4] > 0:
+            # Cached frames are prepended and stand in for part of the front padding.
+            cached = cache_x.to(x.device)
+            x, pad[4] = torch.cat([cached, x], dim=2), pad[4] - cached.shape[2]
+        return super().forward(F.pad(x, pad))
+
+
+class RMS_norm(nn.Module):
+    """L2-normalise along the channel axis, then scale by sqrt(dim) and a learned gain (plus optional bias)."""
+    def __init__(self, dim, channel_first=True, images=True, bias=False):
+        super().__init__()
+        trailing = (1, 1) if images else (1, 1, 1)  # singleton axes so the gain broadcasts over H, W (and T)
+        shape = (dim,) + trailing if channel_first else (dim,)
+        self.channel_first = channel_first
+        self.scale = dim**0.5
+        self.gamma = nn.Parameter(torch.ones(shape))
+        self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
+
+    def forward(self, x):
+        axis = 1 if self.channel_first else -1
+        return F.normalize(x, dim=axis) * self.scale * self.gamma + self.bias
+
+
+class Upsample(nn.Upsample):
+    """nn.Upsample that interpolates in float32 and returns the input's dtype."""
+
+    def forward(self, x):
+        # Fix bfloat16 support for nearest neighbor interpolation by computing in float32.
+        upsampled = super().forward(x.float())
+        return upsampled.type_as(x)
+
+
+class Resample(nn.Module):
+    """Per-frame spatial resampling (2x up/down, identity for "none"); 3d modes add cached temporal resampling."""
+    def __init__(self, dim, mode):
+        assert mode in ("none", "upsample2d", "upsample3d", "downsample2d", "downsample3d")
+        super().__init__()
+        self.dim = dim
+        self.mode = mode
+
+        # layers: the upsampling convs halve the channel count, the downsampling convs keep it
+        if mode == "upsample2d":
+            self.resample = nn.Sequential(
+                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, dim // 2, 3, padding=1)
+            )
+        elif mode == "upsample3d":
+            self.resample = nn.Sequential(
+                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, dim // 2, 3, padding=1)
+            )
+            self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))  # 2x channels -> 2 frames
+
+        elif mode == "downsample2d":
+            self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+        elif mode == "downsample3d":
+            self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+            self.time_conv = CausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
+
+        else:
+            self.resample = nn.Identity()
+
+        self.cache_device = None  # None means "keep cached frames on the input's device"
+
+    def set_cache_device(self, device):
+        self.cache_device = device
+
+    def forward(self, x, feat_cache=None, feat_idx=None):  # x: (b, c, t, h, w); feat_idx: one-element slot counter
+        cache_device = self.cache_device if self.cache_device is not None else x.device
+        feat_idx = [0] if feat_idx is None else feat_idx  # fresh counter per call instead of a shared default list
+        b, c, t, h, w = x.size()
+        if self.mode == "upsample3d":
+            if feat_cache is not None:  # temporal upsampling only happens on the cached (chunked) path
+                idx = feat_idx[0]
+                if feat_cache[idx] is None:
+                    feat_cache[idx] = "Rep"  # first chunk: mark the slot and skip time_conv entirely
+                    feat_idx[0] += 1
+                else:
+                    # later chunks: run time_conv, then interleave its two channel halves along time
+                    cache_x = x[:, :, -CACHE_T:, :, :].clone().to(cache_device)
+                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
+                        # keep two frames cached: the last previously cached frame plus the current one
+                        cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
+                        cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
+                    if feat_cache[idx] == "Rep":
+                        x = self.time_conv(x)  # no real history yet: rely on the conv's own front padding
+                    else:
+                        x = self.time_conv(x, feat_cache[idx].to(x.device) if feat_cache[idx] is not None else None)
+                    feat_cache[idx] = cache_x
+                    feat_idx[0] += 1
+
+                    x = x.reshape(b, 2, c, t, h, w)
+                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
+                    x = x.reshape(b, c, t * 2, h, w)
+        t = x.shape[2]
+        x = rearrange(x, "b c t h w -> (b t) c h w")  # fold time into batch for the 2D resampler
+        x = self.resample(x)
+        x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
+
+        if self.mode == "downsample3d":
+            if feat_cache is not None:  # temporal downsampling only happens on the cached (chunked) path
+                idx = feat_idx[0]
+                if feat_cache[idx] is None:
+                    feat_cache[idx] = x.clone().to(cache_device)  # first chunk: store it, no time_conv
+                    feat_idx[0] += 1
+                else:
+
+                    cache_x = x[:, :, -1:, :, :].clone().to(cache_device)
+                    # Only the newest frame is cached here. The strided time_conv below has no
+                    # padding of its own, so its temporal context is supplied explicitly by
+                    # concatenating the last cached frame in front of the current chunk.
+
+                    x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :].to(x.device), x], 2))
+                    feat_cache[idx] = cache_x
+                    feat_idx[0] += 1
+        return x
+
+    def init_weight(self, conv):  # zero weight/bias, then an identity map on temporal tap 1, spatial (0, 0)
+        conv_weight = conv.weight
+        nn.init.zeros_(conv_weight)
+        c1, c2, t, h, w = conv_weight.size()
+        one_matrix = torch.eye(c1, c2)
+        init_matrix = one_matrix
+        nn.init.zeros_(conv_weight)
+        # an earlier variant wrote init_matrix * 0.5 to the last temporal tap instead
+        conv_weight.data[:, :, 1, 0, 0] = init_matrix  # * 0.5
+        conv.weight.data.copy_(conv_weight)
+        nn.init.zeros_(conv.bias.data)
+
+    def init_weight2(self, conv):  # zero weight/bias, then the same identity in both output-channel halves
+        conv_weight = conv.weight.data
+        nn.init.zeros_(conv_weight)
+        c1, c2, t, h, w = conv_weight.size()
+        init_matrix = torch.eye(c1 // 2, c2)
+        # both halves get the same matrix, written to the last temporal tap
+        conv_weight[: c1 // 2, :, -1, 0, 0] = init_matrix
+        conv_weight[c1 // 2 :, :, -1, 0, 0] = init_matrix
+        conv.weight.data.copy_(conv_weight)
+        nn.init.zeros_(conv.bias.data)
+
+
+class ResidualBlock(nn.Module):
+    """Two norm-SiLU-causal-conv stages plus a shortcut (1x1x1 conv when in_dim != out_dim, else identity)."""
+    def __init__(self, in_dim, out_dim, dropout=0.0):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+
+        # layers
+        self.residual = nn.Sequential(
+            RMS_norm(in_dim, images=False),
+            nn.SiLU(),
+            CausalConv3d(in_dim, out_dim, 3, padding=1),
+            RMS_norm(out_dim, images=False),
+            nn.SiLU(),
+            nn.Dropout(dropout),
+            CausalConv3d(out_dim, out_dim, 3, padding=1),
+        )
+        self.shortcut = CausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
+
+        self.cache_device = None  # None means "keep cached frames on the input's device"
+
+    def set_cache_device(self, device):
+        self.cache_device = device
+
+    def forward(self, x, feat_cache=None, feat_idx=None):  # feat_idx: one-element list used as a slot counter
+        cache_device = self.cache_device if self.cache_device is not None else x.device
+        feat_idx = [0] if feat_idx is None else feat_idx  # fresh counter per call instead of a shared default list
+        h = self.shortcut(x)
+        for layer in self.residual:
+            if isinstance(layer, CausalConv3d) and feat_cache is not None:
+                idx = feat_idx[0]  # each causal conv owns one cache slot, claimed in call order
+                cache_x = x[:, :, -CACHE_T:, :, :].clone().to(cache_device)
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    # keep two frames cached: the last previously cached frame plus the current one
+                    cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+                x = layer(x, feat_cache[idx].to(x.device) if feat_cache[idx] is not None else None)
+                feat_cache[idx] = cache_x  # the conv's input frames become the history for the next chunk
+                feat_idx[0] += 1
+            else:
+                x = layer(x)
+        return x + h
+
+
+class AttentionBlock(nn.Module):
+    """
+    Single-head self-attention over the spatial positions of each frame, with a residual connection.
+    """
+
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+        # Pre-norm, fused 1x1 q/k/v projection, and 1x1 output projection.
+        self.norm = RMS_norm(dim)
+        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
+        self.proj = nn.Conv2d(dim, dim, 1)
+
+        # Zero the output projection weight (its bias keeps the default initialisation).
+        nn.init.zeros_(self.proj.weight)
+
+    def forward(self, x):
+        b, c, t, h, w = x.size()
+        residual = x
+
+        # Fold time into the batch axis so every frame is attended independently.
+        frames = self.norm(rearrange(x, "b c t h w -> (b t) c h w"))
+
+        # Project once, lay tokens out as (b*t, 1 head, h*w, 3c), then split into q, k, v.
+        qkv = self.to_qkv(frames).reshape(b * t, 1, c * 3, -1)
+        qkv = qkv.permute(0, 1, 3, 2).contiguous()
+        q, k, v = qkv.chunk(3, dim=-1)
+
+        # No mask is passed, so attention is unrestricted within a frame.
+        attended = F.scaled_dot_product_attention(q, k, v)
+        attended = attended.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
+
+        # Project, restore the time axis, and add the skip connection.
+        out = self.proj(attended)
+        out = rearrange(out, "(b t) c h w-> b c t h w", t=t)
+        return out + residual
+
+
+class Encoder3d(nn.Module):
+    """Causal 3D encoder: stem conv, residual (+attention) stages with downsampling, middle block, output head."""
+    def __init__(
+        self,
+        dim=128,
+        z_dim=4,
+        dim_mult=[1, 2, 4, 4],
+        num_res_blocks=2,
+        attn_scales=[],
+        temperal_downsample=[True, True, False],
+        dropout=0.0,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_downsample = temperal_downsample
+
+        # dimensions: channel width entering/leaving each stage; `scale` tracks the spatial scale for attn_scales
+        dims = [dim * u for u in [1] + dim_mult]
+        scale = 1.0
+
+        # init block
+        self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
+
+        # downsample blocks
+        downsamples = []
+        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+            # residual (+attention) blocks
+            for _ in range(num_res_blocks):
+                downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
+                if scale in attn_scales:
+                    downsamples.append(AttentionBlock(out_dim))
+                in_dim = out_dim
+
+            # downsample block (every stage except the last); temperal_downsample[i] picks 3d vs 2d
+            if i != len(dim_mult) - 1:
+                mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
+                downsamples.append(Resample(out_dim, mode=mode))
+                scale /= 2.0
+        self.downsamples = nn.Sequential(*downsamples)
+
+        # middle blocks
+        self.middle = nn.Sequential(
+            ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim), ResidualBlock(out_dim, out_dim, dropout)
+        )
+
+        # output blocks
+        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(), CausalConv3d(out_dim, z_dim, 3, padding=1))
+
+        self.cache_device = None  # None means "keep cached frames on the input's device"
+
+    def set_cache_device(self, device):
+        self.cache_device = device
+
+        # set cache device for all layers (plain list unpacking; no temporary nn.Sequential is built)
+        for layer in [*self.downsamples, *self.middle, *self.head]:
+            if isinstance(layer, (Resample, ResidualBlock)):
+                layer.set_cache_device(device)
+
+    def forward(self, x, feat_cache=None, feat_idx=None):  # feat_idx: one-element list used as a slot counter
+        cache_device = self.cache_device if self.cache_device is not None else x.device
+        feat_idx = [0] if feat_idx is None else feat_idx  # fresh counter per call instead of a shared default list
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone().to(cache_device)
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # keep two frames cached: the last previously cached frame plus the current one
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+            x = self.conv1(x, feat_cache[idx].to(x.device) if feat_cache[idx] is not None else None)
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv1(x)
+
+        ## downsamples
+        for layer in self.downsamples:
+            if feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## middle (only the residual blocks take the cache; the attention block is called with x alone)
+        for layer in self.middle:
+            if isinstance(layer, ResidualBlock) and feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## head
+        for layer in self.head:
+            if isinstance(layer, CausalConv3d) and feat_cache is not None:
+                idx = feat_idx[0]
+                cache_x = x[:, :, -CACHE_T:, :, :].clone().to(cache_device)
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    # keep two frames cached: the last previously cached frame plus the current one
+                    cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+                x = layer(x, feat_cache[idx].to(x.device) if feat_cache[idx] is not None else None)
+                feat_cache[idx] = cache_x
+                feat_idx[0] += 1
+            else:
+                x = layer(x)
+        return x
+
+
+class Decoder3d(nn.Module):
+    """Causal 3D decoder: stem conv, middle block, residual (+attention) stages with upsampling, RGB head."""
+    def __init__(
+        self,
+        dim=128,
+        z_dim=4,
+        dim_mult=[1, 2, 4, 4],
+        num_res_blocks=2,
+        attn_scales=[],
+        temperal_upsample=[False, True, True],
+        dropout=0.0,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_upsample = temperal_upsample
+
+        # dimensions: stage widths in reverse encoder order; `scale` tracks the spatial scale for attn_scales
+        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+        scale = 1.0 / 2 ** (len(dim_mult) - 2)
+
+        # init block
+        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
+
+        # middle blocks
+        self.middle = nn.Sequential(
+            ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]), ResidualBlock(dims[0], dims[0], dropout)
+        )
+
+        # upsample blocks
+        upsamples = []
+        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+            # every stage after the first follows an upsampling Resample, whose conv halves the channels
+            if i > 0:
+                in_dim = in_dim // 2
+            for _ in range(num_res_blocks + 1):
+                upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
+                if scale in attn_scales:
+                    upsamples.append(AttentionBlock(out_dim))
+                in_dim = out_dim
+
+            # upsample block (every stage except the last); temperal_upsample[i] picks 3d vs 2d
+            if i != len(dim_mult) - 1:
+                mode = "upsample3d" if temperal_upsample[i] else "upsample2d"
+                upsamples.append(Resample(out_dim, mode=mode))
+                scale *= 2.0
+        self.upsamples = nn.Sequential(*upsamples)
+
+        # output blocks
+        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(), CausalConv3d(out_dim, 3, 3, padding=1))
+
+        self.cache_device = None  # None means "keep cached frames on the input's device"
+
+    def set_cache_device(self, device):
+        self.cache_device = device
+
+        # set cache device for all layers (plain list unpacking; no temporary nn.Sequential is built)
+        for layer in [*self.middle, *self.upsamples, *self.head]:
+            if isinstance(layer, (Resample, ResidualBlock)):
+                layer.set_cache_device(device)
+
+    def forward(self, x, feat_cache=None, feat_idx=None):  # feat_idx: one-element list used as a slot counter
+        cache_device = self.cache_device if self.cache_device is not None else x.device
+        feat_idx = [0] if feat_idx is None else feat_idx  # fresh counter per call instead of a shared default list
+        ## conv1
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone().to(cache_device)
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # keep two frames cached: the last previously cached frame plus the current one
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+            x = self.conv1(x, feat_cache[idx].to(x.device) if feat_cache[idx] is not None else None)
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv1(x)
+
+        ## middle (only the residual blocks take the cache; the attention block is called with x alone)
+        for layer in self.middle:
+            if isinstance(layer, ResidualBlock) and feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## upsamples
+        for layer in self.upsamples:
+            if feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## head
+        for layer in self.head:
+            if isinstance(layer, CausalConv3d) and feat_cache is not None:
+                idx = feat_idx[0]
+                cache_x = x[:, :, -CACHE_T:, :, :].clone().to(cache_device)
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    # keep two frames cached: the last previously cached frame plus the current one
+                    cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+                x = layer(x, feat_cache[idx].to(x.device) if feat_cache[idx] is not None else None)
+                feat_cache[idx] = cache_x
+                feat_idx[0] += 1
+            else:
+                x = layer(x)
+        return x
+
+
+def count_conv3d(model):
+    """Return how many of the modules yielded by `model.modules()` are CausalConv3d instances."""
+    return sum(
+        1 for module in model.modules()
+        if isinstance(module, CausalConv3d)
+    )
+
+
+class WanVAE_(nn.Module):
+    # Video VAE core: Encoder3d -> 1x1x1 conv1 -> (mu, log_var); 1x1x1 conv2 -> Decoder3d. Processes time in chunks.
+    def __init__(
+        self,
+        dim=128,
+        z_dim=4,
+        dim_mult=[1, 2, 4, 4],
+        num_res_blocks=2,
+        attn_scales=[],
+        temperal_downsample=[True, True, False],
+        dropout=0.0,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_downsample = temperal_downsample
+        self.temperal_upsample = temperal_downsample[::-1]  # decoder mirrors the encoder's temporal schedule
+
+        # modules (the encoder emits 2 * z_dim channels, later split into mu and log_var)
+        self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks, attn_scales, self.temperal_downsample, dropout)
+        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
+        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
+        self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout)
+
+        self.cache_device = None
+
+    def set_cache_device(self, device):
+        # set cache device
+        self.cache_device = device
+        self.encoder.set_cache_device(device)
+        self.decoder.set_cache_device(device)
+
+    def forward(self, x):
+        # NOTE(review): encode()/decode() below both require a `scale` argument and encode() returns a
+        # single tensor, so these calls raise TypeError as written — confirm whether forward() is used.
+        mu, log_var = self.encode(x)
+        z = self.reparameterize(mu, log_var)
+        x_recon = self.decode(z)
+        return x_recon, mu, log_var
+
+    def encode(self, x, scale):  # scale: (shift, multiplier) pair, tensors or plain numbers; returns scaled mu
+        self.clear_cache()
+        ## cache
+        t = x.shape[2]
+        iter_ = 1 + (t - 1) // 4  # one pass for the first frame, then one per group of four frames
+        # ## Split the encode input x along time into chunks of 1, 4, 4, 4, ...
+
+        # if self.cache_device is None:
+        for i in range(iter_):
+            self._enc_conv_idx = [0]  # restart the slot counter so every chunk revisits the same cache slots
+            if i == 0:
+                out = self.encoder(x[:, :, :1, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
+            else:
+                out_ = self.encoder(
+                    x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx
+                )
+                out = torch.cat([out, out_], 2)
+        # else:
+        #     # VRAM optimization
+        #     device = x.device
+        #     clean_memory_on_device(device)
+        #     outs = []
+        #     for i in range(iter_):
+        #         self._enc_conv_idx = [0]
+        #         if i == 0:
+        #             out = self.encoder(x[:, :, :1, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
+        #         else:
+        #             out = self.encoder(
+        #                 x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx
+        #             )
+        #         outs.append(out.to(self.cache_device))
+        #     out = torch.cat(outs, 2).to(device)
+        mu, log_var = self.conv1(out).chunk(2, dim=1)  # log_var is computed but not returned
+        if isinstance(scale[0], torch.Tensor):
+            mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(1, self.z_dim, 1, 1, 1)
+        else:
+            mu = (mu - scale[0]) * scale[1]
+        self.clear_cache()
+        return mu
+
+    def decode(self, z, scale):  # inverse of encode's scaling, then frame-by-frame decoding
+        self.clear_cache()
+        # z: [b,c,t,h,w]
+        if isinstance(scale[0], torch.Tensor):
+            z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(1, self.z_dim, 1, 1, 1)
+        else:
+            z = z / scale[1] + scale[0]
+        iter_ = z.shape[2]  # one decoder pass per latent frame
+        x = self.conv2(z)
+
+        # if self.cache_device is None:
+        for i in range(iter_):
+            self._conv_idx = [0]  # restart the slot counter so every frame revisits the same cache slots
+            if i == 0:
+                out = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
+            else:
+                out_ = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
+                out = torch.cat([out, out_], 2)
+        # else:
+        #     # VRAM optimization
+        #     device = z.device
+        #     x = x.to("cpu")
+        #     clean_memory_on_device(device)
+        #     outs = []
+        #     for i in range(iter_):
+        #         self._conv_idx = [0]
+        #         out = self.decoder(x[:, :, i : i + 1, :, :].to(device), feat_cache=self._feat_map, feat_idx=self._conv_idx).to(
+        #             self.cache_device
+        #         )
+        #         outs.append(out)
+        #     out = torch.cat(outs, 2)  # on cache_device
+        self.clear_cache()
+        return out
+
+    def reparameterize(self, mu, log_var):  # z = mu + eps * exp(0.5 * log_var), eps ~ N(0, 1)
+        std = torch.exp(0.5 * log_var)
+        eps = torch.randn_like(std)
+        return eps * std + mu
+
+    def sample(self, imgs, deterministic=False):
+        mu, log_var = self.encode(imgs)  # NOTE(review): same mismatch as forward() — encode() needs `scale`, returns one tensor
+        if deterministic:
+            return mu
+        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
+        return mu + std * torch.randn_like(std)
+
+    def clear_cache(self):  # one empty cache slot per CausalConv3d, separately for decoder and encoder
+        self._conv_num = count_conv3d(self.decoder)
+        self._conv_idx = [0]
+        self._feat_map = [None] * self._conv_num
+        # cache encode
+        self._enc_conv_num = count_conv3d(self.encoder)
+        self._enc_conv_idx = [0]
+        self._enc_feat_map = [None] * self._enc_conv_num
+
+
+def _video_vae(pretrained_path=None, z_dim=None, device="cpu", **kwargs):  # builds WanVAE_ and loads its weights
+    """
+    Autoencoder3d adapted from Stable Diffusion 1.x, 2.x and XL.
+    """
+    # params (defaults below; any keyword argument overrides the matching entry)
+    cfg = dict(
+        dim=96,
+        z_dim=z_dim,
+        dim_mult=[1, 2, 4, 4],
+        num_res_blocks=2,
+        attn_scales=[],
+        temperal_downsample=[False, True, True],
+        dropout=0.0,
+    )
+    cfg.update(**kwargs)
+
+    # init model on the meta device; the checkpoint tensors are attached below via assign=True
+    with torch.device("meta"):
+        model = WanVAE_(**cfg)
+
+    # load checkpoint  NOTE(review): pretrained_path=None reaches os.path.splitext(None), which raises TypeError
+    logging.info(f"loading {pretrained_path}")
+    if os.path.splitext(pretrained_path)[-1] == ".safetensors":
+        sd = load_file(pretrained_path)  # NOTE(review): `device` is not passed on this branch
+        model.load_state_dict(sd, strict=False, assign=True)  # strict=False: key mismatches are not raised here
+    else:
+        model.load_state_dict(torch.load(pretrained_path, map_location=device, weights_only=True), assign=True)
+
+    return model
+
+
+class WanVAE:
+    """Inference wrapper around WanVAE_: holds per-channel latent statistics and encodes/decodes lists of videos."""
+    def __init__(self, z_dim=16, vae_path="cache/vae_step_411000.pth", dtype=torch.float, device="cuda", cache_device=None):
+        self.dtype = dtype
+        self.device = device
+
+        mean = [  # per-latent-channel shift (16 entries, matching the default z_dim)
+            -0.7571,
+            -0.7089,
+            -0.9113,
+            0.1075,
+            -0.1745,
+            0.9653,
+            -0.1517,
+            1.5508,
+            0.4134,
+            -0.0715,
+            0.5517,
+            -0.3632,
+            -0.1922,
+            -0.9497,
+            0.2503,
+            -0.2921,
+        ]
+        std = [  # per-latent-channel spread (16 entries); its reciprocal is the multiplier in `scale`
+            2.8184,
+            1.4541,
+            2.3275,
+            2.6558,
+            1.2196,
+            1.7708,
+            2.6052,
+            2.0743,
+            3.2687,
+            2.1526,
+            2.8652,
+            1.5579,
+            1.6382,
+            1.1253,
+            2.8251,
+            1.9160,
+        ]
+        self.mean = torch.tensor(mean, dtype=dtype, device=device)
+        self.std = torch.tensor(std, dtype=dtype, device=device)
+        self.scale = [self.mean, 1.0 / self.std]  # (shift, multiplier) pair consumed by WanVAE_.encode/decode
+
+        # init model: load weights, switch to eval mode, freeze parameters, move to `device`
+        self.model = (
+            _video_vae(
+                pretrained_path=vae_path,
+                z_dim=z_dim,
+            )
+            .eval()
+            .requires_grad_(False)
+            .to(device)
+        )
+        if cache_device is not None:
+            self.model.set_cache_device(torch.device(cache_device))
+
+    def to_device(self, device):  # move the model and the latent statistics together
+        self.device = device
+        self.model.to(device)
+        self.mean = self.mean.to(device)
+        self.std = self.std.to(device)
+        self.scale = [t.to(device) for t in self.scale]
+
+    def to_dtype(self, dtype):  # cast the model and the latent statistics together
+        self.dtype = dtype
+        self.model.to(dtype=dtype)
+        self.mean = self.mean.to(dtype)
+        self.std = self.std.to(dtype)
+        self.scale = [t.to(dtype) for t in self.scale]
+
+    def eval(self):
+        self.model.eval()
+
+    def train(self, mode: bool = True):
+        self.model.train(mode)
+
+    def requires_grad_(self, requires_grad: bool = True):
+        self.model.requires_grad_(requires_grad)
+
+    def to(self, device_or_dtype: Union[torch.device, torch.dtype, str], dtype: Optional[torch.dtype] = None):
+        """Mirror nn.Module.to(): move and/or cast the model and statistics, then return self."""
+        if isinstance(device_or_dtype, (str, torch.device)):
+            self.to_device(device_or_dtype)
+        else:
+            self.to_dtype(device_or_dtype)
+
+        if dtype is not None:
+            self.to_dtype(dtype)
+        # Returning self (as nn.Module.to does) lets callers chain: vae = WanVAE(...).to(device).
+        return self
+
+    def encode(self, videos):
+        """
+        videos: A list of videos each with shape [C, T, H, W].
+        """
+        # with amp.autocast(dtype=self.dtype):
+        return [self.model.encode(u.unsqueeze(0), self.scale).float().squeeze(0) for u in videos]
+
+    def decode(self, zs):  # zs: list of latents; each output is cast to float32 and clamped to [-1, 1]
+        # with amp.autocast(dtype=self.dtype):
+        return [self.model.decode(u.unsqueeze(0), self.scale).float().clamp_(-1, 1).squeeze(0) for u in zs]
diff --git a/wan/modules/xlm_roberta.py b/wan/modules/xlm_roberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bd38c1016fdaec90b77a6222d75d01c38c1291c
--- /dev/null
+++ b/wan/modules/xlm_roberta.py
@@ -0,0 +1,170 @@
+# Modified from transformers.models.xlm_roberta.modeling_xlm_roberta
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['XLMRoberta', 'xlm_roberta_large']
+
+
+class SelfAttention(nn.Module):
+    """Multi-head self-attention with separate q/k/v/output projections."""
+    def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
+        assert dim % num_heads == 0
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.eps = eps
+
+        # Projections are created in q, k, v, o order; the dropout rate is reused inside attention.
+        self.q = nn.Linear(dim, dim)
+        self.k = nn.Linear(dim, dim)
+        self.v = nn.Linear(dim, dim)
+        self.o = nn.Linear(dim, dim)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x, mask):
+        """
+        x: [B, L, C]; mask is handed to scaled_dot_product_attention as its attention mask.
+        """
+        batch, seq_len, channels = x.size()
+        heads, head_dim = self.num_heads, self.head_dim
+
+        def split_heads(proj):
+            # [B, L, C] -> [B, heads, L, head_dim]
+            return proj(x).reshape(batch, seq_len, heads, head_dim).permute(0, 2, 1, 3)
+
+        q, k, v = split_heads(self.q), split_heads(self.k), split_heads(self.v)
+
+        # Attention dropout is active only in training mode.
+        drop_p = self.dropout.p if self.training else 0.0
+        attended = F.scaled_dot_product_attention(q, k, v, mask, drop_p)
+        merged = attended.permute(0, 2, 1, 3).reshape(batch, seq_len, channels)
+
+        # Output projection followed by dropout.
+        return self.dropout(self.o(merged))
+
+
+class AttentionBlock(nn.Module):
+    """Transformer block (self-attention + 4x GELU feed-forward) in post-norm or pre-norm form."""
+    def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
+        super().__init__()
+        self.dim, self.num_heads = dim, num_heads
+        self.post_norm, self.eps = post_norm, eps
+
+        # Submodules are built in the order attn, norm1, ffn, norm2.
+        hidden = dim * 4
+        self.attn = SelfAttention(dim, num_heads, dropout, eps)
+        self.norm1 = nn.LayerNorm(dim, eps=eps)
+        self.ffn = nn.Sequential(
+            nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim),
+            nn.Dropout(dropout))
+        self.norm2 = nn.LayerNorm(dim, eps=eps)
+
+    def forward(self, x, mask):
+        if not self.post_norm:
+            # Pre-norm: normalise each sublayer's input and leave the residual stream untouched.
+            x = x + self.attn(self.norm1(x), mask)
+            return x + self.ffn(self.norm2(x))
+
+        # Post-norm: normalise the sum of the residual and each sublayer's output.
+        x = self.norm1(x + self.attn(x, mask))
+        return self.norm2(x + self.ffn(x))
+
+
+class XLMRoberta(nn.Module):
+    """
+    XLM-RoBERTa encoder: embeddings plus a stack of attention blocks, with no pooler and no LM head.
+    """
+
+    def __init__(self,
+                 vocab_size=250002,
+                 max_seq_len=514,
+                 type_size=1,
+                 pad_id=1,
+                 dim=1024,
+                 num_heads=16,
+                 num_layers=24,
+                 post_norm=True,
+                 dropout=0.1,
+                 eps=1e-5):
+        super().__init__()
+
+        # Record the configuration on the module.
+        self.vocab_size, self.max_seq_len = vocab_size, max_seq_len
+        self.type_size, self.pad_id = type_size, pad_id
+        self.dim, self.num_heads, self.num_layers = dim, num_heads, num_layers
+        self.post_norm, self.eps = post_norm, eps
+
+        # Token, segment-type and position tables; pad_id is the padding_idx
+        # of both the token table and the position table.
+        self.token_embedding = nn.Embedding(vocab_size, dim, padding_idx=pad_id)
+        self.type_embedding = nn.Embedding(type_size, dim)
+        self.pos_embedding = nn.Embedding(max_seq_len, dim, padding_idx=pad_id)
+        self.dropout = nn.Dropout(dropout)
+
+        # Encoder stack of identically configured blocks.
+        layers = [
+            AttentionBlock(dim, num_heads, post_norm, dropout, eps)
+            for _ in range(num_layers)
+        ]
+        self.blocks = nn.ModuleList(layers)
+
+        # One model-level LayerNorm: applied right after the embeddings when
+        # post_norm is set, otherwise after the last block (see forward).
+        self.norm = nn.LayerNorm(dim, eps=eps)
+
+    def forward(self, ids):
+        """
+        ids: [B, L] of torch.LongTensor; returns the hidden states after the last block.
+        """
+        b, s = ids.shape
+        keep = ids.ne(self.pad_id).long()
+
+        # Positions count non-pad tokens starting at pad_id + 1; pad tokens map to pad_id itself.
+        positions = self.pad_id + torch.cumsum(keep, dim=1) * keep
+        x = self.token_embedding(ids) + self.type_embedding(torch.zeros_like(ids))
+        x = x + self.pos_embedding(positions)
+        if self.post_norm:
+            x = self.norm(x)
+        x = self.dropout(x)
+
+        # Additive attention bias: 0 for real tokens, the dtype's minimum for padding keys.
+        bias = torch.where(
+            keep.view(b, 1, 1, s).gt(0), 0.0,
+            torch.finfo(x.dtype).min)
+        for block in self.blocks:
+            x = block(x, bias)
+
+        # Pre-norm models apply the model-level norm once, at the very end.
+        if not self.post_norm:
+            x = self.norm(x)
+        return x
+
+
+def xlm_roberta_large(pretrained=False,
+                      return_tokenizer=False,
+                      device='cpu',
+                      **kwargs):
+    """
+    Build an XLMRoberta with large-model defaults on `device`; `pretrained`/`return_tokenizer` are not read here.
+    """
+    # Large-model defaults; keyword arguments override individual entries.
+    cfg = {
+        'vocab_size': 250002,
+        'max_seq_len': 514,
+        'type_size': 1,
+        'pad_id': 1,
+        'dim': 1024,
+        'num_heads': 16,
+        'num_layers': 24,
+        'post_norm': True,
+        'dropout': 0.1,
+        'eps': 1e-5,
+    }
+    cfg.update(**kwargs)
+
+    # Constructing inside the device context creates the parameters on `device`.
+    with torch.device(device):
+        return XLMRoberta(**cfg)
diff --git a/wan/utils/__init__.py b/wan/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e9a339e69fd55dd226d3ce242613c19bd690522
--- /dev/null
+++ b/wan/utils/__init__.py
@@ -0,0 +1,8 @@
+from .fm_solvers import (FlowDPMSolverMultistepScheduler, get_sampling_sigmas,
+                         retrieve_timesteps)
+from .fm_solvers_unipc import FlowUniPCMultistepScheduler
+
+# Only names imported above may appear here: a star-import raises
+# AttributeError for any `__all__` entry the package does not define.
+__all__ = ['get_sampling_sigmas', 'retrieve_timesteps',
+           'FlowDPMSolverMultistepScheduler', 'FlowUniPCMultistepScheduler']
diff --git a/wan/utils/fm_solvers.py b/wan/utils/fm_solvers.py
new file mode 100644
index 0000000000000000000000000000000000000000..c908969e24849ce1381a8df9d5eb401dccf66524
--- /dev/null
+++ b/wan/utils/fm_solvers.py
@@ -0,0 +1,857 @@
+# Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+# Convert dpm solver for flow matching
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+
+import inspect
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import (KarrasDiffusionSchedulers,
+                                                   SchedulerMixin,
+                                                   SchedulerOutput)
+from diffusers.utils import deprecate, is_scipy_available
+from diffusers.utils.torch_utils import randn_tensor
+
+if is_scipy_available():
+    pass  # no-op: the branch body is empty, so nothing from scipy is imported here
+
+
+def get_sampling_sigmas(sampling_steps, shift):
+    """Return `sampling_steps` sigmas: a uniform 1 -> 0 grid (0 dropped), warped by `shift`."""
+    grid = np.linspace(1, 0, sampling_steps + 1)[:sampling_steps]
+    warped = shift * grid / (1 + (shift - 1) * grid)
+    return warped
+
+
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps=None,
+    device=None,
+    timesteps=None,
+    sigmas=None,
+    **kwargs,
+):
+    """
+    Call `scheduler.set_timesteps` and return `(timesteps, num_inference_steps)`.
+    A custom `timesteps` or `sigmas` schedule (at most one of them) replaces
+    `num_inference_steps`; `device` and `kwargs` are forwarded in every case.
+    Raises ValueError if both are passed or `set_timesteps` lacks the parameter.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError(
+            "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
+        )
+    if timesteps is None and sigmas is None:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        return scheduler.timesteps, num_inference_steps
+    accepted = inspect.signature(scheduler.set_timesteps).parameters
+    if timesteps is not None:
+        if "timesteps" not in accepted:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+    else:
+        if "sigmas" not in accepted:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+    schedule = scheduler.timesteps
+    return schedule, len(schedule)
+
+
+class FlowDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `FlowDPMSolverMultistepScheduler` is a fast dedicated high-order solver for diffusion ODEs.
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model. This determines the resolution of the diffusion process.
+        solver_order (`int`, defaults to 2):
+            The DPMSolver order which can be `1`, `2`, or `3`. It is recommended to use `solver_order=2` for guided
+            sampling, and `solver_order=3` for unconditional sampling. This affects the number of model outputs stored
+            and used in multistep updates.
+        prediction_type (`str`, defaults to "flow_prediction"):
+            Prediction type of the scheduler function; must be `flow_prediction` for this scheduler, which predicts
+            the flow of the diffusion process.
+        shift (`float`, *optional*, defaults to 1.0):
+            A factor used to adjust the sigmas in the noise schedule. It modifies the step sizes during the sampling
+            process.
+        use_dynamic_shifting (`bool`, defaults to `False`):
+            Whether to apply dynamic shifting to the timesteps based on image resolution. If `True`, the shifting is
+            applied on the fly.
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This method adjusts the predicted sample to prevent
+            saturation and improve photorealism.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++"`.
+        algorithm_type (`str`, defaults to `dpmsolver++`):
+            Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, `sde-dpmsolver` or `sde-dpmsolver++`. The
+            `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927)
+            paper, and the `dpmsolver++` type implements the algorithms in the
+            [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
+            `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
+        solver_type (`str`, defaults to `midpoint`):
+            Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
+            sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
+        lower_order_final (`bool`, defaults to `True`):
+            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
+            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
+        euler_at_final (`bool`, defaults to `False`):
+            Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail
+            richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
+            steps, but sometimes may result in blurring.
+        final_sigmas_type (`str`, *optional*, defaults to "zero"):
+            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+        lambda_min_clipped (`float`, defaults to `-inf`):
+            Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
+            cosine (`squaredcos_cap_v2`) noise schedule.
+        variance_type (`str`, *optional*):
+            Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
+            contains the predicted Gaussian variance.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        solver_order: int = 2,
+        prediction_type: str = "flow_prediction",
+        shift: Optional[float] = 1.0,
+        use_dynamic_shifting=False,
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "dpmsolver++",
+        solver_type: str = "midpoint",
+        lower_order_final: bool = True,
+        euler_at_final: bool = False,
+        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+        lambda_min_clipped: float = -float("inf"),
+        variance_type: Optional[str] = None,
+        invert_sigmas: bool = False,
+    ):
+        """Validate the solver options and build the training-time sigma and timestep tables."""
+        if algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
+            deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead"
+            deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0",
+                      deprecation_message)
+
+        # "deis" and the "logrho"/"bh1"/"bh2" solver types are remapped in the config; other unknown names raise
+        if algorithm_type not in [
+                "dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"
+        ]:
+            if algorithm_type == "deis":
+                self.register_to_config(algorithm_type="dpmsolver++")
+            else:
+                raise NotImplementedError(
+                    f"{algorithm_type} is not implemented for {self.__class__}")
+
+        if solver_type not in ["midpoint", "heun"]:
+            if solver_type in ["logrho", "bh1", "bh2"]:
+                self.register_to_config(solver_type="midpoint")
+            else:
+                raise NotImplementedError(
+                    f"{solver_type} is not implemented for {self.__class__}")
+
+        # NOTE(review): the remap above does not rebind the local, so "deis" still reaches this check under its old name
+        if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"
+                                 ] and final_sigmas_type == "zero":
+            raise ValueError(
+                f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please choose `sigma_min` instead."
+            )
+
+        # training-time sigma table: alphas rise from 1/N to 1, so sigmas fall from 1 - 1/N to 0
+        self.num_inference_steps = None
+        alphas = np.linspace(1, 1 / num_train_timesteps,
+                             num_train_timesteps)[::-1].copy()
+        sigmas = 1.0 - alphas
+        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32)
+
+        if not use_dynamic_shifting:
+            # static shift baked in here; with use_dynamic_shifting the shift is applied later in `set_timesteps` via `mu`
+            sigmas = shift * sigmas / (1 +
+                                       (shift - 1) * sigmas)  # pyright: ignore
+
+        self.sigmas = sigmas
+        self.timesteps = sigmas * num_train_timesteps
+
+        self.model_outputs = [None] * solver_order
+        self.lower_order_nums = 0
+        self._step_index = None
+        self._begin_index = None
+
+        self.sigma_min = self.sigmas[-1].item()
+        self.sigma_max = self.sigmas[0].item()
+
+    @property
+    def step_index(self):
+        """
+        Index of the current step into `self.sigmas`; `None` after `__init__` and after every `set_timesteps` call.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        Index of the first timestep as stored by `set_begin_index`; `None` after `__init__` and after every `set_timesteps` call.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Store the index of the first timestep. Note that `set_timesteps` resets the stored value to `None`.
+        Args:
+            begin_index (`int`, defaults to 0):
+                The value exposed afterwards through the `begin_index` property.
+        """
+        self._begin_index = begin_index
+
+    # Modified from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.set_timesteps
+    def set_timesteps(
+        self,
+        num_inference_steps: Union[int, None] = None,
+        device: Union[str, torch.device] = None,
+        sigmas: Optional[List[float]] = None,
+        mu: Optional[Union[float, None]] = None,
+        shift: Optional[Union[float, None]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        Args:
+            num_inference_steps (`int`, *optional*): number of steps; required when `sigmas` is not given.
+            device (`str` or `torch.device`, *optional*): device for `self.timesteps`; `None` leaves them unmoved.
+            sigmas (`List[float]` or `np.ndarray`, *optional*): custom sigmas; the final sigma is appended here.
+            mu (`float`, *optional*): dynamic-shift parameter; required when `use_dynamic_shifting` is `True`.
+            shift (`float`, *optional*): static shift when not shifting dynamically; defaults to `config.shift`.
+        Raises:
+            ValueError: if a required argument is missing or `final_sigmas_type` is unknown.
+        """
+        if self.config.use_dynamic_shifting and mu is None:
+            raise ValueError(
+                " you have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`"
+            )
+
+        if sigmas is None:
+            if num_inference_steps is None:
+                raise ValueError(
+                    "either `num_inference_steps` or `sigmas` must be passed")
+            sigmas = np.linspace(self.sigma_max, self.sigma_min,
+                                 num_inference_steps + 1)[:-1]
+        # a plain list would not support the element-wise arithmetic below
+        sigmas = np.asarray(sigmas)
+
+        if self.config.use_dynamic_shifting:
+            sigmas = self.time_shift(mu, 1.0, sigmas)  # pyright: ignore
+        else:
+            if shift is None:
+                shift = self.config.shift
+            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
+
+        if self.config.final_sigmas_type == "sigma_min":
+            # no `alphas_cumprod` exists here; use the last training sigma from `__init__`
+            sigma_last = self.sigma_min
+        elif self.config.final_sigmas_type == "zero":
+            sigma_last = 0
+        else:
+            raise ValueError(
+                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+            )
+
+        timesteps = sigmas * self.config.num_train_timesteps
+        sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
+
+        self.sigmas = torch.from_numpy(sigmas)
+        # the cast to int64 truncates fractional timesteps
+        self.timesteps = torch.from_numpy(timesteps).to(
+            device=device, dtype=torch.int64)
+        self.num_inference_steps = len(timesteps)
+
+        # forget the multistep history and the position of any previous run
+        self.model_outputs = [None] * self.config.solver_order
+        self.lower_order_nums = 0
+        self._step_index = None
+        self._begin_index = None
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        Apply dynamic thresholding (https://arxiv.org/abs/2205.11487) to a predicted sample.
+        For every batch item, s is the `config.dynamic_thresholding_ratio` quantile of the absolute
+        values, clamped to [1, `config.sample_max_value`]; the item is clipped to [-s, s] and divided by s.
+        Args:
+            sample (`torch.Tensor`):
+                Tensor with at least two dimensions; the first is treated as the batch.
+        Returns:
+            `torch.Tensor`: thresholded tensor with the shape and dtype of `sample`.
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            # upcast for quantile calculation, and clamp not implemented for cpu half
+            sample = sample.float()
+
+        # Flatten each batch item. `math.prod` stays an int even when there are
+        # no trailing dims, whereas `np.prod([])` is the float 1.0, which
+        # `reshape` rejects.
+        sample = sample.reshape(batch_size, channels * math.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+        s = torch.quantile(
+            abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = torch.clamp(s, min=1, max=self.config.sample_max_value)
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        # "we threshold xt0 to the range [-s, s] and then divide by s"
+        sample = torch.clamp(sample, -s, s) / s
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma):
+        return sigma * self.config.num_train_timesteps  # timestep = sigma scaled by the number of training timesteps
+
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        return 1 - sigma, sigma  # (alpha, sigma) pair with alpha = 1 - sigma
+
+    # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.set_timesteps
+    def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
+        return math.exp(mu) / (math.exp(mu) + (1 / t - 1)**sigma)  # reduces to t when mu == 0 and sigma == 1
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.convert_model_output
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
+        designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
+        integral of the data prediction model.
+        Both conversions assume x_t = (1 - sigma) * x0 + sigma * noise with the flow output v = noise - x0, so
+        x0 = x_t - sigma * v (DPM-Solver++) and noise = x_t + (1 - sigma) * v (DPM-Solver).
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            sample (`torch.Tensor`):
+                The current sample; passed by keyword, or positionally after the deprecated `timestep`.
+        Returns:
+            `torch.Tensor`:
+                The predicted x0 for the `++` algorithm types, the predicted noise for the others.
+        Raises:
+            ValueError: if `sample` is missing or `prediction_type` is not `flow_prediction`.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError(
+                    "missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        # DPM-Solver++ needs to solve an integral of the data prediction model.
+        if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
+            if self.config.prediction_type == "flow_prediction":
+                sigma_t = self.sigmas[self.step_index]
+                x0_pred = sample - sigma_t * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be"
+                    " `flow_prediction` for the FlowDPMSolverMultistepScheduler."
+                )
+
+            if self.config.thresholding:
+                x0_pred = self._threshold_sample(x0_pred)
+
+            return x0_pred
+
+        # DPM-Solver needs to solve an integral of the noise prediction model.
+        elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
+            if self.config.prediction_type == "flow_prediction":
+                sigma_t = self.sigmas[self.step_index]
+                epsilon = sample + (1 - sigma_t) * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be"
+                    " `flow_prediction` for the FlowDPMSolverMultistepScheduler."
+                )
+
+            if self.config.thresholding:
+                # noise = v + x0, rebuilt from the thresholded x0 prediction
+                x0_pred = sample - sigma_t * model_output
+                x0_pred = self._threshold_sample(x0_pred)
+                epsilon = model_output + x0_pred
+
+            return epsilon
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.dpm_solver_first_order_update
+    def dpm_solver_first_order_update(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One first-order DPMSolver step from `sigmas[step_index]` to `sigmas[step_index + 1]`.
+        Args:
+            model_output (`torch.Tensor`): output the update formula multiplies; presumably already
+                converted by `convert_model_output` (confirm against the caller).
+            sample (`torch.Tensor`): current sample; keyword, or third positional argument.
+            noise (`torch.Tensor`, *optional*): required (asserted) by the two `sde-` algorithm types.
+            *args / **kwargs: deprecated `timestep` / `prev_timestep`; passed to `deprecate`, otherwise ignored.
+        Returns:
+            `torch.Tensor`: the sample at the next sigma of the schedule.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop(
+            "prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(
+                    " missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[
+            self.step_index]  # pyright: ignore
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)  # lambda = log(alpha / sigma); +inf when sigma_t == 0
+        lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
+
+        h = lambda_t - lambda_s  # step size measured in lambda
+        if self.config.algorithm_type == "dpmsolver++":
+            x_t = (sigma_t /
+                   sigma_s) * sample - (alpha_t *
+                                        (torch.exp(-h) - 1.0)) * model_output
+        elif self.config.algorithm_type == "dpmsolver":
+            x_t = (alpha_t /
+                   alpha_s) * sample - (sigma_t *
+                                        (torch.exp(h) - 1.0)) * model_output
+        elif self.config.algorithm_type == "sde-dpmsolver++":
+            assert noise is not None
+            x_t = ((sigma_t / sigma_s * torch.exp(-h)) * sample +
+                   (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output +
+                   sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise)
+        elif self.config.algorithm_type == "sde-dpmsolver":
+            assert noise is not None
+            x_t = ((alpha_t / alpha_s) * sample - 2.0 *
+                   (sigma_t * (torch.exp(h) - 1.0)) * model_output +
+                   sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise)
+        return x_t  # pyright: ignore
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_second_order_update
+    def multistep_dpm_solver_second_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        noise: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One second-order multistep DPMSolver step from `sigmas[step_index]` to `sigmas[step_index + 1]`.
+        Reads `sigmas[step_index - 1]`, so it presumably runs only once one earlier step exists.
+        Args:
+            model_output_list (`List[torch.Tensor]`): model outputs; `[-1]` and `[-2]` are used.
+            sample (`torch.Tensor`): current sample; keyword, or third positional argument.
+            noise (`torch.Tensor`, *optional*): required (asserted) by the two `sde-` algorithm types.
+            *args / **kwargs: deprecated `timestep_list` / `prev_timestep`; passed to `deprecate`, otherwise ignored.
+        Returns:
+            `torch.Tensor`: the sample at the next sigma of the schedule.
+        """
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop(
+            "timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop(
+            "prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(
+                    " missing `sample` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s0, sigma_s1 = (
+            self.sigmas[self.step_index + 1],  # pyright: ignore
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],  # pyright: ignore
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)  # lambda = log(alpha / sigma); +inf when sigma_t == 0
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1  # this step and the previous step, measured in lambda
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)  # D1: difference of the two outputs scaled by 1 / r0
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = ((sigma_t / sigma_s0) * sample -
+                       (alpha_t * (torch.exp(-h) - 1.0)) * D0 - 0.5 *
+                       (alpha_t * (torch.exp(-h) - 1.0)) * D1)
+            elif self.config.solver_type == "heun":
+                x_t = ((sigma_t / sigma_s0) * sample -
+                       (alpha_t * (torch.exp(-h) - 1.0)) * D0 +
+                       (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1)
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = ((alpha_t / alpha_s0) * sample -
+                       (sigma_t * (torch.exp(h) - 1.0)) * D0 - 0.5 *
+                       (sigma_t * (torch.exp(h) - 1.0)) * D1)
+            elif self.config.solver_type == "heun":
+                x_t = ((alpha_t / alpha_s0) * sample -
+                       (sigma_t * (torch.exp(h) - 1.0)) * D0 -
+                       (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1)
+        elif self.config.algorithm_type == "sde-dpmsolver++":
+            assert noise is not None
+            if self.config.solver_type == "midpoint":
+                x_t = ((sigma_t / sigma_s0 * torch.exp(-h)) * sample +
+                       (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + 0.5 *
+                       (alpha_t * (1 - torch.exp(-2.0 * h))) * D1 +
+                       sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise)
+            elif self.config.solver_type == "heun":
+                x_t = ((sigma_t / sigma_s0 * torch.exp(-h)) * sample +
+                       (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 +
+                       (alpha_t * ((1.0 - torch.exp(-2.0 * h)) /
+                                   (-2.0 * h) + 1.0)) * D1 +
+                       sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise)
+        elif self.config.algorithm_type == "sde-dpmsolver":
+            assert noise is not None
+            if self.config.solver_type == "midpoint":
+                x_t = ((alpha_t / alpha_s0) * sample - 2.0 *
+                       (sigma_t * (torch.exp(h) - 1.0)) * D0 -
+                       (sigma_t * (torch.exp(h) - 1.0)) * D1 +
+                       sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise)
+            elif self.config.solver_type == "heun":
+                x_t = ((alpha_t / alpha_s0) * sample - 2.0 *
+                       (sigma_t * (torch.exp(h) - 1.0)) * D0 - 2.0 *
+                       (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 +
+                       sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise)
+        return x_t  # pyright: ignore
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_third_order_update
+    def multistep_dpm_solver_third_order_update(
+        self,
+        model_output_list: List[torch.Tensor],
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One third-order multistep DPMSolver step from `sigmas[step_index]` to `sigmas[step_index + 1]`.
+        Reads `sigmas[step_index - 2]`, so it presumably runs only once two earlier steps exist.
+        Args:
+            model_output_list (`List[torch.Tensor]`): model outputs; `[-1]`, `[-2]` and `[-3]` are used.
+            sample (`torch.Tensor`): current sample; keyword, or third positional argument.
+            *args / **kwargs: deprecated `timestep_list` / `prev_timestep`; passed to `deprecate`, otherwise ignored.
+        Returns:
+            `torch.Tensor`: the sample at the next sigma of the schedule.
+        NOTE(review): only `dpmsolver++` and `dpmsolver` assign `x_t`; an `sde-` algorithm type reaches `return` unbound.
+        """
+
+        timestep_list = args[0] if len(args) > 0 else kwargs.pop(
+            "timestep_list", None)
+        prev_timestep = args[1] if len(args) > 1 else kwargs.pop(
+            "prev_timestep", None)
+        if sample is None:
+            if len(args) > 2:
+                sample = args[2]
+            else:
+                raise ValueError(
+                    " missing`sample` as a required keyward argument")
+        if timestep_list is not None:
+            deprecate(
+                "timestep_list",
+                "1.0.0",
+                "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma_t, sigma_s0, sigma_s1, sigma_s2 = (
+            self.sigmas[self.step_index + 1],  # pyright: ignore
+            self.sigmas[self.step_index],
+            self.sigmas[self.step_index - 1],  # pyright: ignore
+            self.sigmas[self.step_index - 2],  # pyright: ignore
+        )
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
+        alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)  # lambda = log(alpha / sigma); +inf when sigma_t == 0
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
+        lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2)
+
+        m0, m1, m2 = model_output_list[-1], model_output_list[
+            -2], model_output_list[-3]
+
+        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2  # this step and the two before it, in lambda
+        r0, r1 = h_0 / h, h_1 / h
+        D0 = m0
+        D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)  # scaled differences of neighbouring outputs
+        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = ((sigma_t / sigma_s0) * sample -
+                   (alpha_t * (torch.exp(-h) - 1.0)) * D0 +
+                   (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 -
+                   (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2)
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = ((alpha_t / alpha_s0) * sample - (sigma_t *
+                                                    (torch.exp(h) - 1.0)) * D0 -
+                   (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 -
+                   (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2)
+        return x_t  # pyright: ignore
+
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+
+        indices = (schedule_timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+
+        return indices[pos].item()
+
+    def _init_step_index(self, timestep):
+        """
+        Initialize the step_index counter for the scheduler.
+        """
+
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+
+    # Modified from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.step
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the multistep DPMSolver.
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int` or `torch.Tensor`):
+                The current discrete timestep; only used to initialise `step_index` when it is still `None`.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator, used to draw noise for the SDE variants when `variance_noise` is `None`.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`LEdits++`].
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
+        # Improve numerical stability for small number of steps
+        lower_order_final = (self.step_index == len(self.timesteps) - 1) and (
+            self.config.euler_at_final or
+            (self.config.lower_order_final and len(self.timesteps) < 15) or
+            self.config.final_sigmas_type == "zero")
+        lower_order_second = ((self.step_index == len(self.timesteps) - 2) and
+                              self.config.lower_order_final and
+                              len(self.timesteps) < 15)
+        # Convert the raw output, then shift the history left and store the newest entry last
+        model_output = self.convert_model_output(model_output, sample=sample)
+        for i in range(self.config.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+        self.model_outputs[-1] = model_output
+
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+        if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"
+                                         ] and variance_noise is None:
+            noise = randn_tensor(
+                model_output.shape,
+                generator=generator,
+                device=model_output.device,
+                dtype=torch.float32)
+        elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
+            noise = variance_noise.to(
+                device=model_output.device,
+                dtype=torch.float32)  # pyright: ignore
+        else:
+            noise = None
+        # Choose the update order from solver_order, the outputs seen so far and the lower-order flags
+        if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+            prev_sample = self.dpm_solver_first_order_update(
+                model_output, sample=sample, noise=noise)
+        elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+            prev_sample = self.multistep_dpm_solver_second_order_update(
+                self.model_outputs, sample=sample, noise=noise)
+        else:
+            prev_sample = self.multistep_dpm_solver_third_order_update(
+                self.model_outputs, sample=sample)
+        # Count the outputs seen so far, capped at solver_order
+        if self.lower_order_nums < self.config.solver_order:
+            self.lower_order_nums += 1
+
+        # Cast sample back to expected dtype
+        prev_sample = prev_sample.to(model_output.dtype)
+
+        # upon completion increase step index by one
+        self._step_index += 1  # pyright: ignore
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.scale_model_input
+    def scale_model_input(self, sample: torch.Tensor, *args,
+                          **kwargs) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(
+            device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(
+                timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(
+                original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(
+                original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [
+                self.index_for_timestep(t, schedule_timesteps)
+                for t in timesteps
+            ]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        noisy_samples = alpha_t * original_samples + sigma_t * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps  # the configured training step count
diff --git a/wan/utils/fm_solvers_unipc.py b/wan/utils/fm_solvers_unipc.py
new file mode 100644
index 0000000000000000000000000000000000000000..57321baa35359782b33143321cd31c8d934a7b29
--- /dev/null
+++ b/wan/utils/fm_solvers_unipc.py
@@ -0,0 +1,800 @@
+# Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py
+# Convert unipc for flow matching
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import (KarrasDiffusionSchedulers,
+                                                   SchedulerMixin,
+                                                   SchedulerOutput)
+from diffusers.utils import deprecate, is_scipy_available
+
+if is_scipy_available():
+    import scipy.stats
+
+
+class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `UniPCMultistepScheduler` is a training-free framework designed for the fast sampling of diffusion models.
+
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        solver_order (`int`, default `2`):
+            The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1`
+            due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for
+            unconditional sampling.
+        prediction_type (`str`, defaults to "flow_prediction"):
+            Prediction type of the scheduler function; must be `flow_prediction` for this scheduler, which predicts
+            the flow of the diffusion process.
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`.
+        predict_x0 (`bool`, defaults to `True`):
+            Whether to use the updating algorithm on the predicted x0.
+        solver_type (`str`, default `bh2`):
+            Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2`
+            otherwise.
+        lower_order_final (`bool`, default `True`):
+            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
+            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
+        disable_corrector (`list`, default `[]`):
+            Decides which step to disable the corrector to mitigate the misalignment between `epsilon_theta(x_t, c)`
+            and `epsilon_theta(x_t^c, c)` which can influence convergence for a large guidance scale. Corrector is
+            usually disabled during the first few steps.
+        solver_p (`SchedulerMixin`, default `None`):
+            Any other scheduler that if specified, the algorithm becomes `solver_p + UniC`.
+        shift (`float`, *optional*, defaults to 1.0):
+            Static shift applied to the sigma schedule as `shift * sigma / (1 + (shift - 1) * sigma)`. Not applied
+            when `use_dynamic_shifting=True`.
+        use_dynamic_shifting (`bool`, defaults to `False`):
+            Whether to shift the sigmas in `set_timesteps` with the `mu` passed there instead of the static `shift`.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        final_sigmas_type (`str`, defaults to `"zero"`):
+            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+    """
+
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+            self,
+            num_train_timesteps: int = 1000,
+            solver_order: int = 2,
+            prediction_type: str = "flow_prediction",
+            shift: Optional[float] = 1.0,
+            use_dynamic_shifting=False,
+            thresholding: bool = False,
+            dynamic_thresholding_ratio: float = 0.995,
+            sample_max_value: float = 1.0,
+            predict_x0: bool = True,
+            solver_type: str = "bh2",
+            lower_order_final: bool = True,
+            disable_corrector: List[int] = [],
+            solver_p: SchedulerMixin = None,
+            timestep_spacing: str = "linspace",
+            steps_offset: int = 0,
+            final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+    ):
+
+        if solver_type not in ["bh1", "bh2"]:
+            if solver_type in ["midpoint", "heun", "logrho"]:
+                self.register_to_config(solver_type="bh2")
+            else:
+                raise NotImplementedError(
+                    f"{solver_type} is not implemented for {self.__class__}")
+
+        self.predict_x0 = predict_x0
+        # setable values
+        self.num_inference_steps = None
+        alphas = np.linspace(1, 1 / num_train_timesteps,
+                             num_train_timesteps)[::-1].copy()
+        sigmas = 1.0 - alphas
+        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32)
+
+        if not use_dynamic_shifting:
+            # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution
+            sigmas = shift * sigmas / (1 +
+                                       (shift - 1) * sigmas)  # pyright: ignore
+
+        self.sigmas = sigmas
+        self.timesteps = sigmas * num_train_timesteps
+
+        self.model_outputs = [None] * solver_order
+        self.timestep_list = [None] * solver_order
+        self.lower_order_nums = 0
+        self.disable_corrector = disable_corrector
+        self.solver_p = solver_p
+        self.last_sample = None
+        self._step_index = None
+        self._begin_index = None
+
+        self.sigmas = self.sigmas.to(
+            "cpu")  # to avoid too much CPU/GPU communication
+        self.sigma_min = self.sigmas[-1].item()
+        self.sigma_max = self.sigmas[0].item()
+
+    @property
+    def step_index(self):
+        """
+        The index counter for the current timestep (`None` until initialised). It increases by 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep, set from the pipeline with `set_begin_index`; `None` after `set_timesteps`.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0) -> None:
+        """
+        Sets the begin index for the scheduler. This function should be run from the pipeline before inference
+        and after `set_timesteps`, because `set_timesteps` resets the begin index to `None`.
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    # Modified from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.set_timesteps
+    def set_timesteps(
+        self,
+        num_inference_steps: Union[int, None] = None,
+        device: Union[str, torch.device] = None,
+        sigmas: Optional[List[float]] = None,
+        mu: Optional[Union[float, None]] = None,
+        shift: Optional[Union[float, None]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        Args:
+            num_inference_steps (`int`):
+                Total number of the spacing of the time steps.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+
+        if self.config.use_dynamic_shifting and mu is None:
+            raise ValueError(
+                " you have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`"
+            )
+
+        if sigmas is None:
+            sigmas = np.linspace(self.sigma_max, self.sigma_min,
+                                 num_inference_steps +
+                                 1).copy()[:-1]  # pyright: ignore
+
+        if self.config.use_dynamic_shifting:
+            sigmas = self.time_shift(mu, 1.0, sigmas)  # pyright: ignore
+        else:
+            if shift is None:
+                shift = self.config.shift
+            sigmas = shift * sigmas / (1 +
+                                       (shift - 1) * sigmas)  # pyright: ignore
+
+        if self.config.final_sigmas_type == "sigma_min":
+            sigma_last = ((1 - self.alphas_cumprod[0]) /
+                          self.alphas_cumprod[0])**0.5
+        elif self.config.final_sigmas_type == "zero":
+            sigma_last = 0
+        else:
+            raise ValueError(
+                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+            )
+
+        timesteps = sigmas * self.config.num_train_timesteps
+        sigmas = np.concatenate([sigmas, [sigma_last]
+                                ]).astype(np.float32)  # pyright: ignore
+
+        self.sigmas = torch.from_numpy(sigmas)
+        self.timesteps = torch.from_numpy(timesteps).to(
+            device=device, dtype=torch.int64)
+
+        self.num_inference_steps = len(timesteps)
+
+        self.model_outputs = [
+            None,
+        ] * self.config.solver_order
+        self.lower_order_nums = 0
+        self.last_sample = None
+        if self.solver_p:
+            self.solver_p.set_timesteps(self.num_inference_steps, device=device)
+
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to(
+            "cpu")  # to avoid too much CPU/GPU communication
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float(
+            )  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(
+            abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(
+            1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(
+            sample, -s, s
+        ) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma):
+        return sigma * self.config.num_train_timesteps  # timestep = sigma scaled by the training step count
+
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        return 1 - sigma, sigma  # (alpha_t, sigma_t) with alpha_t = 1 - sigma_t
+
+    # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.time_shift
+    def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
+        return math.exp(mu) / (math.exp(mu) + (1 / t - 1)**sigma)  # set_timesteps passes sigma=1.0 and the sigma schedule as t
+
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Convert the model output to the corresponding type the UniPC algorithm needs.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+
+        Returns:
+            `torch.Tensor`:
+                The converted model output.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError(
+                    "missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        sigma = self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+
+        if self.predict_x0:
+            if self.config.prediction_type == "flow_prediction":
+                sigma_t = self.sigmas[self.step_index]
+                x0_pred = sample - sigma_t * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
+                    " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
+                )
+
+            if self.config.thresholding:
+                x0_pred = self._threshold_sample(x0_pred)
+
+            return x0_pred
+        else:
+            if self.config.prediction_type == "flow_prediction":
+                sigma_t = self.sigmas[self.step_index]
+                epsilon = sample - (1 - sigma_t) * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
+                    " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
+                )
+
+            if self.config.thresholding:
+                sigma_t = self.sigmas[self.step_index]
+                x0_pred = sample - sigma_t * model_output
+                x0_pred = self._threshold_sample(x0_pred)
+                epsilon = model_output + x0_pred
+
+            return epsilon
+
+    def multistep_uni_p_bh_update(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: torch.Tensor = None,
+        order: int = None,  # pyright: ignore
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if it is specified.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The model output at the current timestep; only forwarded to `self.solver_p`, the UniP update reads `self.model_outputs`.
+            prev_timestep (`int`):
+                Deprecated and ignored; the sigmas are looked up via `self.step_index`.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            order (`int`):
+                The order of UniP at this timestep (corresponds to the *p* in UniPC-p).
+
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep.
+        """
+        prev_timestep = args[0] if len(args) > 0 else kwargs.pop(
+            "prev_timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError(
+                    " missing `sample` as a required keyward argument")
+        if order is None:
+            if len(args) > 2:
+                order = args[2]
+            else:
+                raise ValueError(
+                    " missing `order` as a required keyward argument")
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+        model_output_list = self.model_outputs
+
+        s0 = self.timestep_list[-1]
+        m0 = model_output_list[-1]
+        x = sample
+        # A configured `solver_p` replaces the UniP predictor entirely
+        if self.solver_p:
+            x_t = self.solver_p.step(model_output, s0, x).prev_sample
+            return x_t
+
+        sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[
+            self.step_index]  # pyright: ignore
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+
+        h = lambda_t - lambda_s0
+        device = sample.device
+        # For each older output: its lambda offset relative to h (rk) and the scaled difference to m0
+        rks = []
+        D1s = []
+        for i in range(1, order):
+            si = self.step_index - i  # pyright: ignore
+            mi = model_output_list[-(i + 1)]
+            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
+            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
+            rk = (lambda_si - lambda_s0) / h
+            rks.append(rk)
+            D1s.append((mi - m0) / rk)  # pyright: ignore
+
+        rks.append(1.0)
+        rks = torch.tensor(rks, device=device)
+
+        R = []
+        b = []
+        # The x0 parameterisation uses -h as the step
+        hh = -h if self.predict_x0 else h
+        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
+        h_phi_k = h_phi_1 / hh - 1
+
+        factorial_i = 1
+
+        if self.config.solver_type == "bh1":
+            B_h = hh
+        elif self.config.solver_type == "bh2":
+            B_h = torch.expm1(hh)
+        else:
+            raise NotImplementedError()
+        # Rows of R are powers of rks; b holds the matching right-hand side used to solve for rhos_p
+        for i in range(1, order + 1):
+            R.append(torch.pow(rks, i - 1))
+            b.append(h_phi_k * factorial_i / B_h)
+            factorial_i *= i + 1
+            h_phi_k = h_phi_k / hh - 1 / factorial_i
+
+        R = torch.stack(R)
+        b = torch.tensor(b, device=device)
+
+        if len(D1s) > 0:
+            D1s = torch.stack(D1s, dim=1)  # (B, K)
+            # for order 2, we use a simplified version
+            if order == 2:
+                rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device)
+            else:
+                rhos_p = torch.linalg.solve(R[:-1, :-1],
+                                            b[:-1]).to(device).to(x.dtype)
+        else:
+            D1s = None
+        # Term from m0 first, then the correction from the older outputs weighted by rhos_p
+        if self.predict_x0:
+            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
+            if D1s is not None:
+                pred_res = torch.einsum("k,bkc...->bc...", rhos_p,
+                                        D1s)  # pyright: ignore
+            else:
+                pred_res = 0
+            x_t = x_t_ - alpha_t * B_h * pred_res
+        else:
+            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
+            if D1s is not None:
+                pred_res = torch.einsum("k,bkc...->bc...", rhos_p,
+                                        D1s)  # pyright: ignore
+            else:
+                pred_res = 0
+            x_t = x_t_ - sigma_t * B_h * pred_res
+
+        x_t = x_t.to(x.dtype)
+        return x_t
+
+    def multistep_uni_c_bh_update(
+        self,
+        this_model_output: torch.Tensor,
+        *args,
+        last_sample: torch.Tensor = None,
+        this_sample: torch.Tensor = None,
+        order: int = None,  # pyright: ignore
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the UniC (B(h) version).
+
+        Args:
+            this_model_output (`torch.Tensor`):
+                The model outputs at `x_t`.
+            this_timestep (`int`, deprecated): Has no effect; passing it only triggers the `deprecate` call below.
+            last_sample (`torch.Tensor`):
+                The generated sample before the last predictor `x_{t-1}`.
+            this_sample (`torch.Tensor`):
+                The generated sample after the last predictor `x_{t}`.
+            order (`int`):
+                The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`.
+
+        Returns:
+            `torch.Tensor`: The corrected sample at the current timestep, cast to the dtype of `last_sample`.
+        Raises:
+            `ValueError`: If `last_sample`, `this_sample` or `order` is given neither by keyword nor positionally.
+        """
+        this_timestep = args[0] if len(args) > 0 else kwargs.pop(
+            "this_timestep", None)
+        if last_sample is None:
+            if len(args) > 1:
+                last_sample = args[1]
+            else:
+                raise ValueError(
+                    " missing`last_sample` as a required keyward argument")
+        if this_sample is None:
+            if len(args) > 2:
+                this_sample = args[2]
+            else:
+                raise ValueError(
+                    " missing`this_sample` as a required keyward argument")
+        if order is None:
+            if len(args) > 3:
+                order = args[3]
+            else:
+                raise ValueError(
+                    " missing`order` as a required keyward argument")
+        if this_timestep is not None:
+            deprecate(
+                "this_timestep",
+                "1.0.0",
+                "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+
+        model_output_list = self.model_outputs  # outputs stored by previous `step` calls
+
+        m0 = model_output_list[-1]  # most recently stored output
+        x = last_sample
+        x_t = this_sample
+        model_t = this_model_output
+
+        sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[
+            self.step_index - 1]  # pyright: ignore
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)  # lambda = log(alpha / sigma)
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
+
+        h = lambda_t - lambda_s0  # step size measured in lambda
+        device = this_sample.device
+
+        rks = []
+        D1s = []
+        for i in range(1, order):  # older stored outputs; the loop is empty for order 1
+            si = self.step_index - (i + 1)  # pyright: ignore
+            mi = model_output_list[-(i + 1)]
+            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
+            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
+            rk = (lambda_si - lambda_s0) / h
+            rks.append(rk)
+            D1s.append((mi - m0) / rk)  # pyright: ignore
+
+        rks.append(1.0)  # r = 1 is the current point t itself: (lambda_t - lambda_s0) / h
+        rks = torch.tensor(rks, device=device)
+
+        R = []
+        b = []
+
+        hh = -h if self.predict_x0 else h  # sign of the step depends on the prediction mode
+        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
+        h_phi_k = h_phi_1 / hh - 1
+
+        factorial_i = 1
+
+        if self.config.solver_type == "bh1":
+            B_h = hh
+        elif self.config.solver_type == "bh2":
+            B_h = torch.expm1(hh)
+        else:
+            raise NotImplementedError()
+
+        for i in range(1, order + 1):  # one row of R and one entry of b per power i - 1
+            R.append(torch.pow(rks, i - 1))
+            b.append(h_phi_k * factorial_i / B_h)
+            factorial_i *= i + 1
+            h_phi_k = h_phi_k / hh - 1 / factorial_i
+
+        R = torch.stack(R)
+        b = torch.tensor(b, device=device)
+
+        if len(D1s) > 0:
+            D1s = torch.stack(D1s, dim=1)
+        else:
+            D1s = None
+
+        # for order 1, we use a simplified version
+        if order == 1:
+            rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
+        else:
+            rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype)
+
+        if self.predict_x0:
+            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
+            if D1s is not None:
+                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
+            else:
+                corr_res = 0
+            D1_t = model_t - m0  # difference contributed by the new model evaluation
+            x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
+        else:
+            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
+            if D1s is not None:
+                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
+            else:
+                corr_res = 0
+            D1_t = model_t - m0  # difference contributed by the new model evaluation
+            x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
+        x_t = x_t.to(x.dtype)
+        return x_t
+
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        """
+        Return the schedule index of `timestep`, looked up in `schedule_timesteps` (default `self.timesteps`).
+
+        When the timestep occurs more than once the second occurrence is returned, so that a run started in
+        the middle of the denoising schedule (e.g. image-to-image) does not accidentally skip a sigma.
+        """
+        schedule = self.timesteps if schedule_timesteps is None else schedule_timesteps
+        matches = (schedule == timestep).nonzero()
+        # The first match is used only when it is the sole match; otherwise take the second one.
+        if len(matches) > 1:
+            return matches[1].item()
+        return matches[0].item()
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        """
+        Set `self._step_index`: the configured begin index if there is one, else the index of `timestep`.
+        """
+        if self.begin_index is not None:
+            self._step_index = self._begin_index
+            return
+        # No begin index: search the schedule, moving a tensor timestep to the schedule's device first.
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+        self._step_index = self.index_for_timestep(timestep)
+
+    def step(self,
+             model_output: torch.Tensor,
+             timestep: Union[int, torch.Tensor],
+             sample: torch.Tensor,
+             return_dict: bool = True,
+             generator=None) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the multistep UniPC.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int` or `torch.Tensor`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+            generator: Not read anywhere in this method.
+
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is `True`, else a tuple holding the sample.
+        Raises:
+            `ValueError`: If `self.num_inference_steps` is `None`, i.e. `set_timesteps` has not been run.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if self.step_index is None:  # counter not initialised yet
+            self._init_step_index(timestep)
+
+        use_corrector = (
+            self.step_index > 0 and
+            self.step_index - 1 not in self.disable_corrector and
+            self.last_sample is not None  # pyright: ignore
+        )
+
+        model_output_convert = self.convert_model_output(
+            model_output, sample=sample)
+        if use_corrector:  # refine the incoming sample with UniC before predicting the next one
+            sample = self.multistep_uni_c_bh_update(
+                this_model_output=model_output_convert,
+                last_sample=self.last_sample,
+                this_sample=sample,
+                order=self.this_order,
+            )
+
+        for i in range(self.config.solver_order - 1):  # shift the history one slot to the left
+            self.model_outputs[i] = self.model_outputs[i + 1]
+            self.timestep_list[i] = self.timestep_list[i + 1]
+
+        self.model_outputs[-1] = model_output_convert
+        self.timestep_list[-1] = timestep  # pyright: ignore
+
+        if self.config.lower_order_final:
+            this_order = min(self.config.solver_order,
+                             len(self.timesteps) -
+                             self.step_index)  # pyright: ignore
+        else:
+            this_order = self.config.solver_order
+
+        self.this_order = min(this_order,
+                              self.lower_order_nums + 1)  # warmup for multistep
+        assert self.this_order > 0
+
+        self.last_sample = sample  # read back as `last_sample` by the corrector on the next call
+        prev_sample = self.multistep_uni_p_bh_update(
+            model_output=model_output,  # pass the original non-converted model output, in case solver-p is used
+            sample=sample,
+            order=self.this_order,
+        )
+
+        if self.lower_order_nums < self.config.solver_order:
+            self.lower_order_nums += 1
+
+        # upon completion increase step index by one
+        self._step_index += 1  # pyright: ignore
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(
+        self, sample: torch.Tensor, *args, **kwargs
+    ) -> torch.Tensor:
+        """
+        No-op kept for interchangeability with schedulers that scale the denoising model input per timestep.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample. Extra positional and keyword arguments are accepted and ignored.
+
+        Returns:
+            `torch.Tensor`:
+                The very same `sample` object, unmodified.
+        """
+        return sample
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        """
+        Mix `noise` into `original_samples` at the noise level selected by `timesteps`.
+
+        Args:
+            original_samples (`torch.Tensor`): Clean samples; the sigmas are moved to their device and dtype.
+            noise (`torch.Tensor`): Noise that is scaled by `sigma_t` and added.
+            timesteps (`torch.IntTensor`): Timesteps; one sigma is selected per entry.
+
+        Returns:
+            `torch.Tensor`:
+                `alpha_t * original_samples + sigma_t * noise`, with both factors derived from the chosen sigmas.
+        """
+        target = original_samples.device
+        sigmas = self.sigmas.to(device=target, dtype=original_samples.dtype)
+        # mps does not support float64, so floating-point timesteps are compared in float32 there
+        cast = {"dtype": torch.float32} if target.type == "mps" and torch.is_floating_point(timesteps) else {}
+        schedule_timesteps = self.timesteps.to(target, **cast)
+        timesteps = timesteps.to(target, **cast)
+
+        if self.begin_index is None:
+            # training, or the pipeline does not implement set_begin_index: look every timestep up
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        else:
+            # after the first denoising step (inpainting) use step_index, before it (img2img) begin_index
+            shared = self.step_index if self.step_index is not None else self.begin_index
+            step_indices = [shared] * timesteps.shape[0]
+
+        sigma = sigmas[step_indices].flatten()
+        # append trailing singleton axes until sigma has as many dims as original_samples
+        for _ in range(original_samples.dim() - sigma.dim()):
+            sigma = sigma.unsqueeze(-1)
+
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        return alpha_t * original_samples + sigma_t * noise
+
+    def __len__(self) -> int:
+        return self.config.num_train_timesteps  # length is the configured number of training timesteps
diff --git a/wan/utils/utils.py b/wan/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d72599967f0a5a491e722e7d7a942efe5137b210
--- /dev/null
+++ b/wan/utils/utils.py
@@ -0,0 +1,118 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import argparse
+import binascii
+import os
+import os.path as osp
+
+import imageio
+import torch
+import torchvision
+
+__all__ = ['cache_video', 'cache_image', 'str2bool']
+
+
+def rand_name(length=8, suffix=''):
+    """Return a random hex string of ``2 * length`` characters, plus an optional dotted suffix."""
+    # os.urandom yields `length` random bytes; hex-encoding doubles the character count.
+    stem = os.urandom(length).hex()
+    if suffix and not suffix.startswith('.'):
+        suffix = f'.{suffix}'  # normalise 'mp4' -> '.mp4'
+    return stem + (suffix or '')
+
+
+def cache_video(tensor,
+                save_file=None,
+                fps=30,
+                suffix='.mp4',
+                nrow=8,
+                normalize=True,
+                value_range=(-1, 1),
+                retry=5):
+    """Encode `tensor` as one libx264 video file and return its path, or None on failure.
+
+    Every slice along dim 2 of `tensor` is tiled with `torchvision.utils.make_grid`
+    (`nrow`, `normalize`, `value_range` are forwarded) and written as one frame.
+    The file goes to `save_file`, or to a random name under /tmp ending in `suffix`.
+    Up to `retry` attempts are made; the last error is printed if all of them fail.
+    """
+    cache_file = osp.join('/tmp', rand_name(suffix=suffix)) if save_file is None else save_file
+
+    error = None
+    for _ in range(retry):
+        try:
+            # Work on a separate name: rebinding `tensor` here would hand an
+            # already clamped/tiled/uint8 tensor to the next retry.
+            frames = tensor.clamp(min(value_range), max(value_range))
+            frames = torch.stack([
+                torchvision.utils.make_grid(u, nrow=nrow, normalize=normalize, value_range=value_range)
+                for u in frames.unbind(2)
+            ], dim=1).permute(1, 2, 3, 0)
+            frames = (frames * 255).type(torch.uint8).cpu()
+            writer = imageio.get_writer(cache_file, fps=fps, codec='libx264', quality=8)
+            try:
+                for frame in frames.numpy():
+                    writer.append_data(frame)
+            finally:
+                writer.close()  # release the writer even if a frame fails
+            return cache_file
+        except Exception as e:
+            error = e
+    print(f'cache_video failed, error: {error}', flush=True)
+    return None
+
+
+def cache_image(tensor,
+                save_file,
+                nrow=8,
+                normalize=True,
+                value_range=(-1, 1),
+                retry=5):
+    """Write `tensor` to `save_file` as an image grid; return the path, or None on failure.
+
+    The values are clamped to `value_range` and handed to
+    `torchvision.utils.save_image` together with `nrow` and `normalize`.
+    Up to `retry` attempts are made; if all of them fail the last error is
+    printed (matching `cache_video`) instead of being dropped silently.
+    """
+    # save_image derives the file format from the extension of `save_file`, so
+    # no suffix bookkeeping is needed here.
+    error = None
+    for _ in range(retry):
+        try:
+            clamped = tensor.clamp(min(value_range), max(value_range))
+            torchvision.utils.save_image(
+                clamped, save_file,
+                nrow=nrow, normalize=normalize, value_range=value_range)
+            return save_file
+        except Exception as e:
+            error = e
+    # Every attempt failed: report why, then signal failure to the caller.
+    print(f'cache_image failed, error: {error}', flush=True)
+    return None
+
+
+def str2bool(v):
+    """
+    Interpret a command-line style string as a boolean.
+
+    A `bool` is returned as is. Otherwise the lower-cased text must be one of
+    'yes', 'true', 't', 'y', '1' (-> True) or 'no', 'false', 'f', 'n', '0' (-> False).
+
+    Args:
+        v (str or bool): Value to interpret.
+
+    Returns:
+        bool: The parsed value.
+
+    Raises:
+        argparse.ArgumentTypeError: If the text matches neither list.
+    """
+    if isinstance(v, bool):
+        return v
+    # Accepted spellings, compared case-insensitively.
+    truthy = ('yes', 'true', 't', 'y', '1')
+    falsy = ('no', 'false', 'f', 'n', '0')
+    text = v.lower()
+    if text not in truthy + falsy:
+        raise argparse.ArgumentTypeError('Boolean value expected (True/False)')
+    return text in truthy
diff --git a/wan_cache_latents.py b/wan_cache_latents.py
new file mode 100644
index 0000000000000000000000000000000000000000..be35655c7f4ac9b2f8b5fc50b5890d43d41c3e39
--- /dev/null
+++ b/wan_cache_latents.py
@@ -0,0 +1,159 @@
+import argparse
+import os
+import glob
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from dataset import config_utils
+from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
+from PIL import Image
+
+import logging
+
+from dataset.image_video_dataset import ItemInfo, save_latent_cache_wan, ARCHITECTURE_WAN
+from utils.model_utils import str_to_dtype
+from wan.configs import wan_i2v_14B
+from wan.modules.vae import WanVAE
+from wan.modules.clip import CLIPModel
+import cache_latents
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def encode_and_save_batch(vae: WanVAE, clip: Optional[CLIPModel], batch: list[ItemInfo]):
+    """
+    Encode one batch of images/videos with the VAE and save a latent cache entry per item.
+
+    The item contents are stacked, permuted to B, C, F, H, W, scaled with `x / 127.5 - 1.0` and encoded by `vae`.
+    When `clip` is given (I2V), the first frame also goes through `clip.visual`, and a conditioning tensor `y`
+    is built from a first-frame mask concatenated with the VAE encoding of the first frame plus zero frames.
+
+    Args:
+        vae: VAE used for encoding; its `device` and `dtype` are applied to the inputs and to the latents.
+        clip: Optional CLIP model; with `None` the I2V extras are skipped and `None` is saved in their place.
+        batch: Items to encode; they are expected to share one size (only the first is named in the size error).
+
+    Raises:
+        ValueError: If the height or width of the stacked contents is below 8.
+    """
+    contents = torch.stack([torch.from_numpy(item.content) for item in batch])
+    if len(contents.shape) == 4:
+        contents = contents.unsqueeze(1)  # B, H, W, C -> B, F, H, W, C
+
+    contents = contents.permute(0, 4, 1, 2, 3).contiguous()  # B, C, F, H, W
+    contents = contents.to(vae.device, dtype=vae.dtype)
+    contents = contents / 127.5 - 1.0  # normalize to [-1, 1]
+
+    h, w = contents.shape[3], contents.shape[4]
+    if h < 8 or w < 8:
+        item = batch[0]  # other items should have the same size
+        raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
+
+    with torch.amp.autocast(device_type=vae.device.type, dtype=vae.dtype), torch.no_grad():
+        latent = vae.encode(contents)  # list of Tensor[C, F, H, W]
+    latent = torch.stack(latent, dim=0)  # B, C, F, H, W
+    latent = latent.to(vae.dtype)  # convert to bfloat16, we are not sure if this is correct
+
+    if clip is not None:
+        # extract first frame of contents
+        images = contents[:, :, 0:1, :, :]  # B, C, F, H, W, non contiguous view is fine
+
+        with torch.amp.autocast(device_type=clip.device.type, dtype=torch.float16), torch.no_grad():
+            clip_context = clip.visual(images)
+        clip_context = clip_context.to(torch.float16)  # convert to fp16
+
+        # encode image latent for I2V
+        B, _, _, lat_h, lat_w = latent.shape
+        F = contents.shape[2]
+
+        # Create mask for the required number of frames
+        msk = torch.ones(1, F, lat_h, lat_w, dtype=vae.dtype, device=vae.device)
+        msk[:, 1:] = 0
+        msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
+        msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
+        msk = msk.transpose(1, 2)  # 1, F, 4, H, W -> 1, 4, F, H, W
+        msk = msk.repeat(B, 1, 1, 1, 1)  # B, 4, F, H, W
+
+        # Zero padding for the required number of frames only
+        padding_frames = F - 1  # The first frame is the input image
+        images_resized = torch.concat([images, torch.zeros(B, 3, padding_frames, h, w, device=vae.device)], dim=2)
+        with torch.amp.autocast(device_type=vae.device.type, dtype=vae.dtype), torch.no_grad():
+            y = vae.encode(images_resized)
+        y = torch.stack(y, dim=0)  # B, C, F, H, W
+
+        y = y[:, :, :F]  # may be not needed
+        y = y.to(vae.dtype)  # convert to bfloat16
+        y = torch.concat([msk, y], dim=1)  # B, 4 + C, F, H, W
+
+    else:
+        clip_context = None
+        y = None
+
+    # One cache entry per item; cctx and y_i stay None when no CLIP model was given.
+    for i, item in enumerate(batch):
+        l = latent[i]
+        cctx = clip_context[i] if clip is not None else None
+        y_i = y[i] if clip is not None else None
+        save_latent_cache_wan(item, l, cctx, y_i)
+
+
+def main(args):
+    """Cache VAE latents (and, with --clip, the I2V extras) for every dataset in the dataset config.
+
+    Raises ValueError when no VAE checkpoint is given (unless only the debug view is requested).
+    """
+    device = torch.device(args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu")
+
+    # Load dataset config
+    blueprint_generator = BlueprintGenerator(ConfigSanitizer())
+    logger.info(f"Load dataset config from {args.dataset_config}")
+    user_config = config_utils.load_user_config(args.dataset_config)
+    blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_WAN)
+    train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
+    datasets = train_dataset_group.datasets
+
+    if args.debug_mode is not None:  # display-only mode: returns before any model is loaded
+        cache_latents.show_datasets(datasets, args.debug_mode, args.console_width, args.console_back, args.console_num_images)
+        return
+
+    # Raise instead of assert: assert statements are stripped under `python -O`.
+    if args.vae is None:
+        raise ValueError("vae checkpoint is required")
+    vae_path = args.vae
+    logger.info(f"Loading VAE model from {vae_path}")
+    vae_dtype = torch.bfloat16 if args.vae_dtype is None else str_to_dtype(args.vae_dtype)
+    cache_device = torch.device("cpu") if args.vae_cache_cpu else None
+    vae = WanVAE(vae_path=vae_path, device=device, dtype=vae_dtype, cache_device=cache_device)
+
+    clip = None  # stays None unless a CLIP checkpoint was supplied
+    if args.clip is not None:
+        clip = CLIPModel(dtype=wan_i2v_14B.i2v_14B["clip_dtype"], device=device, weight_path=args.clip)
+
+    # Encode images
+    def encode(one_batch: list[ItemInfo]):
+        encode_and_save_batch(vae, clip, one_batch)
+
+    cache_latents.encode_datasets(datasets, encode, args)
+
+
+def wan_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    """Register the Wan-specific latent-caching options on ``parser`` and return that same parser."""
+    # Plain switch: False unless the flag is present.
+    parser.add_argument("--vae_cache_cpu", help="cache features in VAE on CPU", action="store_true")
+
+    # Optional checkpoint path: stays None when the option is omitted.
+    clip_help = "text encoder (CLIP) checkpoint path, optional. If training I2V model, this is required"
+    parser.add_argument("--clip", default=None, type=str, help=clip_help)
+    return parser
+
+
+if __name__ == "__main__":
+    # Script entry point: the common caching options come from cache_latents,
+    # the Wan-specific ones are added on top by wan_setup_parser.
+    parser = wan_setup_parser(cache_latents.setup_parser_common())
+    args = parser.parse_args()
+    main(args)
diff --git a/wan_cache_text_encoder_outputs.py b/wan_cache_text_encoder_outputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..6253715c2231a5188f2c88718475ab0d377fbad1
--- /dev/null
+++ b/wan_cache_text_encoder_outputs.py
@@ -0,0 +1,107 @@
+import argparse
+import os
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from dataset import config_utils
+from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
+import accelerate
+
+from dataset.image_video_dataset import ARCHITECTURE_WAN, ItemInfo, save_text_encoder_output_cache_wan
+
+# for t5 config: all Wan2.1 models have the same config for t5
+from wan.configs import wan_t2v_14B
+
+import cache_text_encoder_outputs
+import logging
+
+from utils.model_utils import str_to_dtype
+from wan.modules.t5 import T5EncoderModel
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def encode_and_save_batch(
+    text_encoder: T5EncoderModel, batch: list[ItemInfo], device: torch.device, accelerator: Optional[accelerate.Accelerator]
+):
+    """Encode the captions of ``batch`` with the text encoder and cache one output per item."""
+    captions = [entry.caption for entry in batch]
+
+    # Gradients are never needed here; autocast is only entered when an accelerator is supplied.
+    with torch.no_grad():
+        if accelerator is None:
+            outputs = text_encoder(captions, device)
+        else:
+            with accelerator.autocast():
+                outputs = text_encoder(captions, device)
+
+    # zip() pairs each item with its own encoder output, in batch order.
+    for entry, output in zip(batch, outputs):
+        save_text_encoder_output_cache_wan(entry, output)
+
+
+def main(args):
+    """Encode every dataset caption with T5 and store the outputs in the text-encoder cache."""
+    if args.device is not None:
+        device = torch.device(args.device)
+    else:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Build the dataset group described by the dataset config file.
+    generator = BlueprintGenerator(ConfigSanitizer())
+    logger.info(f"Load dataset config from {args.dataset_config}")
+    user_config = config_utils.load_user_config(args.dataset_config)
+    blueprint = generator.generate(user_config, args, architecture=ARCHITECTURE_WAN)
+    datasets = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group).datasets
+
+    # All Wan2.1 models have the same config for t5; the accelerator is only defined for fp8 inference.
+    t5_config = wan_t2v_14B.t2v_14B
+    accelerator = None
+    if args.fp8_t5:
+        precision = "bf16" if t5_config.t5_dtype == torch.bfloat16 else "fp16"
+        accelerator = accelerate.Accelerator(mixed_precision=precision)
+
+    # existing_files = existing cache files, expected_paths = all cache paths in the dataset
+    existing_files, expected_paths = cache_text_encoder_outputs.prepare_cache_files_and_paths(datasets)
+
+    logger.info(f"Loading T5: {args.t5}")
+    text_encoder = T5EncoderModel(
+        text_len=t5_config.text_len,
+        dtype=t5_config.t5_dtype,
+        device=device,
+        weight_path=args.t5,
+        fp8=args.fp8_t5,
+    )
+
+    logger.info("Encoding with T5")
+
+    def encode_for_text_encoder(batch: list[ItemInfo]):
+        # Closure binding the encoder, device and accelerator for the batch processor.
+        encode_and_save_batch(text_encoder, batch, device, accelerator)
+
+    cache_text_encoder_outputs.process_text_encoder_batches(
+        args.num_workers, args.skip_existing, args.batch_size,
+        datasets, existing_files, expected_paths, encode_for_text_encoder,
+    )
+    del text_encoder
+
+    # Remove cache files that are not in the dataset.
+    cache_text_encoder_outputs.post_process_cache_files(datasets, existing_files, expected_paths)
+
+
+def wan_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    parser.add_argument("--t5", required=True, default=None, type=str, help="text encoder (T5) checkpoint path")  # mandatory path
+    parser.add_argument("--fp8_t5", help="use fp8 for Text Encoder model", action="store_true")  # switch, False when absent
+    return parser  # the parser that was passed in, now carrying the two T5 options
+
+
+if __name__ == "__main__":
+    # Script entry point: the common options come from cache_text_encoder_outputs,
+    # the T5-specific ones are added on top by wan_setup_parser.
+    parser = wan_setup_parser(cache_text_encoder_outputs.setup_parser_common())
+    args = parser.parse_args()
+    main(args)
diff --git a/wan_generate_video.py b/wan_generate_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..01dac297139321702455ea0902b38a551bc80a8d
--- /dev/null
+++ b/wan_generate_video.py
@@ -0,0 +1,1059 @@
+import argparse
+from datetime import datetime
+import gc
+import random
+import os
+import time
+import math
+from typing import Tuple, Optional, List, Union, Any
+
+import torch
+import accelerate
+from accelerate import Accelerator
+from safetensors.torch import load_file, save_file
+from safetensors import safe_open
+from PIL import Image
+import cv2
+import numpy as np
+import torchvision.transforms.functional as TF
+from tqdm import tqdm
+
+from networks import lora_wan
+from utils.safetensors_utils import mem_eff_save_file, load_safetensors
+from wan.configs import WAN_CONFIGS, SUPPORTED_SIZES
+import wan
+from wan.modules.model import WanModel, load_wan_model, detect_wan_sd_dtype
+from wan.modules.vae import WanVAE
+from wan.modules.t5 import T5EncoderModel
+from wan.modules.clip import CLIPModel
+from modules.scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
+from wan.utils.fm_solvers import FlowDPMSolverMultistepScheduler, get_sampling_sigmas, retrieve_timesteps
+from wan.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
+
+try:
+    from lycoris.kohya import create_network_from_weights
+except:
+    pass
+
+from utils.model_utils import str_to_dtype
+from utils.device_utils import clean_memory_on_device
+from hv_generate_video import save_images_grid, save_videos_grid, synchronize_device
+
+import logging
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def parse_args() -> argparse.Namespace:
+    """parse command line arguments"""
+    parser = argparse.ArgumentParser(description="Wan 2.1 inference script")
+
+    # WAN arguments
+    parser.add_argument("--ckpt_dir", type=str, default=None, help="The path to the checkpoint directory (Wan 2.1 official).")
+    parser.add_argument("--task", type=str, default="t2v-14B", choices=list(WAN_CONFIGS.keys()), help="The task to run.")
+    parser.add_argument(
+        "--sample_solver", type=str, default="unipc", choices=["unipc", "dpm++", "vanilla"], help="The solver used to sample."
+    )
+
+    parser.add_argument("--dit", type=str, default=None, help="DiT checkpoint path")
+    parser.add_argument("--vae", type=str, default=None, help="VAE checkpoint path")
+    parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is bfloat16")
+    parser.add_argument("--vae_cache_cpu", action="store_true", help="cache features in VAE on CPU")
+    parser.add_argument("--t5", type=str, default=None, help="text encoder (T5) checkpoint path")
+    parser.add_argument("--clip", type=str, default=None, help="text encoder (CLIP) checkpoint path")
+    # LoRA
+    parser.add_argument("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path")
+    parser.add_argument("--lora_multiplier", type=float, nargs="*", default=1.0, help="LoRA multiplier")
+    parser.add_argument(
+        "--save_merged_model",
+        type=str,
+        default=None,
+        help="Save merged model to path. If specified, no inference will be performed.",
+    )
+
+    # inference
+    parser.add_argument("--prompt", type=str, required=True, help="prompt for generation")
+    parser.add_argument(
+        "--negative_prompt",
+        type=str,
+        default=None,
+        help="negative prompt for generation, use default negative prompt if not specified",
+    )
+    parser.add_argument("--video_size", type=int, nargs=2, default=[256, 256], help="video size, height and width")
+    parser.add_argument("--video_length", type=int, default=None, help="video length, Default depends on task")
+    parser.add_argument("--fps", type=int, default=16, help="video fps, Default is 16")
+    parser.add_argument("--infer_steps", type=int, default=None, help="number of inference steps")
+    parser.add_argument("--save_path", type=str, required=True, help="path to save generated video")
+    parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=5.0,
+        help="Guidance scale for classifier free guidance. Default is 5.0.",
+    )
+    parser.add_argument("--video_path", type=str, default=None, help="path to video for video2video inference")
+    parser.add_argument("--image_path", type=str, default=None, help="path to image for image2video inference")
+
+    # Flow Matching
+    parser.add_argument(
+        "--flow_shift",
+        type=float,
+        default=None,
+        help="Shift factor for flow matching schedulers. Default depends on task.",
+    )
+
+    parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model")
+    parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT, only for fp8")
+    parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arithmetic (RTX 4XXX+), only for fp8_scaled")
+    parser.add_argument("--fp8_t5", action="store_true", help="use fp8 for Text Encoder model")
+    parser.add_argument(
+        "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU"
+    )
+    parser.add_argument(
+        "--attn_mode",
+        type=str,
+        default="torch",
+        choices=["flash", "flash2", "flash3", "torch", "sageattn", "xformers", "sdpa"],
+        help="attention mode",
+    )
+    parser.add_argument("--blocks_to_swap", type=int, default=0, help="number of blocks to swap in the model")
+    parser.add_argument(
+        "--output_type", type=str, default="video", choices=["video", "images", "latent", "both"], help="output type"
+    )
+    parser.add_argument("--no_metadata", action="store_true", help="do not save metadata")
+    parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. no inference")
+    parser.add_argument("--lycoris", action="store_true", help="use lycoris for inference")
+    parser.add_argument("--compile", action="store_true", help="Enable torch.compile")
+    parser.add_argument(
+        "--compile_args",
+        nargs=4,
+        metavar=("BACKEND", "MODE", "DYNAMIC", "FULLGRAPH"),
+        default=["inductor", "max-autotune-no-cudagraphs", "False", "False"],
+        help="Torch.compile settings",
+    )
+
+    args = parser.parse_args()
+
+    assert (args.latent_path is None or len(args.latent_path) == 0) or (
+        args.output_type == "images" or args.output_type == "video"
+    ), "latent_path is only supported for images or video output"
+
+    return args
+
+
+def get_task_defaults(task: str, size: Optional[Tuple[int, int]] = None) -> Tuple[int, float, int, bool]:
+    """Return default values for each task
+
+    Args:
+        task: task name (t2v, t2i, i2v etc.)
+        size: size of the video (width, height)
+
+    Returns:
+        Tuple[int, float, int, bool]: (infer_steps, flow_shift, video_length, needs_clip)
+    """
+    width, height = size if size else (0, 0)
+
+    if "t2i" in task:
+        return 50, 5.0, 1, False
+    elif "i2v" in task:
+        flow_shift = 3.0 if (width == 832 and height == 480) or (width == 480 and height == 832) else 5.0
+        return 40, flow_shift, 81, True
+    else:  # t2v or default
+        return 50, 5.0, 81, False
+
+
+def setup_args(args: argparse.Namespace) -> argparse.Namespace:
+    """Validate and set default values for optional arguments
+
+    Args:
+        args: command line arguments
+
+    Returns:
+        argparse.Namespace: updated arguments
+    """
+    # Get default values for the task
+    infer_steps, flow_shift, video_length, _ = get_task_defaults(args.task, tuple(args.video_size))
+
+    # Apply default values to unset arguments
+    if args.infer_steps is None:
+        args.infer_steps = infer_steps
+    if args.flow_shift is None:
+        args.flow_shift = flow_shift
+    if args.video_length is None:
+        args.video_length = video_length
+
+    # Force video_length to 1 for t2i tasks
+    if "t2i" in args.task:
+        assert args.video_length == 1, f"video_length should be 1 for task {args.task}"
+
+    return args
+
+
+def check_inputs(args: argparse.Namespace) -> Tuple[int, int, int]:
+    """Validate video size and length
+
+    Args:
+        args: command line arguments
+
+    Returns:
+        Tuple[int, int, int]: (height, width, video_length)
+    """
+    height = args.video_size[0]
+    width = args.video_size[1]
+    size = f"{width}*{height}"
+
+    if size not in SUPPORTED_SIZES[args.task]:
+        logger.warning(f"Size {size} is not supported for task {args.task}. Supported sizes are {SUPPORTED_SIZES[args.task]}.")
+
+    video_length = args.video_length
+
+    if height % 8 != 0 or width % 8 != 0:
+        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+    return height, width, video_length
+
+
+def calculate_dimensions(video_size: Tuple[int, int], video_length: int, config) -> Tuple[Tuple[int, int, int, int], int]:
+    """calculate dimensions for the generation
+
+    Args:
+        video_size: video frame size (height, width)
+        video_length: number of frames in the video
+        config: model configuration
+
+    Returns:
+        Tuple[Tuple[int, int, int, int], int]:
+            ((channels, frames, height, width), seq_len)
+    """
+    height, width = video_size
+    frames = video_length
+
+    # calculate latent space dimensions
+    lat_f = (frames - 1) // config.vae_stride[0] + 1
+    lat_h = height // config.vae_stride[1]
+    lat_w = width // config.vae_stride[2]
+
+    # calculate sequence length
+    seq_len = math.ceil((lat_h * lat_w) / (config.patch_size[1] * config.patch_size[2]) * lat_f)
+
+    return ((16, lat_f, lat_h, lat_w), seq_len)
+
+
+def load_vae(args: argparse.Namespace, config, device: torch.device, dtype: torch.dtype) -> WanVAE:
+    """load VAE model
+
+    Args:
+        args: command line arguments
+        config: model configuration
+        device: device to use
+        dtype: data type for the model
+
+    Returns:
+        WanVAE: loaded VAE model
+    """
+    vae_path = args.vae if args.vae is not None else os.path.join(args.ckpt_dir, config.vae_checkpoint)
+
+    logger.info(f"Loading VAE model from {vae_path}")
+    cache_device = torch.device("cpu") if args.vae_cache_cpu else None
+    vae = WanVAE(vae_path=vae_path, device=device, dtype=dtype, cache_device=cache_device)
+    return vae
+
+
+def load_text_encoder(args: argparse.Namespace, config, device: torch.device) -> T5EncoderModel:
+    """load text encoder (T5) model
+
+    Args:
+        args: command line arguments
+        config: model configuration
+        device: device to use
+
+    Returns:
+        T5EncoderModel: loaded text encoder model
+    """
+    checkpoint_path = None if args.ckpt_dir is None else os.path.join(args.ckpt_dir, config.t5_checkpoint)
+    tokenizer_path = None if args.ckpt_dir is None else os.path.join(args.ckpt_dir, config.t5_tokenizer)
+
+    text_encoder = T5EncoderModel(
+        text_len=config.text_len,
+        dtype=config.t5_dtype,
+        device=device,
+        checkpoint_path=checkpoint_path,
+        tokenizer_path=tokenizer_path,
+        weight_path=args.t5,
+        fp8=args.fp8_t5,
+    )
+
+    return text_encoder
+
+
+def load_clip_model(args: argparse.Namespace, config, device: torch.device) -> CLIPModel:
+    """load CLIP model (for I2V only)
+
+    Args:
+        args: command line arguments
+        config: model configuration
+        device: device to use
+
+    Returns:
+        CLIPModel: loaded CLIP model
+    """
+    checkpoint_path = None if args.ckpt_dir is None else os.path.join(args.ckpt_dir, config.clip_checkpoint)
+    tokenizer_path = None if args.ckpt_dir is None else os.path.join(args.ckpt_dir, config.clip_tokenizer)
+
+    clip = CLIPModel(
+        dtype=config.clip_dtype,
+        device=device,
+        checkpoint_path=checkpoint_path,
+        tokenizer_path=tokenizer_path,
+        weight_path=args.clip,
+    )
+
+    return clip
+
+
+def load_dit_model(
+    args: argparse.Namespace,
+    config,
+    device: torch.device,
+    dit_dtype: torch.dtype,
+    dit_weight_dtype: Optional[torch.dtype] = None,
+    is_i2v: bool = False,
+) -> WanModel:
+    """load DiT model
+
+    Args:
+        args: command line arguments
+        config: model configuration
+        device: device to use
+        dit_dtype: data type for the model
+        dit_weight_dtype: data type for the model weights. None for as-is
+        is_i2v: I2V mode
+
+    Returns:
+        WanModel: loaded DiT model
+    """
+    loading_device = "cpu"
+    if args.blocks_to_swap == 0 and args.lora_weight is None and not args.fp8_scaled:
+        loading_device = device
+
+    loading_weight_dtype = dit_weight_dtype
+    if args.fp8_scaled or args.lora_weight is not None:
+        loading_weight_dtype = dit_dtype  # load as-is
+
+    # do not fp8 optimize because we will merge LoRA weights
+    model = load_wan_model(config, is_i2v, device, args.dit, args.attn_mode, False, loading_device, loading_weight_dtype, False)
+
+    return model
+
+
+def merge_lora_weights(model: WanModel, args: argparse.Namespace, device: torch.device) -> None:
+    """merge LoRA weights to the model
+
+    Args:
+        model: DiT model
+        args: command line arguments
+        device: device to use
+    """
+    if args.lora_weight is None or len(args.lora_weight) == 0:
+        return
+
+    for i, lora_weight in enumerate(args.lora_weight):
+        if args.lora_multiplier is not None and len(args.lora_multiplier) > i:
+            lora_multiplier = args.lora_multiplier[i]
+        else:
+            lora_multiplier = 1.0
+
+        logger.info(f"Loading LoRA weights from {lora_weight} with multiplier {lora_multiplier}")
+        weights_sd = load_file(lora_weight)
+        if args.lycoris:
+            lycoris_net, _ = create_network_from_weights(
+                multiplier=lora_multiplier,
+                file=None,
+                weights_sd=weights_sd,
+                unet=model,
+                text_encoder=None,
+                vae=None,
+                for_inference=True,
+            )
+            lycoris_net.merge_to(None, model, weights_sd, dtype=None, device=device)
+        else:
+            network = lora_wan.create_arch_network_from_weights(lora_multiplier, weights_sd, unet=model, for_inference=True)
+            network.merge_to(None, model, weights_sd, device=device, non_blocking=True)
+
+        synchronize_device(device)
+        logger.info("LoRA weights loaded")
+
+    # save model here before casting to dit_weight_dtype
+    if args.save_merged_model:
+        logger.info(f"Saving merged model to {args.save_merged_model}")
+        mem_eff_save_file(model.state_dict(), args.save_merged_model)  # save_file needs a lot of memory
+        logger.info("Merged model saved")
+
+
+def optimize_model(
+    model: WanModel, args: argparse.Namespace, device: torch.device, dit_dtype: torch.dtype, dit_weight_dtype: torch.dtype
+) -> None:
+    """optimize the model (FP8 conversion, device move etc.)
+
+    Args:
+        model: dit model
+        args: command line arguments
+        device: device to use
+        dit_dtype: dtype for the model
+        dit_weight_dtype: dtype for the model weights
+    """
+    if args.fp8_scaled:
+        # load state dict as-is and optimize to fp8
+        state_dict = model.state_dict()
+
+        # if no blocks to swap, we can move the weights to GPU after optimization on GPU (omit redundant CPU->GPU copy)
+        move_to_device = args.blocks_to_swap == 0  # if blocks_to_swap > 0, we will keep the model on CPU
+        state_dict = model.fp8_optimization(state_dict, device, move_to_device, use_scaled_mm=args.fp8_fast)
+
+        info = model.load_state_dict(state_dict, strict=True, assign=True)
+        logger.info(f"Loaded FP8 optimized weights: {info}")
+
+        if args.blocks_to_swap == 0:
+            model.to(device)  # make sure all parameters are on the right device (e.g. RoPE etc.)
+    else:
+        # simple cast to dit_dtype
+        target_dtype = None  # load as-is (dit_weight_dtype == dtype of the weights in state_dict)
+        target_device = None
+
+        if dit_weight_dtype is not None:  # in case of args.fp8 and not args.fp8_scaled
+            logger.info(f"Convert model to {dit_weight_dtype}")
+            target_dtype = dit_weight_dtype
+
+        if args.blocks_to_swap == 0:
+            logger.info(f"Move model to device: {device}")
+            target_device = device
+
+        model.to(target_device, target_dtype)  # move and cast  at the same time. this reduces redundant copy operations
+
+    if args.compile:
+        compile_backend, compile_mode, compile_dynamic, compile_fullgraph = args.compile_args
+        logger.info(
+            f"Torch Compiling[Backend: {compile_backend}; Mode: {compile_mode}; Dynamic: {compile_dynamic}; Fullgraph: {compile_fullgraph}]"
+        )
+        torch._dynamo.config.cache_size_limit = 32
+        for i in range(len(model.blocks)):
+            model.blocks[i] = torch.compile(
+                model.blocks[i],
+                backend=compile_backend,
+                mode=compile_mode,
+                dynamic=compile_dynamic.lower() in "true",
+                fullgraph=compile_fullgraph.lower() in "true",
+            )
+
+    if args.blocks_to_swap > 0:
+        logger.info(f"Enable swap {args.blocks_to_swap} blocks to CPU from device: {device}")
+        model.enable_block_swap(args.blocks_to_swap, device, supports_backward=False)
+        model.move_to_device_except_swap_blocks(device)
+        model.prepare_block_swap_before_forward()
+    else:
+        # make sure the model is on the right device
+        model.to(device)
+
+    model.eval().requires_grad_(False)
+    clean_memory_on_device(device)
+
+
+def prepare_t2v_inputs(
+    args: argparse.Namespace, config, accelerator: Accelerator, device: torch.device
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
+    """Prepare inputs for T2V
+
+    Args:
+        args: command line arguments
+        config: model configuration
+        accelerator: Accelerator instance
+        device: device to use
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
+            (noise, context, context_null, (arg_c, arg_null))
+    """
+    # Prepare inputs for T2V
+    # calculate dimensions and sequence length
+    (_, lat_f, lat_h, lat_w), seq_len = calculate_dimensions(args.video_size, args.video_length, config)
+    target_shape = (16, lat_f, lat_h, lat_w)
+
+    # configure negative prompt
+    n_prompt = args.negative_prompt if args.negative_prompt else config.sample_neg_prompt
+
+    # set seed
+    seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1)
+    seed_g = torch.Generator(device=device)
+    seed_g.manual_seed(seed)
+
+    # load text encoder
+    text_encoder = load_text_encoder(args, config, device)
+    text_encoder.model.to(device)
+
+    # encode prompt
+    with torch.no_grad():
+        if args.fp8_t5:
+            with torch.amp.autocast(device_type=device.type, dtype=config.t5_dtype):
+                context = text_encoder([args.prompt], device)
+                context_null = text_encoder([n_prompt], device)
+        else:
+            context = text_encoder([args.prompt], device)
+            context_null = text_encoder([n_prompt], device)
+
+    # free text encoder and clean memory
+    del text_encoder
+    clean_memory_on_device(device)
+
+    # generate noise
+    noise = torch.randn(
+        target_shape[0], target_shape[1], target_shape[2], target_shape[3], dtype=torch.float32, device=device, generator=seed_g
+    )
+
+    # prepare model input arguments
+    arg_c = {"context": context, "seq_len": seq_len}
+    arg_null = {"context": context_null, "seq_len": seq_len}
+
+    return noise, context, context_null, (arg_c, arg_null)
+
+
+def prepare_i2v_inputs(
+    args: argparse.Namespace, config, accelerator: Accelerator, device: torch.device, vae: WanVAE
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
+    """Prepare inputs for I2V
+
+    Args:
+        args: command line arguments
+        config: model configuration
+        accelerator: Accelerator instance
+        device: device to use
+        vae: VAE model, used for image encoding
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
+            (noise, context, context_null, y, (arg_c, arg_null))
+    """
+    # get video dimensions
+    height, width = args.video_size
+    frames = args.video_length
+    max_area = width * height
+
+    # load image
+    img = Image.open(args.image_path).convert("RGB")
+
+    # convert to numpy
+    img_cv2 = np.array(img)  # PIL to numpy
+    img_cv2 = cv2.cvtColor(img_cv2, cv2.COLOR_BGR2RGB)
+
+    # convert to tensor (-1 to 1)
+    img_tensor = TF.to_tensor(img).sub_(0.5).div_(0.5).to(device)
+
+    # calculate latent dimensions: keep aspect ratio
+    h, w = img_tensor.shape[1:]
+    aspect_ratio = h / w
+    lat_h = round(np.sqrt(max_area * aspect_ratio) // config.vae_stride[1] // config.patch_size[1] * config.patch_size[1])
+    lat_w = round(np.sqrt(max_area / aspect_ratio) // config.vae_stride[2] // config.patch_size[2] * config.patch_size[2])
+    h = lat_h * config.vae_stride[1]
+    w = lat_w * config.vae_stride[2]
+    lat_f = (frames - 1) // config.vae_stride[0] + 1  # size of latent frames
+    max_seq_len = lat_f * lat_h * lat_w // (config.patch_size[1] * config.patch_size[2])
+
+    # set seed
+    seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1)
+    seed_g = torch.Generator(device=device)
+    seed_g.manual_seed(seed)
+
+    # generate noise
+    noise = torch.randn(16, lat_f, lat_h, lat_w, dtype=torch.float32, generator=seed_g, device=device)
+
+    # configure negative prompt
+    n_prompt = args.negative_prompt if args.negative_prompt else config.sample_neg_prompt
+
+    # load text encoder
+    text_encoder = load_text_encoder(args, config, device)
+    text_encoder.model.to(device)
+
+    # encode prompt
+    with torch.no_grad():
+        if args.fp8_t5:
+            with torch.amp.autocast(device_type=device.type, dtype=config.t5_dtype):
+                context = text_encoder([args.prompt], device)
+                context_null = text_encoder([n_prompt], device)
+        else:
+            context = text_encoder([args.prompt], device)
+            context_null = text_encoder([n_prompt], device)
+
+    # free text encoder and clean memory
+    del text_encoder
+    clean_memory_on_device(device)
+
+    # load CLIP model
+    clip = load_clip_model(args, config, device)
+    clip.model.to(device)
+
+    # encode image to CLIP context
+    logger.info(f"Encoding image to CLIP context")
+    with torch.amp.autocast(device_type=device.type, dtype=torch.float16), torch.no_grad():
+        clip_context = clip.visual([img_tensor[:, None, :, :]])
+    logger.info(f"Encoding complete")
+
+    # free CLIP model and clean memory
+    del clip
+    clean_memory_on_device(device)
+
+    # encode image to latent space with VAE
+    logger.info(f"Encoding image to latent space")
+    vae.to_device(device)
+
+    # resize image
+    interpolation = cv2.INTER_AREA if h < img_cv2.shape[0] else cv2.INTER_CUBIC
+    img_resized = cv2.resize(img_cv2, (w, h), interpolation=interpolation)
+    img_resized = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
+    img_resized = TF.to_tensor(img_resized).sub_(0.5).div_(0.5).to(device)  # -1 to 1, CHW
+    img_resized = img_resized.unsqueeze(1)  # CFHW
+
+    # create mask for the first frame
+    # msk = torch.ones(1, frames, lat_h, lat_w, device=device)
+    # msk[:, 1:] = 0
+    # msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
+    # msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
+    # msk = msk.transpose(1, 2)[0]
+
+    # rewrite to simpler version
+    msk = torch.zeros(4, lat_f, lat_h, lat_w, device=device)
+    msk[:, 0] = 1
+
+    # encode image to latent space
+    with accelerator.autocast(), torch.no_grad():
+        # padding to match the required number of frames
+        padding_frames = frames - 1  # the first frame is image
+        img_resized = torch.concat([img_resized, torch.zeros(3, padding_frames, h, w, device=device)], dim=1)
+        y = vae.encode([img_resized])[0]
+
+    y = torch.concat([msk, y])
+    logger.info(f"Encoding complete")
+
+    # move VAE to CPU
+    vae.to_device("cpu")
+    clean_memory_on_device(device)
+
+    # prepare model input arguments
+    arg_c = {
+        "context": [context[0]],
+        "clip_fea": clip_context,
+        "seq_len": max_seq_len,
+        "y": [y],
+    }
+
+    arg_null = {
+        "context": context_null,
+        "clip_fea": clip_context,
+        "seq_len": max_seq_len,
+        "y": [y],
+    }
+
+    return noise, context, context_null, y, (arg_c, arg_null)
+
+
+def setup_scheduler(args: argparse.Namespace, config, device: torch.device) -> Tuple[Any, torch.Tensor]:
+    """setup scheduler for sampling
+
+    Args:
+        args: command line arguments
+        config: model configuration
+        device: device to use
+
+    Returns:
+        Tuple[Any, torch.Tensor]: (scheduler, timesteps)
+    """
+    if args.sample_solver == "unipc":
+        scheduler = FlowUniPCMultistepScheduler(num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False)
+        scheduler.set_timesteps(args.infer_steps, device=device, shift=args.flow_shift)
+        timesteps = scheduler.timesteps
+    elif args.sample_solver == "dpm++":
+        scheduler = FlowDPMSolverMultistepScheduler(
+            num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False
+        )
+        sampling_sigmas = get_sampling_sigmas(args.infer_steps, args.flow_shift)
+        timesteps, _ = retrieve_timesteps(scheduler, device=device, sigmas=sampling_sigmas)
+    elif args.sample_solver == "vanilla":
+        scheduler = FlowMatchDiscreteScheduler(num_train_timesteps=config.num_train_timesteps, shift=args.flow_shift)
+        scheduler.set_timesteps(args.infer_steps, device=device)
+        timesteps = scheduler.timesteps
+
+        # FlowMatchDiscreteScheduler does not support generator argument in step method
+        org_step = scheduler.step
+
+        def step_wrapper(
+            model_output: torch.Tensor,
+            timestep: Union[int, torch.Tensor],
+            sample: torch.Tensor,
+            return_dict: bool = True,
+            generator=None,
+        ):
+            return org_step(model_output, timestep, sample, return_dict=return_dict)
+
+        scheduler.step = step_wrapper
+    else:
+        raise NotImplementedError("Unsupported solver.")
+
+    return scheduler, timesteps
+
+
+def run_sampling(
+    model: WanModel,
+    noise: torch.Tensor,
+    scheduler: Any,
+    timesteps: torch.Tensor,
+    args: argparse.Namespace,
+    inputs: Tuple[dict, dict],
+    device: torch.device,
+    seed_g: torch.Generator,
+    accelerator: Accelerator,
+    is_i2v: bool = False,
+    use_cpu_offload: bool = True,
+) -> torch.Tensor:
+    """run sampling
+    Args:
+        model: dit model
+        noise: initial noise
+        scheduler: scheduler for sampling
+        timesteps: time steps for sampling
+        args: command line arguments
+        inputs: model input (arg_c, arg_null)
+        device: device to use
+        seed_g: random generator
+        accelerator: Accelerator instance
+        is_i2v: I2V mode (False means T2V mode)
+        use_cpu_offload: Whether to offload tensors to CPU during processing
+    Returns:
+        torch.Tensor: generated latent
+    """
+    arg_c, arg_null = inputs
+
+    latent = noise
+    if use_cpu_offload:
+        latent = latent.to("cpu")
+
+    for _, t in enumerate(tqdm(timesteps)):
+        # latent is on CPU if use_cpu_offload is True
+        latent_model_input = [latent.to(device)]
+        timestep = torch.stack([t]).to(device)
+
+        with accelerator.autocast(), torch.no_grad():
+            noise_pred_cond = model(latent_model_input, t=timestep, **arg_c)[0]
+            noise_pred_uncond = model(latent_model_input, t=timestep, **arg_null)[0]
+            del latent_model_input
+
+            if use_cpu_offload:
+                noise_pred_cond = noise_pred_cond.to("cpu")
+                noise_pred_uncond = noise_pred_uncond.to("cpu")
+
+            # apply guidance
+            noise_pred = noise_pred_uncond + args.guidance_scale * (noise_pred_cond - noise_pred_uncond)
+
+            # step
+            latent_input = latent.unsqueeze(0)
+            temp_x0 = scheduler.step(noise_pred.unsqueeze(0), t, latent_input, return_dict=False, generator=seed_g)[0]
+
+            # update latent
+            latent = temp_x0.squeeze(0)
+
+    return latent
+
+
+def generate(args: argparse.Namespace) -> torch.Tensor:
+    """main function for generation
+
+    Args:
+        args: command line arguments
+
+    Returns:
+        torch.Tensor: generated latent
+    """
+    device = torch.device(args.device)
+
+    cfg = WAN_CONFIGS[args.task]
+
+    # select dtype
+    dit_dtype = detect_wan_sd_dtype(args.dit) if args.dit is not None else torch.bfloat16
+    if dit_dtype.itemsize == 1:
+        # if weight is in fp8, use bfloat16 for DiT (input/output)
+        dit_dtype = torch.bfloat16
+        if args.fp8_scaled:
+            raise ValueError(
+                "DiT weights is already in fp8 format, cannot scale to fp8. Please use fp16/bf16 weights / DiTの重みはすでにfp8形式です。fp8にスケーリングできません。fp16/bf16の重みを使用してください"
+            )
+
+    dit_weight_dtype = dit_dtype  # default
+    if args.fp8_scaled:
+        dit_weight_dtype = None  # various precision weights, so don't cast to specific dtype
+    elif args.fp8:
+        dit_weight_dtype = torch.float8_e4m3fn
+
+    vae_dtype = str_to_dtype(args.vae_dtype) if args.vae_dtype is not None else dit_dtype
+    logger.info(
+        f"Using device: {device}, DiT precision: {dit_dtype}, weight precision: {dit_weight_dtype}, VAE precision: {vae_dtype}"
+    )
+
+    # prepare accelerator
+    mixed_precision = "bf16" if dit_dtype == torch.bfloat16 else "fp16"
+    accelerator = accelerate.Accelerator(mixed_precision=mixed_precision)
+
+    # I2V or T2V
+    is_i2v = "i2v" in args.task
+
+    # prepare seed
+    seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1)
+    args.seed = seed  # set seed to args for saving
+
+    # prepare inputs
+    if is_i2v:
+        # I2V: need text encoder, VAE and CLIP
+        vae = load_vae(args, cfg, device, vae_dtype)
+        noise, context, context_null, y, inputs = prepare_i2v_inputs(args, cfg, accelerator, device, vae)
+        # vae is on CPU
+    else:
+        # T2V: need text encoder
+        noise, context, context_null, inputs = prepare_t2v_inputs(args, cfg, accelerator, device)
+        vae = None
+
+    # load DiT model
+    model = load_dit_model(args, cfg, device, dit_dtype, dit_weight_dtype, is_i2v)
+
+    # merge LoRA weights
+    if args.lora_weight is not None and len(args.lora_weight) > 0:
+        merge_lora_weights(model, args, device)
+
+        # if we only want to save the model, we can skip the rest
+        if args.save_merged_model:
+            return None
+
+    # optimize model: fp8 conversion, block swap etc.
+    optimize_model(model, args, device, dit_dtype, dit_weight_dtype)
+
+    # setup scheduler
+    scheduler, timesteps = setup_scheduler(args, cfg, device)
+
+    # set random generator
+    seed_g = torch.Generator(device=device)
+    seed_g.manual_seed(seed)
+
+    # run sampling
+    latent = run_sampling(model, noise, scheduler, timesteps, args, inputs, device, seed_g, accelerator, is_i2v)
+
+    # free memory
+    del model
+    del scheduler
+    synchronize_device(device)
+
+    # wait for 5 seconds until block swap is done
+    logger.info("Waiting for 5 seconds to finish block swap")
+    time.sleep(5)
+
+    gc.collect()
+    clean_memory_on_device(device)
+
+    # save VAE model for decoding
+    if vae is None:
+        args._vae = None
+    else:
+        args._vae = vae
+
+    return latent
+
+
+def decode_latent(latent: torch.Tensor, args: argparse.Namespace, cfg) -> torch.Tensor:
+    """Decode a latent tensor with the VAE.
+
+    The VAE cached on ``args._vae`` is reused when present and not None;
+    otherwise a VAE is loaded with ``load_vae``.
+
+    Args:
+        latent: latent tensor
+        args: command line arguments (reads ``device``, ``vae_dtype`` and the optional ``_vae``)
+        cfg: model configuration
+
+    Returns:
+        torch.Tensor: first element of the VAE decode output, as float32 on the CPU
+    """
+    target_device = torch.device(args.device)
+
+    # bfloat16 is the fallback when no VAE dtype was requested
+    if args.vae_dtype is None:
+        vae_dtype = torch.bfloat16
+    else:
+        vae_dtype = str_to_dtype(args.vae_dtype)
+
+    # prefer the VAE left behind by the generation step, load one otherwise
+    vae = getattr(args, "_vae", None)
+    if vae is None:
+        vae = load_vae(args, cfg, target_device, vae_dtype)
+    vae.to_device(target_device)
+
+    logger.info(f"Decoding video from latents: {latent.shape}")
+    latent_on_device = latent.to(target_device)
+
+    with torch.autocast(device_type=target_device.type, dtype=vae_dtype), torch.no_grad():
+        decoded = vae.decode(latent_on_device)
+
+    logger.info(f"Decoding complete")
+    first_item = decoded[0]
+    del decoded  # drop the reference to the full decode output before converting
+
+    return first_item.to(torch.float32).cpu()
+
+
+def save_output(
+    latent: torch.Tensor, args: argparse.Namespace, cfg, height: int, width: int, original_base_names: Optional[List[str]] = None
+) -> None:
+    """Write the latent and/or the decoded result into ``args.save_path``.
+
+    ``args.output_type`` selects what is written: "latent" (safetensors file),
+    "video" (mp4), "both" (latent and video) or "images". File names start
+    with a timestamp followed by the seed.
+
+    Args:
+        latent: latent tensor
+        args: command line arguments
+        cfg: model configuration
+        height: height of frame (stored in the latent metadata)
+        width: width of frame (stored in the latent metadata)
+        original_base_names: original base names (if latents are loaded from files);
+            only the first name is used, as a file name suffix
+    """
+    save_path = args.save_path
+    os.makedirs(save_path, exist_ok=True)
+    time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
+
+    seed = args.seed
+    video_length = args.video_length
+    output_type = args.output_type
+
+    if output_type in ("latent", "both"):
+        latent_path = f"{save_path}/{time_flag}_{seed}_latent.safetensors"
+
+        # metadata values are all stored as strings
+        metadata = None
+        if not args.no_metadata:
+            metadata = {
+                "seeds": f"{seed}",
+                "prompt": f"{args.prompt}",
+                "height": f"{height}",
+                "width": f"{width}",
+                "video_length": f"{video_length}",
+                "infer_steps": f"{args.infer_steps}",
+                "guidance_scale": f"{args.guidance_scale}",
+            }
+            if args.negative_prompt is not None:
+                metadata["negative_prompt"] = f"{args.negative_prompt}"
+
+        save_file({"latent": latent}, latent_path, metadata=metadata)
+        logger.info(f"Latent save to: {latent_path}")
+
+    wants_video = output_type in ("video", "both")
+    wants_images = output_type == "images"
+    if not (wants_video or wants_images):
+        return
+
+    # video and image outputs share the decode step: a batch dimension is
+    # added before decoding and again on the decoded sample for the grid writers
+    sample = decode_latent(latent.unsqueeze(0), args, cfg)
+    original_name = "" if original_base_names is None else f"_{original_base_names[0]}"
+    sample = sample.unsqueeze(0)
+
+    if wants_video:
+        video_path = f"{save_path}/{time_flag}_{seed}{original_name}.mp4"
+        save_videos_grid(sample, video_path, fps=args.fps, rescale=True)
+        logger.info(f"Sample save to: {video_path}")
+    else:
+        image_name = f"{time_flag}_{seed}{original_name}"
+        save_images_grid(sample, save_path, image_name, rescale=True)
+        logger.info(f"Sample images save to: {save_path}/{image_name}")
+
+
+def main():
+    """Command-line entry point: obtain a latent, then decode and/or save it.
+
+    The mode is chosen by ``args.latent_path``:
+
+    * Generation mode (no latent path): set up and check the arguments, run
+      ``generate`` and pass the result to ``save_output``. When
+      ``args.save_merged_model`` is set, return after ``generate`` without
+      decoding or saving any output.
+    * Latents mode: load one previously saved latent file (exactly one path
+      is supported), take the seed, size and length from its safetensors
+      metadata when present, and pass it to ``save_output``.
+    """
+    # parse arguments
+    args = parse_args()
+
+    # check if latents are provided
+    latents_mode = args.latent_path is not None and len(args.latent_path) > 0
+
+    # set device
+    device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device(device)
+    logger.info(f"Using device: {device}")
+    args.device = device
+
+    if not latents_mode:
+        # generation mode
+        # setup arguments
+        args = setup_args(args)
+        height, width, video_length = check_inputs(args)
+
+        logger.info(
+            f"video size: {height}x{width}@{video_length} (HxW@F), fps: {args.fps}, "
+            f"infer_steps: {args.infer_steps}, flow_shift: {args.flow_shift}"
+        )
+
+        # generate latent
+        latent = generate(args)
+
+        # make sure the model is freed from GPU memory
+        gc.collect()
+        clean_memory_on_device(args.device)
+
+        # nothing to decode or save when only the merged model was requested
+        if args.save_merged_model:
+            return
+
+        # add batch dimension
+        latent = latent.unsqueeze(0)
+        original_base_names = None
+
+        # looked up after setup_args() has processed the arguments
+        cfg = WAN_CONFIGS[args.task]
+    else:
+        # latents mode
+        original_base_names = []
+        latents_list = []
+        seeds = []
+
+        assert len(args.latent_path) == 1, "Only one latent path is supported for now"
+
+        for latent_path in args.latent_path:
+            original_base_names.append(os.path.splitext(os.path.basename(latent_path))[0])
+            seed = 0  # used when the file carries no seed metadata
+
+            if os.path.splitext(latent_path)[1] != ".safetensors":
+                # NOTE: torch.load unpickles the file, which can execute arbitrary
+                # code; only load latents that come from a trusted source.
+                latents = torch.load(latent_path, map_location="cpu")
+            else:
+                latents = load_file(latent_path)["latent"]
+                with safe_open(latent_path, framework="pt") as f:
+                    metadata = f.metadata()
+                if metadata is None:
+                    metadata = {}
+                logger.info(f"Loaded metadata: {metadata}")
+
+                if "seeds" in metadata:
+                    seed = int(metadata["seeds"])
+                if "height" in metadata and "width" in metadata:
+                    height = int(metadata["height"])
+                    width = int(metadata["width"])
+                    args.video_size = [height, width]
+                if "video_length" in metadata:
+                    args.video_length = int(metadata["video_length"])
+
+            seeds.append(seed)
+            latents_list.append(latents)
+
+            logger.info(f"Loaded latent from {latent_path}. Shape: {latents.shape}")
+
+        latent = torch.stack(latents_list, dim=0)  # [N, ...], must be same shape
+
+        # The model configuration must be available before the stride arithmetic
+        # below; reading it only after this branch raised UnboundLocalError.
+        cfg = WAN_CONFIGS[args.task]
+
+        # Derive the frame size and length from the shape of the loaded latent.
+        # NOTE(review): height/width are scaled by patch_size * vae_stride;
+        # confirm that the patch_size factor is intended here.
+        height = latents.shape[-2]
+        width = latents.shape[-1]
+        height *= cfg.patch_size[1] * cfg.vae_stride[1]
+        width *= cfg.patch_size[2] * cfg.vae_stride[2]
+        video_length = latents.shape[1]
+        video_length = (video_length - 1) * cfg.vae_stride[0] + 1
+        args.seed = seeds[0]
+
+    # decode and save
+    save_output(latent[0], args, cfg, height, width, original_base_names)
+
+    logger.info("Done!")
+
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/wan_lora_trainer_gui.py b/wan_lora_trainer_gui.py
new file mode 100644
index 0000000000000000000000000000000000000000..3056c2134424c5660656a9e8bd902a087fece839
--- /dev/null
+++ b/wan_lora_trainer_gui.py
@@ -0,0 +1,864 @@
+import tkinter as tk
+from tkinter import ttk, filedialog, messagebox, Menu
+import subprocess
+import threading
+import json
+import os
+import sys
+import signal
+
+# Dark theme color scheme: Tk color strings used by the ttk styles and the console widget
+BG_COLOR = "#2C3E50"       # Main background (dark gray with blue tint)
+FG_COLOR = "#ECF0F1"       # Light text
+ACCENT_COLOR = "#2980B9"   # Blue accent for tabs
+ENTRY_BG = "#1B2A38"       # Entry field background (darker than main)
+BUTTON_ACTIVE = "#1B2A38"  # Active button background (same value as ENTRY_BG)
+BORDER_COLOR = "#333333"   # Dark border color
+ACTIVE_ENTRY_BG = "white"  # Background color for active entry field
+ACTIVE_ENTRY_FG = "black"  # Text color for active entry field
+
+class LoRATrainerGUI:
+    def __init__(self, master):
+        """Build the main window: default settings, styles, the three tabs and the console context menu.
+
+        Args:
+            master: the Tk window that hosts the GUI; its title, geometry and
+                background color are set here.
+        """
+        self.master = master
+        master.title("Wan 2.1 LoRA Trainer")
+        master.geometry("900x1024")
+        master.configure(bg=BG_COLOR)
+
+        # Process / thread bookkeeping; nothing is running at start-up
+        self.current_process = None
+        self.training_thread = None
+        self.process_group_id = None
+        self.user_scrolled = False  # Flag for manual console scrolling
+
+        # Initialize settings with default values, including conversion settings
+        self.settings = {
+            "DATASET_CONFIG": "dataset/dataset_example.toml",
+            "VAE_MODEL": "Models/Wan/Wan2.1_VAE.pth",
+            "CLIP_MODEL": "Models/Wan/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth",
+            "T5_MODEL": "Models/Wan/models_t5_umt5-xxl-enc-bf16.pth",
+            "DIT_MODEL": "Models/Wan/wan2.1_i2v_720p_14B_fp8_e4m3fn.safetensors",
+            "LORA_OUTPUT_DIR": "Output_LoRAs/",
+            "LORA_NAME": "My_Best_Lora_v1",
+            "MODEL_TYPE": "i2v-14B",
+            "FLOW_SHIFT": 3.0,
+            "LEARNING_RATE": 2e-5,
+            "LORA_LR_RATIO": 4,
+            "NETWORK_DIM": 32,
+            "NETWORK_ALPHA": 4,
+            "MAX_TRAIN_EPOCHS": 70,
+            "SAVE_EVERY_N_EPOCHS": 10,
+            "SEED": 1234,
+            "BLOCKS_SWAP": 16,
+            "RESUME_TRAINING": "",
+            "OPTIMIZER_TYPE": "adamw8bit",
+            "OPTIMIZER_ARGS": "",
+            "ATTENTION_MECHANISM": "none",
+            "LOGGING_DIR": "",
+            "LOG_WITH": "none",
+            "LOG_PREFIX": "",
+            "IMG_IN_TXT_IN_OFFLOADING": False,
+            "LR_SCHEDULER": "constant",
+            "LR_WARMUP_STEPS": "",
+            "LR_DECAY_STEPS": "",
+            "TIMESTEP_SAMPLING": "shift",
+            "DISCRETE_FLOW_SHIFT": "3.0",
+            "WEIGHTING_SCHEME": "none",
+            "METADATA_TITLE": "",
+            "METADATA_AUTHOR": "",
+            "METADATA_DESCRIPTION": "",
+            "METADATA_LICENSE": "",
+            "METADATA_TAGS": "",
+            "INPUT_LORA": "",
+            "OUTPUT_DIR": "",
+            "CONVERTED_LORA_NAME": "",
+            "FP8": True,  # Default FP8 setting
+            "SCALED": False  # Default Scaled setting
+        }
+
+        # Choices offered by the "Model Type" and "Optimizer Type" drop-downs
+        self.model_types = ["t2v-1.3B", "t2v-14B", "i2v-14B", "t2i-14B"]
+        self.optimizer_types = ["adamw", "adamw8bit", "adafactor", "torch.optim.AdamW", "bitsandbytes.optim.AdEMAMix8bit", "bitsandbytes.optim.PagedAdEMAMix8bit", "came"]
+
+        self.setup_styles()
+
+        # Create notebook and tabs
+        self.notebook = ttk.Notebook(master)
+        self.notebook.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
+
+        # Create the tabs and bind a mouse-click handler to each of them
+        self.training_tab = ttk.Frame(self.notebook)
+        self.training_tab.bind("<Button-1>", self.remove_focus)  # Click on the tab background removes focus
+        self.notebook.add(self.training_tab, text="Training settings")
+
+        self.advanced_tab = ttk.Frame(self.notebook)
+        self.advanced_tab.bind("<Button-1>", self.remove_focus)  # Click on the tab background removes focus
+        self.notebook.add(self.advanced_tab, text="Advanced settings")
+
+        self.conversion_tab = ttk.Frame(self.notebook)
+        self.conversion_tab.bind("<Button-1>", self.remove_focus)  # Click on the tab background removes focus
+        self.notebook.add(self.conversion_tab, text="LoRA Conversion")
+
+        # Initialize tab contents.
+        # create_training_settings() must run first: it creates self.entries,
+        # which the other two builders add their widgets to.
+        self.create_training_settings()
+        self.create_advanced_settings()
+        self.create_conversion_settings()
+
+        # Create context menu for copying console text
+        self.context_menu = Menu(self.master, tearoff=0)
+        self.context_menu.add_command(label="Copy", command=self.copy_selected_text)
+
+    def remove_focus(self, event):
+        """Take focus away from the active widget when the tab background is clicked.
+
+        Bound to ``<Button-1>`` on the tab frames; ``event`` is unused. Focus is
+        moved to the main window.
+        """
+        self.master.focus_set()
+
+    def setup_styles(self):
+        """Configure the ttk styles for the dark theme.
+
+        Switches ttk to the "clam" theme and sets colors for frames, labels,
+        buttons, checkbuttons, notebook tabs, entries, comboboxes and the
+        vertical scrollbar from the module-level color constants.
+        """
+        theme = ttk.Style()
+        theme.theme_use("clam")
+
+        # Defaults for every widget class, then frames and labels
+        theme.configure(".", background=BG_COLOR, foreground=FG_COLOR)
+        theme.configure("TFrame", background=BG_COLOR)
+        theme.configure("TLabel", background=BG_COLOR, foreground=FG_COLOR)
+
+        # Buttons
+        button_look = dict(
+            background=BG_COLOR,
+            foreground=FG_COLOR,
+            bordercolor=BORDER_COLOR,
+            borderwidth=1,
+            focusthickness=3,
+            focuscolor=BG_COLOR,
+            padding=[5, 1],
+        )
+        theme.configure("TButton", **button_look)
+        theme.map(
+            "TButton",
+            background=[("active", BUTTON_ACTIVE), ("pressed", BUTTON_ACTIVE)],
+            foreground=[("active", FG_COLOR), ("pressed", FG_COLOR)],
+        )
+
+        # Checkbuttons keep the normal colors while hovered
+        theme.configure("TCheckbutton", background=BG_COLOR, foreground=FG_COLOR)
+        theme.map("TCheckbutton", background=[("active", BG_COLOR)], foreground=[("active", FG_COLOR)])
+
+        # Notebook: the selected tab is highlighted with the accent color
+        theme.configure("TNotebook", background=BG_COLOR, borderwidth=0)
+        theme.configure("TNotebook.Tab", background=BG_COLOR, foreground=FG_COLOR, padding=[5, 2])
+        theme.map("TNotebook.Tab", background=[("selected", ACCENT_COLOR)], foreground=[("selected", FG_COLOR)])
+
+        # Entries switch to the "active" colors while focused
+        theme.configure("TEntry", fieldbackground=ENTRY_BG, foreground=FG_COLOR, bordercolor=BORDER_COLOR)
+        theme.map(
+            "TEntry",
+            fieldbackground=[("focus", ACTIVE_ENTRY_BG)],
+            foreground=[("focus", ACTIVE_ENTRY_FG)],
+        )
+
+        # Comboboxes: focused colors first, then the readonly / enabled states
+        unfocused_bg = [("readonly", ENTRY_BG), ("!disabled", ENTRY_BG)]
+        unfocused_fg = [("readonly", FG_COLOR), ("!disabled", FG_COLOR)]
+        theme.configure(
+            "TCombobox",
+            fieldbackground=ENTRY_BG,
+            background=BG_COLOR,
+            foreground=FG_COLOR,
+            bordercolor=BORDER_COLOR,
+        )
+        theme.map(
+            "TCombobox",
+            fieldbackground=[("focus", ACTIVE_ENTRY_BG)] + unfocused_bg,
+            foreground=[("focus", ACTIVE_ENTRY_FG)] + unfocused_fg,
+            selectbackground=list(unfocused_bg),
+            selectforeground=list(unfocused_fg),
+        )
+
+        # Vertical scrollbar
+        scrollbar_look = dict(
+            background=ENTRY_BG,
+            troughcolor=BG_COLOR,
+            bordercolor=BORDER_COLOR,
+            arrowcolor=FG_COLOR,
+            darkcolor=BG_COLOR,
+            lightcolor=BG_COLOR,
+        )
+        theme.configure("Vertical.TScrollbar", **scrollbar_look)
+        theme.map(
+            "Vertical.TScrollbar",
+            background=[("active", BUTTON_ACTIVE), ("pressed", BUTTON_ACTIVE)],
+        )
+
+    def create_training_settings(self):
+        """Build the "Training settings" tab.
+
+        Creates, top to bottom: the title, the Load/Save Settings buttons, one
+        labelled input per entry of ``settings_config`` (with a Browse button
+        for file and directory inputs), the FP8 checkboxes, the cache
+        checkbox, the Start/Stop Training buttons and the console with its
+        scrollbar.
+
+        Creates ``self.entries`` (setting key -> input widget), which the
+        other tab builders extend, and ``self._dropdown_vars`` (setting key ->
+        ``tk.StringVar`` of the drop-down).
+        """
+        row = 0
+
+        ttk.Label(self.training_tab, text="Training Settings", font=("Arial", 12, "bold")).grid(
+            row=row, column=0, columnspan=3, pady=(10, 10)
+        )
+        row += 1
+
+        button_frame_top = ttk.Frame(self.training_tab)
+        button_frame_top.grid(row=row, column=0, columnspan=3, pady=5)
+        ttk.Button(button_frame_top, text="Load Settings", command=self.load_settings).pack(side=tk.LEFT, padx=10)
+        ttk.Button(button_frame_top, text="Save Settings", command=self.save_settings).pack(side=tk.LEFT, padx=10)
+        row += 1
+
+        # (label, settings key, input type) for each row of the form
+        settings_config = [
+            ("Dataset Config", "DATASET_CONFIG", "file"),
+            ("VAE Model", "VAE_MODEL", "file"),
+            ("Clip Model", "CLIP_MODEL", "file"),
+            ("T5 Model", "T5_MODEL", "file"),
+            ("Dit Model", "DIT_MODEL", "file"),
+            ("LoRA Output Dir", "LORA_OUTPUT_DIR", "directory"),
+            ("LoRA Name", "LORA_NAME", "text"),
+            ("Model Type", "MODEL_TYPE", "dropdown"),
+            ("Flow Shift", "FLOW_SHIFT", "float"),
+            ("Learning Rate", "LEARNING_RATE", "float"),
+            ("LoRA LR Ratio", "LORA_LR_RATIO", "int"),
+            ("Network Dim", "NETWORK_DIM", "int"),
+            ("Network Alpha", "NETWORK_ALPHA", "float"),
+            ("Max Train Epochs", "MAX_TRAIN_EPOCHS", "int"),
+            ("Save Every N Epochs", "SAVE_EVERY_N_EPOCHS", "int"),
+            ("Seed", "SEED", "int"),
+            ("Blocks Swap", "BLOCKS_SWAP", "int"),
+            ("Resume Training", "RESUME_TRAINING", "directory"),
+            ("Optimizer Type", "OPTIMIZER_TYPE", "dropdown"),
+            ("Optimizer Args", "OPTIMIZER_ARGS", "text"),
+        ]
+
+        # Choices for each drop-down setting
+        dropdown_options = {
+            "MODEL_TYPE": self.model_types,
+            "OPTIMIZER_TYPE": self.optimizer_types,
+        }
+
+        self.entries = {}
+        # Keep the drop-downs' StringVars referenced for the lifetime of the GUI:
+        # tkinter unsets the underlying Tcl variable when a Variable object is
+        # garbage-collected, which would detach it from its combobox.
+        self._dropdown_vars = {}
+
+        for label_text, key, input_type in settings_config:
+            ttk.Label(self.training_tab, text=f"{label_text}:").grid(
+                row=row, column=0, sticky=tk.W, padx=5, pady=2
+            )
+
+            if input_type == "dropdown":
+                options = dropdown_options[key]
+                var = tk.StringVar(value=self.settings[key])
+                self._dropdown_vars[key] = var
+                self.entries[key] = ttk.Combobox(
+                    self.training_tab, textvariable=var, values=options, state="readonly"
+                )
+                self.entries[key].current(options.index(self.settings[key]))
+            else:
+                self.entries[key] = ttk.Entry(self.training_tab, width=40)
+                self.entries[key].insert(0, self.settings[key])
+
+            self.entries[key].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+
+            if input_type in ["file", "directory"]:
+                # key/input_type are bound as lambda defaults so that each button
+                # keeps its own values instead of the loop's last ones
+                ttk.Button(
+                    self.training_tab,
+                    text="Browse",
+                    command=lambda k=key, t=input_type: self.browse_file(k, t)
+                ).grid(row=row, column=2, sticky=tk.W, padx=5)
+
+            row += 1
+
+        # Weight Optimization Checkboxes
+        ttk.Label(self.training_tab, text="Weight Optimization:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.fp8_var = tk.BooleanVar(value=self.settings["FP8"])
+        self.scaled_var = tk.BooleanVar(value=self.settings["SCALED"])
+        self.fp8_check = ttk.Checkbutton(self.training_tab, text="FP8 Base", variable=self.fp8_var, command=self.toggle_scaled)
+        self.fp8_check.grid(row=row, column=1, sticky=tk.W, padx=5, pady=2)
+        # "FP8 Scaled" is only selectable while "FP8 Base" is checked
+        self.scaled_check = ttk.Checkbutton(self.training_tab, text="FP8 Scaled", variable=self.scaled_var, state=tk.DISABLED if not self.fp8_var.get() else tk.NORMAL)
+        # Shares the grid cell with the FP8 checkbox; the larger padx shifts it to the right
+        self.scaled_check.grid(row=row, column=1, sticky=tk.W, padx=100, pady=2)
+        row += 1
+
+        self.enable_cache_var = tk.BooleanVar(value=True)
+        ttk.Checkbutton(
+            self.training_tab, text="Enable Cache Preparation", variable=self.enable_cache_var
+        ).grid(row=row, column=0, columnspan=3, pady=5)
+        row += 1
+
+        button_frame = ttk.Frame(self.training_tab)
+        button_frame.grid(row=row, column=0, columnspan=3, pady=10)
+
+        ttk.Button(button_frame, text="Start Training", command=self.start_training).pack(side=tk.LEFT, padx=10)
+        ttk.Button(button_frame, text="Stop Training", command=self.stop_training).pack(side=tk.LEFT, padx=10)
+        row += 1
+
+        # Console: a disabled Text widget plus a vertical scrollbar
+        self.console_frame = ttk.Frame(self.training_tab)
+        self.console_frame.grid(row=row, column=0, columnspan=3, padx=5, pady=5, sticky="nsew")
+
+        self.console_output = tk.Text(
+            self.console_frame,
+            height=10,
+            width=80,
+            bg=ENTRY_BG,
+            fg=FG_COLOR,
+            wrap="word",
+            state="disabled",
+            selectbackground="white",
+            selectforeground="black"
+        )
+        self.console_output.grid(row=0, column=0, sticky="nsew")
+
+        self.console_scrollbar = ttk.Scrollbar(
+            self.console_frame,
+            orient="vertical",
+            command=self.console_output.yview,
+            style="Vertical.TScrollbar"
+        )
+        self.console_scrollbar.grid(row=0, column=1, sticky="ns")
+
+        self.console_output.configure(yscrollcommand=self.console_scrollbar.set)
+
+        self.console_output.bind("<MouseWheel>", self.on_mousewheel)
+        self.console_output.bind("<Button-4>", self.on_mousewheel)  # For Linux
+        self.console_output.bind("<Button-5>", self.on_mousewheel)  # For Linux
+        self.console_output.bind("<Button-3>", self.show_context_menu)
+
+        # Let the console row and the input column absorb extra window space
+        self.training_tab.grid_rowconfigure(row, weight=1)
+        self.training_tab.grid_columnconfigure(1, weight=1)
+        self.console_frame.grid_rowconfigure(0, weight=1)
+        self.console_frame.grid_columnconfigure(0, weight=1)
+
+    def toggle_scaled(self):
+        """Sync the "FP8 Scaled" checkbox with the "FP8 Base" checkbox.
+
+        With FP8 checked the Scaled checkbox becomes selectable; with FP8
+        unchecked it is disabled and its value is reset to False.
+        """
+        fp8_enabled = self.fp8_var.get()
+        self.scaled_check.config(state=tk.NORMAL if fp8_enabled else tk.DISABLED)
+        if not fp8_enabled:
+            self.scaled_var.set(False)
+
+    def create_advanced_settings(self):
+        """Build the "Advanced settings" tab.
+
+        Adds one labelled row per advanced option (attention mechanism,
+        logging, CPU offloading, LR scheduler, timestep sampling, weighting
+        scheme and metadata fields) and registers each input in
+        ``self.entries`` under its settings key. ``self.entries`` must already
+        exist; it is created by ``create_training_settings``.
+        """
+        row = 0
+
+        ttk.Label(self.advanced_tab, text="Advanced Settings", font=("Arial", 12, "bold")).grid(
+            row=row, column=0, columnspan=3, pady=(10, 10)
+        )
+        row += 1
+
+        # Attention Mechanism
+        ttk.Label(self.advanced_tab, text="Attention Mechanism:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.attention_var = tk.StringVar(value=self.settings["ATTENTION_MECHANISM"])
+        attention_options = ["none", "sdpa", "flash_attn", "sage_attn", "xformers", "flash3", "split_attn"]
+        self.entries["ATTENTION_MECHANISM"] = ttk.Combobox(self.advanced_tab, textvariable=self.attention_var, values=attention_options, state="readonly")
+        self.entries["ATTENTION_MECHANISM"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        # Logging
+        ttk.Label(self.advanced_tab, text="Logging Directory:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.entries["LOGGING_DIR"] = ttk.Entry(self.advanced_tab, width=40)
+        self.entries["LOGGING_DIR"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        ttk.Button(self.advanced_tab, text="Browse", command=lambda: self.browse_directory("LOGGING_DIR")).grid(row=row, column=2, sticky=tk.W, padx=5)
+        row += 1
+
+        ttk.Label(self.advanced_tab, text="Log With:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.log_with_var = tk.StringVar(value=self.settings["LOG_WITH"])
+        log_with_options = ["none", "tensorboard", "wandb", "all"]
+        self.entries["LOG_WITH"] = ttk.Combobox(self.advanced_tab, textvariable=self.log_with_var, values=log_with_options, state="readonly")
+        self.entries["LOG_WITH"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        ttk.Label(self.advanced_tab, text="Log Prefix:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.entries["LOG_PREFIX"] = ttk.Entry(self.advanced_tab, width=40)
+        self.entries["LOG_PREFIX"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        # Memory Management
+        self.img_in_txt_in_offloading_var = tk.BooleanVar(value=self.settings["IMG_IN_TXT_IN_OFFLOADING"])
+        ttk.Checkbutton(self.advanced_tab, text="Offload img_in and txt_in to CPU", variable=self.img_in_txt_in_offloading_var).grid(row=row, column=0, columnspan=3, pady=5)
+        # Unlike the other entries, this one stores the BooleanVar itself, not a widget
+        self.entries["IMG_IN_TXT_IN_OFFLOADING"] = self.img_in_txt_in_offloading_var
+        row += 1
+
+        # Learning Rate Scheduler
+        ttk.Label(self.advanced_tab, text="LR Scheduler:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.lr_scheduler_var = tk.StringVar(value=self.settings["LR_SCHEDULER"])
+        lr_scheduler_options = ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup", "adafactor"]
+        self.entries["LR_SCHEDULER"] = ttk.Combobox(self.advanced_tab, textvariable=self.lr_scheduler_var, values=lr_scheduler_options, state="readonly")
+        self.entries["LR_SCHEDULER"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        ttk.Label(self.advanced_tab, text="LR Warmup Steps:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.entries["LR_WARMUP_STEPS"] = ttk.Entry(self.advanced_tab, width=40)
+        self.entries["LR_WARMUP_STEPS"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        ttk.Label(self.advanced_tab, text="LR Decay Steps:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.entries["LR_DECAY_STEPS"] = ttk.Entry(self.advanced_tab, width=40)
+        self.entries["LR_DECAY_STEPS"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        # Timestep Sampling
+        ttk.Label(self.advanced_tab, text="Timestep Sampling:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.timestep_sampling_var = tk.StringVar(value=self.settings["TIMESTEP_SAMPLING"])
+        timestep_sampling_options = ["sigma", "uniform", "sigmoid", "shift"]
+        self.entries["TIMESTEP_SAMPLING"] = ttk.Combobox(self.advanced_tab, textvariable=self.timestep_sampling_var, values=timestep_sampling_options, state="readonly")
+        self.entries["TIMESTEP_SAMPLING"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        # The only plain entry on this tab that is pre-filled from self.settings
+        ttk.Label(self.advanced_tab, text="Discrete Flow Shift:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.entries["DISCRETE_FLOW_SHIFT"] = ttk.Entry(self.advanced_tab, width=40)
+        self.entries["DISCRETE_FLOW_SHIFT"].insert(0, self.settings["DISCRETE_FLOW_SHIFT"])
+        self.entries["DISCRETE_FLOW_SHIFT"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        # Weighting Scheme
+        ttk.Label(self.advanced_tab, text="Weighting Scheme:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.weighting_scheme_var = tk.StringVar(value=self.settings["WEIGHTING_SCHEME"])
+        weighting_scheme_options = ["logit_normal", "mode", "cosmap", "sigma_sqrt", "none"]
+        self.entries["WEIGHTING_SCHEME"] = ttk.Combobox(self.advanced_tab, textvariable=self.weighting_scheme_var, values=weighting_scheme_options, state="readonly")
+        self.entries["WEIGHTING_SCHEME"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        # Metadata
+        ttk.Label(self.advanced_tab, text="Metadata Title:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.entries["METADATA_TITLE"] = ttk.Entry(self.advanced_tab, width=40)
+        self.entries["METADATA_TITLE"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        ttk.Label(self.advanced_tab, text="Metadata Author:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.entries["METADATA_AUTHOR"] = ttk.Entry(self.advanced_tab, width=40)
+        self.entries["METADATA_AUTHOR"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        ttk.Label(self.advanced_tab, text="Metadata Description:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.entries["METADATA_DESCRIPTION"] = ttk.Entry(self.advanced_tab, width=40)
+        self.entries["METADATA_DESCRIPTION"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        ttk.Label(self.advanced_tab, text="Metadata License:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.entries["METADATA_LICENSE"] = ttk.Entry(self.advanced_tab, width=40)
+        self.entries["METADATA_LICENSE"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        ttk.Label(self.advanced_tab, text="Metadata Tags:").grid(row=row, column=0, sticky=tk.W, padx=5, pady=2)
+        self.entries["METADATA_TAGS"] = ttk.Entry(self.advanced_tab, width=40)
+        self.entries["METADATA_TAGS"].grid(row=row, column=1, sticky=tk.EW, padx=5, pady=2)
+        row += 1
+
+        # Let the input column expand automatically with the window
+        self.advanced_tab.grid_columnconfigure(1, weight=1)
+
+    def create_conversion_settings(self):
+        """Build the "LoRA Conversion" tab.
+
+        Lays out the input file, output directory and converted-name fields
+        (pre-filled from ``self.settings``) plus the Convert button, and
+        registers the three entry widgets in ``self.entries`` so they are
+        available under their settings keys.
+        """
+        tab = self.conversion_tab
+        label_cell = dict(column=0, sticky=tk.W, padx=5, pady=2)
+        entry_cell = dict(column=1, sticky=tk.EW, padx=5, pady=2)
+        browse_cell = dict(column=2, sticky=tk.W, padx=5)
+
+        # Row 0: tab title
+        ttk.Label(tab, text="LoRA Conversion Settings", font=("Arial", 12, "bold")).grid(
+            row=0, column=0, columnspan=3, pady=(10, 10)
+        )
+
+        # Row 1: input LoRA file
+        ttk.Label(tab, text="Input LoRA File:").grid(row=1, **label_cell)
+        self.input_lora_entry = ttk.Entry(tab, width=40)
+        self.input_lora_entry.grid(row=1, **entry_cell)
+        self.input_lora_entry.insert(0, self.settings["INPUT_LORA"])
+        ttk.Button(tab, text="Browse", command=self.browse_input_lora).grid(row=1, **browse_cell)
+
+        # Row 2: output directory
+        ttk.Label(tab, text="Output Directory:").grid(row=2, **label_cell)
+        self.output_dir_entry = ttk.Entry(tab, width=40)
+        self.output_dir_entry.grid(row=2, **entry_cell)
+        self.output_dir_entry.insert(0, self.settings["OUTPUT_DIR"])
+        ttk.Button(tab, text="Browse", command=self.browse_output_dir).grid(row=2, **browse_cell)
+
+        # Row 3: name of the converted LoRA
+        ttk.Label(tab, text="Converted LoRA Name:").grid(row=3, **label_cell)
+        self.converted_lora_name_entry = ttk.Entry(tab, width=40)
+        self.converted_lora_name_entry.grid(row=3, **entry_cell)
+        self.converted_lora_name_entry.insert(0, self.settings["CONVERTED_LORA_NAME"])
+
+        # Row 4: convert button spanning all columns
+        ttk.Button(tab, text="Convert", command=self.convert_lora).grid(row=4, column=0, columnspan=3, pady=10)
+
+        # The entry column takes up any extra horizontal space
+        tab.grid_columnconfigure(1, weight=1)
+
+        # Register the entries under their settings keys for saving/loading
+        self.entries["INPUT_LORA"] = self.input_lora_entry
+        self.entries["OUTPUT_DIR"] = self.output_dir_entry
+        self.entries["CONVERTED_LORA_NAME"] = self.converted_lora_name_entry
+
+    def show_context_menu(self, event):
+        """Show the console context menu at the pointer position on right-click.
+
+        Bound to ``<Button-3>`` on the console; ``event.x_root`` /
+        ``event.y_root`` give the position passed to the popup.
+        """
+        try:
+            self.context_menu.tk_popup(event.x_root, event.y_root)
+        finally:
+            # Release the menu's grab even if showing the popup raised
+            self.context_menu.grab_release()
+
+    def copy_selected_text(self):
+        """Copy the currently selected text to the clipboard.
+
+        Does nothing when there is no selection or the selection is empty; in
+        that case the clipboard is left untouched.
+        """
+        try:
+            # selection_get() raises TclError when no selection exists
+            selected = self.console_output.selection_get()
+        except tk.TclError:
+            return
+        if not selected:
+            return
+        self.master.clipboard_clear()
+        self.master.clipboard_append(selected)
+
+    def browse_directory(self, setting_name):
+        """Ask for a directory and put the chosen path into the entry of ``setting_name``.
+
+        The entry is left unchanged when the dialog returns an empty path.
+        """
+        chosen = filedialog.askdirectory()
+        if not chosen:
+            return
+        self.entries[setting_name].delete(0, tk.END)
+        self.entries[setting_name].insert(0, chosen)
+
+    def on_mousewheel(self, event):
+        """Record whether the console view is away from the bottom after a scroll event.
+
+        ``self.user_scrolled`` becomes True while the end of the visible range
+        reported by ``yview()`` is below 1.0, and False otherwise. ``event``
+        is unused.
+        """
+        visible_end = self.console_output.yview()[1]
+        self.user_scrolled = visible_end < 1.0
+
+    def update_console(self, line):
+        """Append ``line`` to the console widget.
+
+        The widget is switched to the "normal" state for the insertion and back
+        to "disabled" afterwards. The view follows the new text only while
+        ``user_scrolled`` is False.
+        """
+        console = self.console_output
+        console.configure(state="normal")
+        console.insert(tk.END, line)
+        follow_output = not self.user_scrolled
+        if follow_output:
+            console.yview(tk.END)
+        console.configure(state="disabled")
+
+    def browse_file(self, setting_name, input_type):
+        """Pick a path for ``setting_name`` and mirror it into settings and the entry.
+
+        A directory dialog is used when ``input_type`` is "directory"; any other
+        value opens a file dialog. Cancelling the dialog changes nothing.
+        """
+        if input_type == "directory":
+            open_dialog = filedialog.askdirectory
+        else:
+            open_dialog = filedialog.askopenfilename
+        chosen_path = open_dialog()
+        if not chosen_path:
+            return
+        self.settings[setting_name] = chosen_path
+        entry = self.entries[setting_name]
+        entry.delete(0, tk.END)
+        entry.insert(0, self.settings[setting_name])
+
+    def browse_input_lora(self):
+        """Let the user pick a ``*.safetensors`` LoRA and show it in the input entry."""
+        lora_file = filedialog.askopenfilename(filetypes=[("LoRA files", "*.safetensors")])
+        if not lora_file:
+            return  # dialog cancelled: keep the current entry text
+        entry = self.input_lora_entry
+        entry.delete(0, tk.END)
+        entry.insert(0, lora_file)
+
+    def browse_output_dir(self):
+        """Let the user pick the conversion output directory and show it in its entry."""
+        chosen_dir = filedialog.askdirectory()
+        if not chosen_dir:
+            return  # dialog cancelled: keep the current entry text
+        entry = self.output_dir_entry
+        entry.delete(0, tk.END)
+        entry.insert(0, chosen_dir)
+
+    def convert_lora(self):
+        """Convert the LoRA chosen on the conversion tab by running ``convert_lora.py``.
+
+        The three conversion fields are read with surrounding whitespace
+        removed; if any of them is empty an error dialog is shown and nothing is
+        started. The output file is ``<output dir>/<name>.safetensors``; the
+        extension is appended only when the name does not already end with it.
+        """
+        input_path = self.input_lora_entry.get().strip()
+        output_dir = self.output_dir_entry.get().strip()
+        converted_name = self.converted_lora_name_entry.get().strip()
+
+        if not (input_path and output_dir and converted_name):
+            messagebox.showerror("Error", "Please fill in all fields.")
+            return
+
+        # Avoid producing "name.safetensors.safetensors" when the user typed
+        # the extension as part of the name.
+        extension = ".safetensors"
+        if not converted_name.lower().endswith(extension):
+            converted_name += extension
+        output_path = os.path.join(output_dir, converted_name)
+
+        command = [
+            sys.executable, "convert_lora.py",
+            "--input", input_path,
+            "--output", output_path,
+            "--target", "other"
+        ]
+
+        self.run_subprocess(command, "Conversion")
+
+    def run_subprocess(self, cmd, name, callback=None):
+        """Run ``cmd`` in the background and stream its output into the console.
+
+        Args:
+            cmd: Argument list handed to ``subprocess.Popen``.
+            name: Label prefixed to every console line produced by the process.
+            callback: Optional callable run on the Tk main loop after the
+                process has exited with return code 0. It is not run when the
+                process fails or is stopped, so a chained pipeline does not
+                continue past a failed or cancelled stage.
+
+        The process is stored in ``self.current_process`` while it runs. If it
+        cannot be started, the error is written to the console and the method
+        returns without calling ``callback``.
+        """
+        env = os.environ.copy()
+        env["PYTHONIOENCODING"] = "utf-8"  # ask Python children to emit UTF-8
+
+        # Start the child in its own process group/session so that it can be
+        # signalled as a group later. start_new_session replaces
+        # preexec_fn=os.setsid, which is not safe once threads are running.
+        popen_kwargs = {}
+        if os.name == 'nt':
+            popen_kwargs["creationflags"] = subprocess.CREATE_NEW_PROCESS_GROUP
+        else:
+            popen_kwargs["start_new_session"] = True
+
+        try:
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,  # decode output to str
+                bufsize=1,  # line buffering
+                encoding='utf-8',
+                errors='replace',  # a stray non-UTF-8 byte must not kill the reader thread
+                env=env,
+                **popen_kwargs,
+            )
+        except OSError as exc:
+            # e.g. the executable is not installed or not on PATH
+            self.master.after(0, self.update_console, f"{name} failed to start: {exc}\n")
+            return
+
+        self.current_process = process
+        if os.name == 'nt':
+            self.process_group_id = process.pid
+
+        def read_output(pipe, output_type):
+            """Forward the pipe to the console line by line until EOF."""
+            with pipe:
+                for line in iter(pipe.readline, ""):
+                    self.master.after(0, self.update_console, f"{name} {output_type}: {line}")
+
+        readers = [
+            threading.Thread(target=read_output, args=(process.stdout, "STDOUT"), daemon=True),
+            threading.Thread(target=read_output, args=(process.stderr, "STDERR"), daemon=True),
+        ]
+        for reader in readers:
+            reader.start()
+
+        def finish(returncode):
+            """Report the result and continue the chain; runs on the Tk main loop."""
+            if returncode == 0:
+                self.update_console(f"{name} process completed.\n")
+            else:
+                self.update_console(f"{name} process exited with code {returncode}.\n")
+            # Do not clear a newer process that was started in the meantime.
+            if self.current_process is process:
+                self.current_process = None
+            if callback and returncode == 0:
+                callback()
+
+        def check_process():
+            """Wait for the process to exit, then hand over to the main loop."""
+            returncode = process.wait()
+            # Give the readers a moment to queue the last lines; the timeout
+            # keeps a pipe held open by a grandchild from blocking forever.
+            for reader in readers:
+                reader.join(timeout=1)
+            self.master.after(0, finish, returncode)
+
+        threading.Thread(target=check_process, daemon=True).start()
+
+    def start_training(self):
+        """Collect the form values, build the training command and launch it.
+
+        Nothing is started when the unsupported 'came' optimizer is selected or
+        when a numeric field cannot be parsed; a dialog is shown in both cases.
+        With caching enabled, latent caching and text encoder caching are run
+        first and training is chained after them through ``run_subprocess``
+        callbacks; otherwise training starts directly.
+        """
+        # Check for unsupported optimizer
+        optimizer_type = self.entries["OPTIMIZER_TYPE"].get()
+        if optimizer_type == "came":
+            messagebox.showwarning(
+                "Предупреждение",
+                "Оптимизатор 'came' не поддерживается в текущей версии. Пожалуйста, выберите другой оптимизатор, например 'adamw' или 'adamw8bit'."
+            )
+            return
+
+        # Update settings from entries. The dict literal is evaluated completely
+        # before update() runs, so a field that fails to parse leaves
+        # self.settings untouched.
+        try:
+            self.settings.update({
+                "MODEL_TYPE": self.entries["MODEL_TYPE"].get(),
+                "FLOW_SHIFT": float(self.entries["FLOW_SHIFT"].get()),
+                "LEARNING_RATE": float(self.entries["LEARNING_RATE"].get()),
+                "LORA_LR_RATIO": int(self.entries["LORA_LR_RATIO"].get()),
+                "NETWORK_DIM": int(self.entries["NETWORK_DIM"].get()),
+                "NETWORK_ALPHA": float(self.entries["NETWORK_ALPHA"].get()),
+                "MAX_TRAIN_EPOCHS": int(self.entries["MAX_TRAIN_EPOCHS"].get()),
+                "SAVE_EVERY_N_EPOCHS": int(self.entries["SAVE_EVERY_N_EPOCHS"].get()),
+                "SEED": int(self.entries["SEED"].get()),
+                "BLOCKS_SWAP": int(self.entries["BLOCKS_SWAP"].get()),
+                "DATASET_CONFIG": self.entries["DATASET_CONFIG"].get(),
+                "VAE_MODEL": self.entries["VAE_MODEL"].get(),
+                "CLIP_MODEL": self.entries["CLIP_MODEL"].get(),
+                "T5_MODEL": self.entries["T5_MODEL"].get(),
+                "DIT_MODEL": self.entries["DIT_MODEL"].get(),
+                "LORA_OUTPUT_DIR": self.entries["LORA_OUTPUT_DIR"].get(),
+                "LORA_NAME": self.entries["LORA_NAME"].get(),
+                "RESUME_TRAINING": self.entries["RESUME_TRAINING"].get(),
+                "OPTIMIZER_TYPE": optimizer_type,
+                "OPTIMIZER_ARGS": self.entries["OPTIMIZER_ARGS"].get(),
+                "ATTENTION_MECHANISM": self.entries["ATTENTION_MECHANISM"].get(),
+                "LOGGING_DIR": self.entries["LOGGING_DIR"].get(),
+                "LOG_WITH": self.entries["LOG_WITH"].get(),
+                "LOG_PREFIX": self.entries["LOG_PREFIX"].get(),
+                "IMG_IN_TXT_IN_OFFLOADING": self.entries["IMG_IN_TXT_IN_OFFLOADING"].get(),
+                "LR_SCHEDULER": self.entries["LR_SCHEDULER"].get(),
+                "LR_WARMUP_STEPS": self.entries["LR_WARMUP_STEPS"].get(),
+                "LR_DECAY_STEPS": self.entries["LR_DECAY_STEPS"].get(),
+                "TIMESTEP_SAMPLING": self.entries["TIMESTEP_SAMPLING"].get(),
+                "DISCRETE_FLOW_SHIFT": self.entries["DISCRETE_FLOW_SHIFT"].get(),
+                "WEIGHTING_SCHEME": self.entries["WEIGHTING_SCHEME"].get(),
+                "METADATA_TITLE": self.entries["METADATA_TITLE"].get(),
+                "METADATA_AUTHOR": self.entries["METADATA_AUTHOR"].get(),
+                "METADATA_DESCRIPTION": self.entries["METADATA_DESCRIPTION"].get(),
+                "METADATA_LICENSE": self.entries["METADATA_LICENSE"].get(),
+                "METADATA_TAGS": self.entries["METADATA_TAGS"].get(),
+                "FP8": self.fp8_var.get(),
+                "SCALED": self.scaled_var.get()
+            })
+        except ValueError as exc:
+            # float()/int() rejected a field: tell the user instead of letting
+            # the exception escape from the button callback.
+            messagebox.showerror("Error", f"Invalid numeric setting: {exc}")
+            return
+
+        # Build training command
+        command = [
+            "accelerate", "launch",
+            "--num_cpu_threads_per_process", "2",
+            "--mixed_precision", "bf16",
+            "wan_train_network.py",
+            "--task", self.settings["MODEL_TYPE"],
+            "--dit", self.settings["DIT_MODEL"],
+            "--dataset_config", self.settings["DATASET_CONFIG"],
+            "--sdpa",
+            "--mixed_precision", "bf16",
+        ]
+
+        # Weight optimization flags: --fp8_scaled is only added together with --fp8_base
+        if self.settings["FP8"]:
+            command.append("--fp8_base")
+            if self.settings["SCALED"]:
+                command.append("--fp8_scaled")
+
+        command.extend([
+            "--blocks_to_swap", str(self.settings["BLOCKS_SWAP"]),
+            "--optimizer_type", self.settings["OPTIMIZER_TYPE"],
+            "--learning_rate", str(self.settings["LEARNING_RATE"]),
+            "--gradient_checkpointing",
+            "--max_data_loader_n_workers", "2",
+            "--persistent_data_loader_workers",
+            "--network_module", "networks.lora_wan",
+            "--network_dim", str(self.settings["NETWORK_DIM"]),
+            "--network_alpha", str(self.settings["NETWORK_ALPHA"]),
+            "--network_args", f"loraplus_lr_ratio={self.settings['LORA_LR_RATIO']}",
+            "--timestep_sampling", self.settings["TIMESTEP_SAMPLING"],
+            "--discrete_flow_shift", str(self.settings["DISCRETE_FLOW_SHIFT"]),
+            "--max_train_epochs", str(self.settings["MAX_TRAIN_EPOCHS"]),
+            "--save_every_n_epochs", str(self.settings["SAVE_EVERY_N_EPOCHS"]),
+            "--save_state",
+            "--seed", str(self.settings["SEED"]),
+            "--output_dir", self.settings["LORA_OUTPUT_DIR"],
+            "--output_name", self.settings["LORA_NAME"],
+        ])
+
+        def add_if_set(*setting_names):
+            """Append ``--<lowercased name> <value>`` for each non-empty setting."""
+            for setting_name in setting_names:
+                value = self.settings[setting_name]
+                if value:
+                    command.extend([f"--{setting_name.lower()}", value])
+
+        # Optional arguments, appended in a fixed order.
+        add_if_set("OPTIMIZER_ARGS")
+
+        attention = self.settings["ATTENTION_MECHANISM"]
+        if attention != "none":
+            command.append(f"--{attention}")
+
+        add_if_set("LOGGING_DIR")
+
+        log_with = self.settings["LOG_WITH"]
+        if log_with != "none":
+            command.extend(["--log_with", log_with])
+
+        add_if_set("LOG_PREFIX")
+
+        if self.settings["IMG_IN_TXT_IN_OFFLOADING"]:
+            command.append("--img_in_txt_in_offloading")
+
+        add_if_set("LR_SCHEDULER", "LR_WARMUP_STEPS", "LR_DECAY_STEPS")
+
+        weighting_scheme = self.settings["WEIGHTING_SCHEME"]
+        if weighting_scheme != "none":
+            command.extend(["--weighting_scheme", weighting_scheme])
+
+        add_if_set(
+            "METADATA_TITLE",
+            "METADATA_AUTHOR",
+            "METADATA_DESCRIPTION",
+            "METADATA_LICENSE",
+            "METADATA_TAGS",
+        )
+
+        if self.settings["RESUME_TRAINING"].strip():
+            command.append(f"--resume={self.settings['RESUME_TRAINING']}")
+
+        cache_preparation_command = [
+            sys.executable, "wan_cache_latents.py",
+            "--dataset_config", self.settings["DATASET_CONFIG"],
+            "--vae", self.settings["VAE_MODEL"],
+            "--clip", self.settings["CLIP_MODEL"]
+        ]
+
+        text_encoder_caching_command = [
+            sys.executable, "wan_cache_text_encoder_outputs.py",
+            "--dataset_config", self.settings["DATASET_CONFIG"],
+            "--t5", self.settings["T5_MODEL"],
+            "--batch_size", "16",
+            "--fp8_t5"
+        ]
+
+        # Start every run with an empty console.
+        self.console_output.configure(state="normal")
+        self.console_output.delete(1.0, tk.END)
+        self.console_output.configure(state="disabled")
+
+        if not self.enable_cache_var.get():
+            self.update_console("Starting training without caching...\n")
+            self.run_subprocess(command, "Training")
+            return
+
+        def start_training_stage():
+            self.update_console("Text encoder caching completed.\nStarting training...\n")
+            self.run_subprocess(command, "Training")
+
+        def start_text_encoder_stage():
+            self.update_console("Cache preparation completed.\nStarting text encoder caching...\n")
+            self.run_subprocess(text_encoder_caching_command, "Text Encoder Caching", on_text_encoder_caching_complete)
+
+        # The completion callbacks may be invoked from a worker thread, so each
+        # stage is handed to the Tk main loop before it touches any widget.
+        def on_text_encoder_caching_complete():
+            self.master.after(0, start_training_stage)
+
+        def on_cache_preparation_complete():
+            self.master.after(0, start_text_encoder_stage)
+
+        self.update_console("Starting cache preparation...\n")
+        self.run_subprocess(cache_preparation_command, "Cache Preparation", on_cache_preparation_complete)
+
+    def stop_training(self):
+        """Stop the currently running subprocess, if there is one.
+
+        A polite stop is requested first (CTRL_BREAK on Windows, SIGTERM to the
+        process group elsewhere). If the process has not exited after five
+        seconds it is killed; on POSIX the whole process group is killed so
+        that children of the launcher do not keep running.
+        """
+        # Work on a local reference: the watcher thread started by
+        # run_subprocess resets self.current_process to None as soon as the
+        # process exits, which can happen while this method is still running.
+        process = self.current_process
+        if process is None or process.poll() is not None:
+            self.update_console("No active process to stop\n")
+            return
+
+        try:
+            if os.name == 'nt':
+                process.send_signal(signal.CTRL_BREAK_EVENT)
+            else:
+                os.killpg(os.getpgid(process.pid), signal.SIGTERM)
+        except Exception as e:
+            self.update_console("Error stopping process: " + str(e) + "\n")
+
+        try:
+            process.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            try:
+                if os.name == 'nt':
+                    process.kill()
+                else:
+                    try:
+                        os.killpg(os.getpgid(process.pid), signal.SIGKILL)
+                    except ProcessLookupError:
+                        pass  # exited between the timeout and the kill
+                process.wait()
+            except Exception as e:
+                self.update_console("Error killing process: " + str(e) + "\n")
+
+        self.current_process = None
+        # training_thread is not assigned anywhere in the visible code, so do
+        # not assume the attribute exists.
+        training_thread = getattr(self, "training_thread", None)
+        if training_thread:
+            training_thread.join(timeout=1)
+            self.training_thread = None
+        self.update_console("Training stopped\n")
+
+    def save_settings(self):
+        """Write the current form values to a JSON file chosen by the user.
+
+        Every object registered in ``self.entries`` contributes the result of
+        its ``get()``; the FP8, SCALED and ENABLE_CACHE flags are added from
+        their variables. Nothing is written when the dialog is cancelled.
+        """
+        snapshot = {key: widget.get() for key, widget in self.entries.items()}
+        snapshot["FP8"] = self.fp8_var.get()
+        snapshot["SCALED"] = self.scaled_var.get()
+        snapshot["ENABLE_CACHE"] = self.enable_cache_var.get()
+
+        destination = filedialog.asksaveasfilename(defaultextension=".json", filetypes=[("JSON files", "*.json")])
+        if not destination:
+            return
+        with open(destination, "w") as settings_file:
+            json.dump(snapshot, settings_file, indent=4)
+
+    def load_settings(self):
+        """Fill the form from a JSON file chosen by the user.
+
+        Keys without a matching object in ``self.entries`` are ignored, apart
+        from FP8, SCALED and ENABLE_CACHE, which are applied to their variables.
+        Nothing happens when the dialog is cancelled.
+        """
+        source = filedialog.askopenfilename(filetypes=[("JSON files", "*.json")])
+        if not source:
+            return
+
+        with open(source, "r") as settings_file:
+            loaded_settings = json.load(settings_file)
+
+        for key, value in loaded_settings.items():
+            if key not in self.entries:
+                continue
+            target = self.entries[key]
+            if isinstance(target, (ttk.Combobox, tk.BooleanVar)):
+                # Comboboxes and variables take the value through set().
+                target.set(value)
+            else:
+                # Anything else is treated as an entry: replace its text.
+                target.delete(0, tk.END)
+                target.insert(0, value)
+
+        flag_variables = (
+            ("FP8", self.fp8_var),
+            ("SCALED", self.scaled_var),
+            ("ENABLE_CACHE", self.enable_cache_var),
+        )
+        for flag_name, variable in flag_variables:
+            if flag_name in loaded_settings:
+                variable.set(loaded_settings[flag_name])
+
+        self.toggle_scaled()  # Update Scaled checkbox state based on FP8
+
+if __name__ == "__main__":
+    # Create the main window and run the GUI only when this file is executed
+    # as a script, so that importing the module does not open a window and
+    # block in the Tk event loop.
+    root = tk.Tk()
+    gui = LoRATrainerGUI(root)
+    root.mainloop()
\ No newline at end of file
diff --git a/wan_train_network.py b/wan_train_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..b01f48e6d855b6e03aa5df60874a8e0eb8689e55
--- /dev/null
+++ b/wan_train_network.py
@@ -0,0 +1,419 @@
+import argparse
+from typing import Optional
+from PIL import Image
+
+import torch
+import torchvision.transforms.functional as TF
+from tqdm import tqdm
+from accelerate import Accelerator, init_empty_weights
+
+# --- FIX STARTS HERE ---
+import torch._dynamo
+torch._dynamo.config.suppress_errors = True
+# --- FIX ENDS HERE ---
+
+from dataset.image_video_dataset import ARCHITECTURE_WAN, ARCHITECTURE_WAN_FULL
+from hv_generate_video import resize_image_to_bucket
+from hv_train_network import NetworkTrainer, load_prompts, clean_memory_on_device, setup_parser_common, read_config_from_file
+
+import logging
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+from utils import model_utils
+from utils.safetensors_utils import load_safetensors, MemoryEfficientSafeOpen
+from wan.configs import WAN_CONFIGS
+from wan.modules.clip import CLIPModel
+from wan.modules.model import WanModel, detect_wan_sd_dtype, load_wan_model
+from wan.modules.t5 import T5EncoderModel
+from wan.modules.vae import WanVAE
+from wan.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
+
+
+class WanNetworkTrainer(NetworkTrainer):
+    """Wan2.1-specific parts of network training built on ``NetworkTrainer``.
+
+    The class provides the architecture-dependent pieces: argument handling,
+    encoding of sample prompts, sample inference, model loading and the DiT
+    forward pass. These are presumably called by ``NetworkTrainer.train``,
+    which is defined in ``hv_train_network`` and not visible here.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    # region model specific
+
+    @property
+    def architecture(self) -> str:
+        """Short architecture identifier (``ARCHITECTURE_WAN``)."""
+        return ARCHITECTURE_WAN
+
+    @property
+    def architecture_full_name(self) -> str:
+        """Full architecture name (``ARCHITECTURE_WAN_FULL``)."""
+        return ARCHITECTURE_WAN_FULL
+
+    def handle_model_specific_args(self, args):
+        """Pick the Wan config for ``args.task`` and validate precision settings.
+
+        Sets ``self.config``, ``self._i2v_training`` and ``self.dit_dtype``, and
+        stores the detected DiT dtype as a string in ``args.dit_dtype``.
+
+        Raises:
+            AssertionError: if ``args.mixed_precision`` does not match fp16 or
+                bf16 DiT weights.
+            ValueError: if ``--fp8_scaled`` is requested for weights that are
+                already stored in a one-byte dtype.
+        """
+        self.config = WAN_CONFIGS[args.task]
+        # Tasks whose name contains "i2v" are treated as image-to-video training.
+        self._i2v_training = "i2v" in args.task
+
+        self.dit_dtype = detect_wan_sd_dtype(args.dit)
+
+        if self.dit_dtype == torch.float16:
+            assert args.mixed_precision in ["fp16", "no"], "DiT weights are in fp16, mixed precision must be fp16 or no"
+        elif self.dit_dtype == torch.bfloat16:
+            assert args.mixed_precision in ["bf16", "no"], "DiT weights are in bf16, mixed precision must be bf16 or no"
+
+        # itemsize == 1 means a one-byte dtype, i.e. weights that are already fp8
+        # (see the error message below).
+        if args.fp8_scaled and self.dit_dtype.itemsize == 1:
+            raise ValueError(
+                "DiT weights is already in fp8 format, cannot scale to fp8. Please use fp16/bf16 weights / DiTの重みはすでにfp8形式です。fp8にスケーリングできません。fp16/bf16の重みを使用してください"
+            )
+
+        args.dit_dtype = model_utils.dtype_to_str(self.dit_dtype)
+
+    @property
+    def i2v_training(self) -> bool:
+        """True when the selected task is an image-to-video task."""
+        return self._i2v_training
+
+    def process_sample_prompts(
+        self,
+        args: argparse.Namespace,
+        accelerator: Accelerator,
+        sample_prompts: str,
+    ):
+        """Pre-compute the text (and image) embeddings needed for sample generation.
+
+        The prompts are loaded with ``load_prompts(sample_prompts)``. Each
+        distinct prompt and negative prompt is encoded once with T5; each
+        distinct ``image_path`` is encoded once with CLIP. Prompts without a
+        negative prompt get ``self.config["sample_neg_prompt"]``.
+
+        Returns:
+            A list with one copy of each prompt dict, extended with
+            ``"t5_embeds"``, and with ``"negative_t5_embeds"`` and
+            ``"clip_embeds"`` when a negative prompt or an image path is present.
+        """
+        config = self.config
+        device = accelerator.device
+        t5_path, clip_path, fp8_t5 = args.t5, args.clip, args.fp8_t5
+
+        logger.info(f"cache Text Encoder outputs for sample prompt: {sample_prompts}")
+        prompts = load_prompts(sample_prompts)
+
+        def encode_for_text_encoder(text_encoder):
+            """Encode every distinct prompt string once and return {prompt: outputs}."""
+            sample_prompts_te_outputs = {}  # (prompt) -> (embeds, mask)
+            # with accelerator.autocast(), torch.no_grad(): # this causes NaN if dit_dtype is fp16
+            t5_dtype = config.t5_dtype
+            with torch.amp.autocast(device_type=device.type, dtype=t5_dtype), torch.no_grad():
+                for prompt_dict in prompts:
+                    # Fill in the default negative prompt; this mutates the loaded
+                    # prompt dict, so the later stages see it too.
+                    if "negative_prompt" not in prompt_dict:
+                        prompt_dict["negative_prompt"] = self.config["sample_neg_prompt"]
+                    for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", None)]:
+                        if p is None:
+                            continue
+                        if p not in sample_prompts_te_outputs:
+                            logger.info(f"cache Text Encoder outputs for prompt: {p}")
+
+                            prompt_outputs = text_encoder([p], device)
+                            sample_prompts_te_outputs[p] = prompt_outputs
+
+            return sample_prompts_te_outputs
+
+        # Load Text Encoder 1 and encode
+        logger.info(f"loading T5: {t5_path}")
+        t5 = T5EncoderModel(text_len=config.text_len, dtype=config.t5_dtype, device=device, weight_path=t5_path, fp8=fp8_t5)
+
+        logger.info("encoding with Text Encoder 1")
+        te_outputs_1 = encode_for_text_encoder(t5)
+        del t5
+
+        # load CLIP and encode image (for I2V training)
+        # The dict is first filled with the distinct image paths (value None) and
+        # then with the CLIP output for each path.
+        sample_prompts_image_embs = {}
+        for prompt_dict in prompts:
+            if prompt_dict.get("image_path", None) is not None:
+                sample_prompts_image_embs[prompt_dict["image_path"]] = None
+
+        if len(sample_prompts_image_embs) > 0:
+            logger.info(f"loading CLIP: {clip_path}")
+            assert clip_path is not None, "CLIP path is required for I2V training / I2V学習にはCLIPのパスが必要です"
+            clip = CLIPModel(dtype=config.clip_dtype, device=device, weight_path=clip_path)
+            clip.model.to(device)
+
+            logger.info("Encoding image to CLIP context")
+            with torch.amp.autocast(device_type=device.type, dtype=torch.float16), torch.no_grad():
+                for image_path in sample_prompts_image_embs:
+                    logger.info(f"Encoding image: {image_path}")
+                    img = Image.open(image_path).convert("RGB")
+                    img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(device)  # -1 to 1
+                    clip_context = clip.visual([img[:, None, :, :]])
+                    sample_prompts_image_embs[image_path] = clip_context
+
+            del clip
+            clean_memory_on_device(device)
+
+        # prepare sample parameters
+        sample_parameters = []
+        for prompt_dict in prompts:
+            prompt_dict_copy = prompt_dict.copy()
+
+            # The first element of the encoder output is used as the embedding.
+            p = prompt_dict.get("prompt", "")
+            prompt_dict_copy["t5_embeds"] = te_outputs_1[p][0]
+
+            p = prompt_dict.get("negative_prompt", None)
+            if p is not None:
+                prompt_dict_copy["negative_t5_embeds"] = te_outputs_1[p][0]
+
+            p = prompt_dict.get("image_path", None)
+            if p is not None:
+                prompt_dict_copy["clip_embeds"] = sample_prompts_image_embs[p]
+
+            sample_parameters.append(prompt_dict_copy)
+
+        clean_memory_on_device(accelerator.device)
+
+        return sample_parameters
+
+    def do_inference(
+        self,
+        accelerator,
+        args,
+        sample_parameter,
+        vae,
+        dit_dtype,
+        transformer,
+        discrete_flow_shift,
+        sample_steps,
+        width,
+        height,
+        frame_count,
+        generator,
+        do_classifier_free_guidance,
+        guidance_scale,
+        cfg_scale,
+        image_path=None,
+    ):
+        """architecture dependent inference
+
+        Generates one sample video for ``sample_parameter`` (an item produced by
+        ``process_sample_prompts``): noise is denoised for ``sample_steps`` steps
+        with ``FlowUniPCMultistepScheduler`` and the result is decoded by the VAE.
+        For I2V tasks the image at ``image_path`` is encoded by the VAE and passed
+        to the model as conditioning together with the CLIP embedding.
+
+        ``cfg_scale`` defaults to 5.0 when None; classifier-free guidance is
+        skipped when it equals 1.0. ``args`` and ``guidance_scale`` are not used
+        by this implementation.
+
+        Returns:
+            The decoded video as a float32 CPU tensor with values in [0, 1] and
+            a leading batch dimension of size 1.
+        """
+        model: WanModel = transformer
+        device = accelerator.device
+        if cfg_scale is None:
+            cfg_scale = 5.0
+        do_classifier_free_guidance = do_classifier_free_guidance and cfg_scale != 1.0
+
+        # Calculate latent video length based on VAE version
+        latent_video_length = (frame_count - 1) // self.config["vae_stride"][0] + 1
+
+        # Get embeddings
+        context = sample_parameter["t5_embeds"].to(device=device)
+        if do_classifier_free_guidance:
+            context_null = sample_parameter["negative_t5_embeds"].to(device=device)
+        else:
+            context_null = None
+
+        num_channels_latents = 16  # model.in_dim
+        vae_scale_factor = self.config["vae_stride"][1]
+
+        # Initialize latents
+        # NOTE(review): `latents` is not read again in this method (sampling
+        # starts from `noise` below). The draws use `generator`, so removing
+        # them would presumably change the noise sampled afterwards for a given
+        # seed; the loop is therefore kept as is.
+        lat_h = height // vae_scale_factor
+        lat_w = width // vae_scale_factor
+        shape_or_frame = (1, num_channels_latents, 1, lat_h, lat_w)
+        latents = []
+        for _ in range(latent_video_length):
+            latents.append(torch.randn(shape_or_frame, generator=generator, device=device, dtype=dit_dtype))
+        latents = torch.cat(latents, dim=2)
+
+        if self.i2v_training:
+            # Move VAE to the appropriate device for sampling: consider to cache image latents in CPU in advance
+            vae.to(device)
+            vae.eval()
+
+            # Force three channels, as process_sample_prompts does: the padding
+            # below is built with 3 channels, so RGBA or grayscale input would
+            # otherwise fail in the concat.
+            image = Image.open(image_path).convert("RGB")
+            image = resize_image_to_bucket(image, (width, height))  # returns a numpy array
+            image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(1).float()  # C, 1, H, W
+            image = image / 127.5 - 1  # -1 to 1
+
+            # Create mask for the required number of frames
+            # Only the first frame is marked (1); it is repeated 4 times so the
+            # frame axis can be regrouped into chunks of 4 below.
+            msk = torch.ones(1, frame_count, lat_h, lat_w, device=device)
+            msk[:, 1:] = 0
+            msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
+            msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
+            msk = msk.transpose(1, 2)  # B, C, T, H, W
+
+            with torch.amp.autocast(device_type=device.type, dtype=vae.dtype), torch.no_grad():
+                # Zero padding for the required number of frames only
+                padding_frames = frame_count - 1  # The first frame is the input image
+                image = torch.concat([image, torch.zeros(3, padding_frames, height, width)], dim=1).to(device=device)
+                y = vae.encode([image])[0]
+
+            y = y[:, :latent_video_length]  # may be not needed
+            y = y.unsqueeze(0)  # add batch dim
+            image_latents = torch.concat([msk, y], dim=1)
+
+            vae.to("cpu")
+            clean_memory_on_device(device)
+        else:
+            image_latents = None
+
+        # use the default value for num_train_timesteps (1000)
+        scheduler = FlowUniPCMultistepScheduler(shift=1, use_dynamic_shifting=False)
+        scheduler.set_timesteps(sample_steps, device=device, shift=discrete_flow_shift)
+        timesteps = scheduler.timesteps
+
+        # Generate noise for the required number of frames only
+        # (drawn on `device`, then kept on the CPU between steps)
+        noise = torch.randn(16, latent_video_length, lat_h, lat_w, dtype=torch.float32, generator=generator, device=device).to(
+            "cpu"
+        )
+
+        # prepare the model input
+        max_seq_len = latent_video_length * lat_h * lat_w // (self.config.patch_size[1] * self.config.patch_size[2])
+        arg_c = {"context": [context], "seq_len": max_seq_len}
+        arg_null = {"context": [context_null], "seq_len": max_seq_len}
+
+        if self.i2v_training:
+            # I2V training
+            # The conditional and unconditional passes share the same image conditioning.
+            arg_c["clip_fea"] = sample_parameter["clip_embeds"].to(device=device, dtype=dit_dtype)
+            arg_c["y"] = image_latents
+            arg_null["clip_fea"] = arg_c["clip_fea"]
+            arg_null["y"] = image_latents
+
+        # Wrap the inner loop with tqdm to track progress over timesteps
+        prompt_idx = sample_parameter.get("enum", 0)
+        latent = noise
+        with torch.no_grad():
+            for i, t in enumerate(tqdm(timesteps, desc=f"Sampling timesteps for prompt {prompt_idx+1}")):
+                latent_model_input = [latent.to(device=device)]
+                timestep = t.unsqueeze(0)
+
+                # Predictions are moved to the CPU; guidance and the scheduler
+                # step below operate on CPU tensors.
+                with accelerator.autocast():
+                    noise_pred_cond = model(latent_model_input, t=timestep, **arg_c)[0].to("cpu")
+                    if do_classifier_free_guidance:
+                        noise_pred_uncond = model(latent_model_input, t=timestep, **arg_null)[0].to("cpu")
+                    else:
+                        noise_pred_uncond = None
+
+                if do_classifier_free_guidance:
+                    noise_pred = noise_pred_uncond + cfg_scale * (noise_pred_cond - noise_pred_uncond)
+                else:
+                    noise_pred = noise_pred_cond
+
+                temp_x0 = scheduler.step(noise_pred.unsqueeze(0), t, latent.unsqueeze(0), return_dict=False, generator=generator)[0]
+                latent = temp_x0.squeeze(0)
+
+        # Move VAE to the appropriate device for sampling
+        vae.to(device)
+        vae.eval()
+
+        # Decode latents to video
+        logger.info(f"Decoding video from latents: {latent.shape}")
+        latent = latent.unsqueeze(0)  # add batch dim
+        latent = latent.to(device=device)
+
+        with torch.amp.autocast(device_type=device.type, dtype=vae.dtype), torch.no_grad():
+            video = vae.decode(latent)[0]  # vae returns list
+        video = video.unsqueeze(0)  # add batch dim
+        del latent
+
+        logger.info("Decoding complete")
+        video = video.to(torch.float32).cpu()
+        video = (video / 2 + 0.5).clamp(0, 1)  # -1 to 1 -> 0 to 1
+
+        vae.to("cpu")
+        clean_memory_on_device(device)
+
+        return video
+
+    def load_vae(self, args: argparse.Namespace, vae_dtype: torch.dtype, vae_path: str):
+        """Create the Wan VAE on the CPU.
+
+        The ``vae_path`` argument is ignored; the path is always taken from
+        ``args.vae``. With ``args.vae_cache_cpu`` the VAE is given a CPU cache
+        device, otherwise none.
+        """
+        vae_path = args.vae
+
+        logger.info(f"Loading VAE model from {vae_path}")
+        cache_device = torch.device("cpu") if args.vae_cache_cpu else None
+        vae = WanVAE(vae_path=vae_path, device="cpu", dtype=vae_dtype, cache_device=cache_device)
+        return vae
+
+    def load_transformer(
+        self,
+        accelerator: Accelerator,
+        args: argparse.Namespace,
+        dit_path: str,
+        attn_mode: str,
+        split_attn: bool,
+        loading_device: str,
+        dit_weight_dtype: Optional[torch.dtype],
+    ):
+        """Load the Wan DiT through ``load_wan_model`` and return it.
+
+        The current config, the I2V flag, ``accelerator.device`` and
+        ``args.fp8_scaled`` are forwarded together with the given arguments.
+        """
+        model = load_wan_model(
+            self.config,
+            self.i2v_training,
+            accelerator.device,
+            dit_path,
+            attn_mode,
+            split_attn,
+            loading_device,
+            dit_weight_dtype,
+            args.fp8_scaled,
+        )
+        return model
+
+    def scale_shift_latents(self, latents):
+        """Return ``latents`` unchanged; no scaling or shifting is applied here."""
+        return latents
+
+    def call_dit(
+        self,
+        args: argparse.Namespace,
+        accelerator: Accelerator,
+        transformer,
+        latents: torch.Tensor,
+        batch: dict[str, torch.Tensor],
+        noise: torch.Tensor,
+        noisy_model_input: torch.Tensor,
+        timesteps: torch.Tensor,
+        network_dtype: torch.dtype,
+    ):
+        """Run the DiT on a training batch and build the flow matching target.
+
+        Reads ``batch["t5"]`` and, for I2V training, ``batch["latents_image"]``
+        and ``batch["clip"]``.
+
+        Returns:
+            ``(model_pred, target)`` where ``model_pred`` is the model output
+            stacked into one tensor and ``target`` is ``noise - latents``.
+        """
+        model: WanModel = transformer
+
+        # I2V training
+        if self.i2v_training:
+            image_latents = batch["latents_image"]
+            clip_fea = batch["clip"]
+            image_latents = image_latents.to(device=accelerator.device, dtype=network_dtype)
+            clip_fea = clip_fea.to(device=accelerator.device, dtype=network_dtype)
+        else:
+            image_latents = None
+            clip_fea = None
+
+        context = [t.to(device=accelerator.device, dtype=network_dtype) for t in batch["t5"]]
+
+        # ensure the hidden state will require grad
+        if args.gradient_checkpointing:
+            noisy_model_input.requires_grad_(True)
+            for t in context:
+                t.requires_grad_(True)
+            if image_latents is not None:
+                image_latents.requires_grad_(True)
+            if clip_fea is not None:
+                clip_fea.requires_grad_(True)
+
+        # call DiT
+        # seq_len is the latent volume (frames * height * width) divided by the patch volume.
+        lat_f, lat_h, lat_w = latents.shape[2:5]
+        seq_len = lat_f * lat_h * lat_w // (self.config.patch_size[0] * self.config.patch_size[1] * self.config.patch_size[2])
+        latents = latents.to(device=accelerator.device, dtype=network_dtype)
+        noisy_model_input = noisy_model_input.to(device=accelerator.device, dtype=network_dtype)
+        with accelerator.autocast():
+            model_pred = model(noisy_model_input, t=timesteps, context=context, clip_fea=clip_fea, seq_len=seq_len, y=image_latents)
+        model_pred = torch.stack(model_pred, dim=0)  # list to tensor
+
+        # flow matching loss
+        target = noise - latents
+
+        return model_pred, target
+
+    # endregion model specific
+
+
+def wan_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    """Register the Wan2.1-specific command-line options on ``parser`` and return it."""
+    # (flag, add_argument keyword arguments), registered in this order.
+    option_specs = (
+        ("--task", dict(type=str, default="t2v-14B", choices=[*WAN_CONFIGS.keys()], help="The task to run.")),
+        ("--fp8_scaled", dict(action="store_true", help="use scaled fp8 for DiT / DiTにスケーリングされたfp8を使う")),
+        ("--t5", dict(type=str, default=None, help="text encoder (T5) checkpoint path")),
+        ("--fp8_t5", dict(action="store_true", help="use fp8 for Text Encoder model")),
+        (
+            "--clip",
+            dict(
+                type=str,
+                default=None,
+                help="text encoder (CLIP) checkpoint path, optional. If training I2V model, this is required",
+            ),
+        ),
+        ("--vae_cache_cpu", dict(action="store_true", help="cache features in VAE on CPU")),
+    )
+    for flag, spec in option_specs:
+        parser.add_argument(flag, **spec)
+    return parser
+
+
+if __name__ == "__main__":
+    # Common training options plus the Wan2.1-specific ones.
+    parser = wan_setup_parser(setup_parser_common())
+
+    # Parse the command line, then pass the result through read_config_from_file.
+    args = read_config_from_file(parser.parse_args(), parser)
+
+    # The DiT dtype is detected from the checkpoint later; the VAE falls back
+    # to bfloat16 when no dtype was given.
+    args.dit_dtype = None  # automatically detected
+    if args.vae_dtype is None:
+        args.vae_dtype = "bfloat16"  # make bfloat16 as default for VAE
+
+    trainer = WanNetworkTrainer()
+    trainer.train(args)
\ No newline at end of file