Spaces:
Runtime error
Runtime error
kundaja-green
committed on
Commit
·
0d8b1b0
1
Parent(s):
ebb79f2
Implement persistent storage workflow to fix rate limiting
Browse files- Dockerfile +7 -31
- start.sh +39 -0
Dockerfile
CHANGED
@@ -14,36 +14,12 @@ COPY requirements.txt .
|
|
14 |
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
|
15 |
RUN pip install --no-cache-dir -r requirements.txt
|
16 |
|
17 |
-
#
|
18 |
-
# Download the official Wan2.1 models from their Hugging Face repository
|
19 |
-
# This downloads them into a "Models/Wan" folder inside the container
|
20 |
-
RUN huggingface-cli download wan-video/wan2.1 \
|
21 |
-
--repo-type model \
|
22 |
-
--include "*.pth" "*.json" "*.safetensors" \
|
23 |
-
--local-dir Models/Wan --local-dir-use-symlinks False
|
24 |
-
|
25 |
-
# Copy all your project files (code, dataset configs, etc.) into the container
|
26 |
COPY . .
|
27 |
|
28 |
-
#
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
"--clip", "Models/Wan/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", \
|
35 |
-
"--t5", "Models/Wan/models_t5_umt5-xxl-enc-bf16.pth", \
|
36 |
-
"--dataset_config", "dataset/testtoml.toml", \
|
37 |
-
"--output_dir", "/data/output", \
|
38 |
-
"--output_name", "My_HF_Lora_v1", \
|
39 |
-
"--save_every_n_epochs", "10", \
|
40 |
-
"--max_train_epochs", "70", \
|
41 |
-
"--network_module", "networks.lora_wan", \
|
42 |
-
"--network_dim", "32", \
|
43 |
-
"--network_alpha", "4", \
|
44 |
-
"--learning_rate", "2e-5", \
|
45 |
-
"--optimizer_type", "adamw", \
|
46 |
-
"--mixed_precision", "bf16", \
|
47 |
-
"--gradient_checkpointing", \
|
48 |
-
"--sdpa" \
|
49 |
-
]
|
|
|
14 |
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
|
15 |
RUN pip install --no-cache-dir -r requirements.txt
|
16 |
|
17 |
+
# Copy all project files, including the new start.sh script
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
COPY . .
|
19 |
|
20 |
+
# Make the startup script executable
|
21 |
+
RUN chmod +x start.sh
|
22 |
+
|
23 |
+
# The new command is just to run the script.
|
24 |
+
# The script itself handles downloading models and starting the training.
|
25 |
+
CMD ["./start.sh"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
start.sh
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Container entrypoint: ensure the Wan2.1 I2V model weights exist on the
# persistent /data volume (downloading them once, which avoids re-downloads
# and Hugging Face rate limiting on container restarts), then launch
# LoRA training via accelerate.
#
# Fail fast: abort on any command error, unset variable, or pipeline
# failure, so a broken download never falls through into training.
set -euo pipefail

# Target directory for the models on persistent storage.
MODEL_DIR="/data/Models/Wan"

# Key model file whose presence marks a completed download. Also reused
# below as the --dit checkpoint so the two paths cannot drift apart.
DIT_CKPT="$MODEL_DIR/wan2.1_i2v_720p_14B_fp8_e4m3fn.safetensors"

if [ -f "$DIT_CKPT" ]; then
  echo "Models already exist in persistent storage. Skipping download."
else
  echo "Models not found. Downloading to persistent storage..."
  # Download straight into /data so the weights survive restarts.
  # $MODEL_DIR is quoted (SC2086) so the path is safe even if it ever
  # contains spaces.
  huggingface-cli download Wan-AI/Wan2.1-I2V-14B-720P \
    --repo-type model \
    --include "*.pth" "*.json" "*.safetensors" \
    --local-dir "$MODEL_DIR" --local-dir-use-symlinks False
  echo "Download complete."
fi

# Run the actual training command using the models from persistent storage.
echo "Starting training..."
accelerate launch wan_train_network.py \
  --task "i2v-14B" \
  --dit "$DIT_CKPT" \
  --vae "$MODEL_DIR/Wan2.1_VAE.pth" \
  --clip "$MODEL_DIR/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
  --t5 "$MODEL_DIR/models_t5_umt5-xxl-enc-bf16.pth" \
  --dataset_config "dataset/testtoml.toml" \
  --output_dir "/data/output" \
  --output_name "My_HF_Lora_v1" \
  --save_every_n_epochs "10" \
  --max_train_epochs "70" \
  --network_module "networks.lora_wan" \
  --network_dim "32" \
  --network_alpha "4" \
  --learning_rate "2e-5" \
  --optimizer_type "adamw" \
  --mixed_precision "bf16" \
  --gradient_checkpointing \
  --sdpa