Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +3 -29
- .gitignore +12 -16
- README.md +38 -53
- README_SPACE.md +52 -0
- app.py +189 -97
- requirements.txt +8 -13
.gitattributes
CHANGED
@@ -1,35 +1,9 @@
|
|
1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.
|
35 |
-
|
|
|
|
1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
4 |
*.gz filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
6 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
@@ -22,8 +22,8 @@ wheels/
|
|
22 |
|
23 |
# Virtual Environment
|
24 |
venv/
|
25 |
-
ENV/
|
26 |
env/
|
|
|
27 |
|
28 |
# IDE
|
29 |
.idea/
|
@@ -31,23 +31,19 @@ env/
|
|
31 |
*.swp
|
32 |
*.swo
|
33 |
|
34 |
-
#
|
35 |
-
*.log
|
36 |
-
|
37 |
-
# Local development
|
38 |
-
.env
|
39 |
-
.env.local
|
40 |
-
|
41 |
-
# Model weights
|
42 |
weights/
|
43 |
-
|
44 |
-
# Generated content
|
45 |
outputs/
|
46 |
-
temp/
|
47 |
*.mp4
|
48 |
*.wav
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
-
#
|
51 |
-
|
52 |
-
!assets/examples/*
|
53 |
-
!assets/audio/*
|
|
|
22 |
|
23 |
# Virtual Environment
|
24 |
venv/
|
|
|
25 |
env/
|
26 |
+
ENV/
|
27 |
|
28 |
# IDE
|
29 |
.idea/
|
|
|
31 |
*.swp
|
32 |
*.swo
|
33 |
|
34 |
+
# Project specific
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
weights/
|
|
|
|
|
36 |
outputs/
|
|
|
37 |
*.mp4
|
38 |
*.wav
|
39 |
+
*.jpg
|
40 |
+
*.png
|
41 |
+
*.safetensors
|
42 |
+
*.bin
|
43 |
+
|
44 |
+
# Logs
|
45 |
+
*.log
|
46 |
+
logs/
|
47 |
|
48 |
+
# OS
|
49 |
+
.DS_Store
|
|
|
|
README.md
CHANGED
@@ -1,74 +1,59 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: π¬
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
|
|
|
|
|
|
11 |
---
|
12 |
|
13 |
-
#
|
14 |
|
15 |
-
This is a
|
16 |
|
17 |
-
##
|
18 |
|
19 |
-
- π¬ Generate
|
20 |
-
- π₯ Support for both single and multi-person
|
21 |
- π― High-quality lip synchronization
|
22 |
-
- πΊ Support for
|
23 |
-
-
|
24 |
|
25 |
-
##
|
26 |
|
27 |
-
1.
|
28 |
-
2. Upload
|
29 |
-
3.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
##
|
32 |
|
33 |
-
|
34 |
-
-
|
35 |
-
-
|
|
|
36 |
|
37 |
-
|
38 |
-
- Controls lip movement influence
|
39 |
-
- Recommended: 4.0
|
40 |
-
- Higher values = more pronounced articulation
|
41 |
|
42 |
-
|
43 |
-
-
|
44 |
-
-
|
45 |
-
-
|
46 |
|
47 |
-
|
48 |
-
- Limits output video length
|
49 |
-
- Maximum: 15 seconds
|
50 |
-
- Default: 10 seconds
|
51 |
|
52 |
-
|
53 |
|
54 |
-
|
55 |
-
2. Provide detailed prompts
|
56 |
-
3. Start with example settings
|
57 |
-
4. Experiment with CFG values
|
58 |
-
5. Ensure good lighting in reference images
|
59 |
-
|
60 |
-
## π Requirements
|
61 |
-
|
62 |
-
- Input Image: Clear face photo(s)
|
63 |
-
- Audio: WAV format
|
64 |
-
- Prompt: Detailed scene description
|
65 |
-
|
66 |
-
## π Technical Details
|
67 |
-
|
68 |
-
- Model: MeiGen MultiTalk
|
69 |
-
- Framework: Gradio 4.12.0
|
70 |
-
- GPU: T4 (recommended)
|
71 |
-
|
72 |
-
## π¬ Contact
|
73 |
-
|
74 |
-
For questions or issues, please visit the [GitHub repository](https://github.com/yourusername/phunter_space) or create an issue on Hugging Face Spaces.
|
|
|
1 |
---
|
2 |
+
title: MeiGen MultiTalk Demo
|
3 |
emoji: π¬
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: blue
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.44.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
+
python_version: 3.10
|
12 |
+
spaces:
|
13 |
+
- ZeroGPU
|
14 |
---
|
15 |
|
16 |
+
# MeiGen-MultiTalk Demo
|
17 |
|
18 |
+
This is a demo of MeiGen-MultiTalk, an audio-driven multi-person conversational video generation model.
|
19 |
|
20 |
+
## Features
|
21 |
|
22 |
+
- π¬ Generate videos of people talking from still images and audio
|
23 |
+
- π₯ Support for both single-person and multi-person conversations
|
24 |
- π― High-quality lip synchronization
|
25 |
+
- πΊ Support for 480p and 720p resolution
|
26 |
+
- β±οΈ Generate videos up to 15 seconds long
|
27 |
|
28 |
+
## How to Use
|
29 |
|
30 |
+
1. Upload a reference image (photo of person(s) who will be speaking)
|
31 |
+
2. Upload an audio file
|
32 |
+
3. Enter a prompt describing the desired video
|
33 |
+
4. Adjust generation parameters if needed:
|
34 |
+
- Resolution: Video quality (480p or 720p)
|
35 |
+
- Audio CFG: Controls strength of audio influence
|
36 |
+
- Guidance Scale: Controls adherence to prompt
|
37 |
+
- Random Seed: For reproducible results
|
38 |
+
- Max Duration: Video length in seconds
|
39 |
+
5. Click "Generate Video" and wait for the result
|
40 |
|
41 |
+
## Tips
|
42 |
|
43 |
+
- Use clear, front-facing photos for best results
|
44 |
+
- Ensure good audio quality without background noise
|
45 |
+
- Keep prompts clear and specific
|
46 |
+
- For multi-person videos, ensure the reference image shows all speakers clearly
|
47 |
|
48 |
+
## Limitations
|
|
|
|
|
|
|
49 |
|
50 |
+
- Generation can take several minutes
|
51 |
+
- Maximum video duration is 15 seconds
|
52 |
+
- Best results with clear, well-lit reference images
|
53 |
+
- Audio should be clear and without background noise
|
54 |
|
55 |
+
## Credits
|
|
|
|
|
|
|
56 |
|
57 |
+
This demo uses the MeiGen-MultiTalk model created by MeiGen-AI.
|
58 |
|
59 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README_SPACE.md
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MeiGen-MultiTalk Demo
|
2 |
+
|
3 |
+
This is a demo of [MeiGen-MultiTalk](https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk), an audio-driven multi-person conversational video generation model.
|
4 |
+
|
5 |
+
## Features
|
6 |
+
|
7 |
+
- π¬ Generate videos of people talking from still images and audio
|
8 |
+
- π₯ Support for both single-person and multi-person conversations
|
9 |
+
- π― High-quality lip synchronization
|
10 |
+
- πΊ Support for 480p and 720p resolution
|
11 |
+
- β±οΈ Generate videos up to 15 seconds long
|
12 |
+
|
13 |
+
## How to Use
|
14 |
+
|
15 |
+
1. Upload a reference image (photo of person(s) who will be speaking)
|
16 |
+
2. Upload one or more audio files:
|
17 |
+
- For single person: Upload one audio file
|
18 |
+
- For conversation: Upload multiple audio files (one per person)
|
19 |
+
3. Enter a prompt describing the desired video
|
20 |
+
4. Adjust generation parameters if needed:
|
21 |
+
- Resolution: Video quality (480p or 720p)
|
22 |
+
- Audio CFG: Controls strength of audio influence
|
23 |
+
- Guidance Scale: Controls adherence to prompt
|
24 |
+
- Random Seed: For reproducible results
|
25 |
+
- Max Duration: Video length in seconds
|
26 |
+
5. Click "Generate Video" and wait for the result
|
27 |
+
|
28 |
+
## Tips
|
29 |
+
|
30 |
+
- Use clear, front-facing photos for best results
|
31 |
+
- Ensure good audio quality without background noise
|
32 |
+
- Keep prompts clear and specific
|
33 |
+
- For multi-person videos, ensure the reference image shows all speakers clearly
|
34 |
+
|
35 |
+
## Limitations
|
36 |
+
|
37 |
+
- Generation can take several minutes
|
38 |
+
- Maximum video duration is 15 seconds
|
39 |
+
- Best results with clear, well-lit reference images
|
40 |
+
- Audio should be clear and without background noise
|
41 |
+
|
42 |
+
## Credits
|
43 |
+
|
44 |
+
This demo uses the MeiGen-MultiTalk model created by MeiGen-AI. If you use this in your work, please cite:
|
45 |
+
|
46 |
+
```bibtex
|
47 |
+
@article{kong2025let,
|
48 |
+
title={Let Them Talk: Audio-Driven Multi-Person Conversational Video Generation},
|
49 |
+
author={Kong, Zhe and Gao, Feng and Zhang, Yong and Kang, Zhuoliang and Wei, Xiaoming and Cai, Xunliang and Chen, Guanying and Luo, Wenhan},
|
50 |
+
journal={arXiv preprint arXiv:2505.22647},
|
51 |
+
year={2025}
|
52 |
+
}
|
app.py
CHANGED
@@ -1,140 +1,232 @@
|
|
1 |
-
import os
|
2 |
-
import json
|
3 |
import gradio as gr
|
4 |
-
from PIL import Image
|
5 |
import torch
|
6 |
-
|
|
|
7 |
import tempfile
|
|
|
|
|
|
|
8 |
|
9 |
-
#
|
10 |
-
|
11 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
# Here we'll add model loading logic
|
16 |
-
pass
|
17 |
-
|
18 |
-
def process_video(
|
19 |
image,
|
20 |
-
|
21 |
-
prompt,
|
22 |
resolution="480p",
|
23 |
-
audio_cfg=
|
24 |
-
|
|
|
25 |
seed=42,
|
26 |
-
max_duration=
|
|
|
27 |
):
|
28 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
try:
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
# Here we'll add video generation logic
|
63 |
-
# For now, return a message
|
64 |
-
return "Video generation will be implemented here"
|
65 |
-
|
66 |
except Exception as e:
|
67 |
-
return f"Error: {str(e)}"
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
""")
|
75 |
|
76 |
with gr.Row():
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
with gr.Row():
|
83 |
-
|
84 |
choices=["480p", "720p"],
|
85 |
value="480p",
|
86 |
label="Resolution"
|
87 |
)
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
minimum=1.0,
|
90 |
-
maximum=
|
91 |
-
value=
|
92 |
step=0.1,
|
93 |
-
label="Audio CFG"
|
94 |
)
|
95 |
|
96 |
-
|
97 |
-
cfg_input = gr.Slider(
|
98 |
minimum=1.0,
|
99 |
-
maximum=
|
100 |
-
value=
|
101 |
-
step=0.
|
102 |
label="Guidance Scale"
|
103 |
)
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
|
|
|
|
|
|
|
|
108 |
)
|
109 |
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
)
|
117 |
|
118 |
-
|
|
|
|
|
119 |
|
120 |
-
|
121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
generate_btn.click(
|
124 |
-
fn=
|
125 |
inputs=[
|
126 |
image_input,
|
127 |
audio_input,
|
128 |
prompt_input,
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
|
|
134 |
],
|
135 |
-
outputs=
|
136 |
)
|
137 |
|
138 |
-
# Launch locally if running directly
|
139 |
if __name__ == "__main__":
|
140 |
-
demo.launch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import torch
|
3 |
+
import numpy as np
|
4 |
+
from PIL import Image
|
5 |
import tempfile
|
6 |
+
import os
|
7 |
+
from pathlib import Path
|
8 |
+
import spaces
|
9 |
|
10 |
+
# Configuration
|
11 |
+
MAX_SEED = np.iinfo(np.int32).max
|
12 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
13 |
+
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
|
14 |
|
15 |
+
@spaces.GPU(duration=120)
|
16 |
+
def generate_video(
|
|
|
|
|
|
|
|
|
17 |
image,
|
18 |
+
audio,
|
19 |
+
prompt="A person talking",
|
20 |
resolution="480p",
|
21 |
+
audio_cfg=2.5,
|
22 |
+
guidance_scale=5.0,
|
23 |
+
num_inference_steps=25,
|
24 |
seed=42,
|
25 |
+
max_duration=10,
|
26 |
+
progress=gr.Progress()
|
27 |
):
|
28 |
+
"""Generate talking video from image and audio"""
|
29 |
+
|
30 |
+
if image is None:
|
31 |
+
return None, "β Please upload an image"
|
32 |
+
|
33 |
+
if audio is None:
|
34 |
+
return None, "β Please upload an audio file"
|
35 |
+
|
36 |
try:
|
37 |
+
progress(0, "Initializing...")
|
38 |
+
|
39 |
+
# For now, return a placeholder message since we need to implement the actual model
|
40 |
+
# In a real implementation, you would load the MeiGen-MultiTalk model here
|
41 |
+
|
42 |
+
progress(0.5, "Processing audio and image...")
|
43 |
+
|
44 |
+
# Simulate processing time
|
45 |
+
import time
|
46 |
+
time.sleep(2)
|
47 |
+
|
48 |
+
progress(1.0, "Video generation complete!")
|
49 |
+
|
50 |
+
return None, f"""β
Video generation request processed!
|
51 |
+
|
52 |
+
**Settings:**
|
53 |
+
- Prompt: {prompt}
|
54 |
+
- Resolution: {resolution}
|
55 |
+
- Audio CFG: {audio_cfg}
|
56 |
+
- Guidance Scale: {guidance_scale}
|
57 |
+
- Steps: {num_inference_steps}
|
58 |
+
- Seed: {seed}
|
59 |
+
- Max Duration: {max_duration}s
|
60 |
+
|
61 |
+
**Note:** This is a demo interface. To implement the actual video generation, you would need to:
|
62 |
+
1. Load the MeiGen-MultiTalk model
|
63 |
+
2. Process the input image and audio
|
64 |
+
3. Generate the video using the model
|
65 |
+
4. Return the generated video file
|
66 |
+
|
67 |
+
The model files are not included in this demo due to size constraints."""
|
68 |
+
|
|
|
|
|
|
|
|
|
69 |
except Exception as e:
|
70 |
+
return None, f"β Error during generation: {str(e)}"
|
71 |
|
72 |
+
def randomize_seed():
|
73 |
+
return np.random.randint(0, MAX_SEED)
|
74 |
+
|
75 |
+
# Gradio Interface
|
76 |
+
with gr.Blocks(
|
77 |
+
theme=gr.themes.Soft(),
|
78 |
+
title="MeiGen-MultiTalk Demo",
|
79 |
+
css="""
|
80 |
+
.main-header {
|
81 |
+
text-align: center;
|
82 |
+
background: linear-gradient(45deg, #ff6b6b, #4ecdc4);
|
83 |
+
-webkit-background-clip: text;
|
84 |
+
-webkit-text-fill-color: transparent;
|
85 |
+
background-clip: text;
|
86 |
+
font-size: 2.5em;
|
87 |
+
font-weight: bold;
|
88 |
+
margin-bottom: 0.5em;
|
89 |
+
}
|
90 |
+
.subtitle {
|
91 |
+
text-align: center;
|
92 |
+
color: #666;
|
93 |
+
margin-bottom: 2em;
|
94 |
+
}
|
95 |
+
"""
|
96 |
+
) as demo:
|
97 |
+
|
98 |
+
gr.HTML("""
|
99 |
+
<div class="main-header">π¬ MeiGen-MultiTalk Demo</div>
|
100 |
+
<p class="subtitle">Generate talking videos from images and audio using AI</p>
|
101 |
""")
|
102 |
|
103 |
with gr.Row():
|
104 |
+
# Input Column
|
105 |
+
with gr.Column(scale=1):
|
106 |
+
gr.Markdown("### π Input Files")
|
107 |
+
|
108 |
+
image_input = gr.Image(
|
109 |
+
label="Reference Image",
|
110 |
+
type="pil",
|
111 |
+
height=300
|
112 |
+
)
|
113 |
+
|
114 |
+
audio_input = gr.Audio(
|
115 |
+
label="Audio File",
|
116 |
+
type="filepath"
|
117 |
+
)
|
118 |
+
|
119 |
+
prompt_input = gr.Textbox(
|
120 |
+
label="Prompt",
|
121 |
+
placeholder="A person talking naturally...",
|
122 |
+
value="A person talking",
|
123 |
+
lines=2
|
124 |
+
)
|
125 |
+
|
126 |
+
gr.Markdown("### βοΈ Generation Settings")
|
127 |
|
128 |
with gr.Row():
|
129 |
+
resolution = gr.Dropdown(
|
130 |
choices=["480p", "720p"],
|
131 |
value="480p",
|
132 |
label="Resolution"
|
133 |
)
|
134 |
+
|
135 |
+
max_duration = gr.Slider(
|
136 |
+
minimum=1,
|
137 |
+
maximum=15,
|
138 |
+
value=10,
|
139 |
+
step=1,
|
140 |
+
label="Max Duration (seconds)"
|
141 |
+
)
|
142 |
+
|
143 |
+
with gr.Row():
|
144 |
+
audio_cfg = gr.Slider(
|
145 |
minimum=1.0,
|
146 |
+
maximum=5.0,
|
147 |
+
value=2.5,
|
148 |
step=0.1,
|
149 |
+
label="Audio CFG Scale"
|
150 |
)
|
151 |
|
152 |
+
guidance_scale = gr.Slider(
|
|
|
153 |
minimum=1.0,
|
154 |
+
maximum=10.0,
|
155 |
+
value=5.0,
|
156 |
+
step=0.5,
|
157 |
label="Guidance Scale"
|
158 |
)
|
159 |
+
|
160 |
+
with gr.Row():
|
161 |
+
num_inference_steps = gr.Slider(
|
162 |
+
minimum=10,
|
163 |
+
maximum=50,
|
164 |
+
value=25,
|
165 |
+
step=1,
|
166 |
+
label="Inference Steps"
|
167 |
)
|
168 |
|
169 |
+
seed = gr.Number(
|
170 |
+
value=42,
|
171 |
+
minimum=0,
|
172 |
+
maximum=MAX_SEED,
|
173 |
+
label="Seed"
|
174 |
+
)
|
|
|
175 |
|
176 |
+
with gr.Row():
|
177 |
+
randomize_btn = gr.Button("π² Randomize Seed", variant="secondary")
|
178 |
+
generate_btn = gr.Button("π¬ Generate Video", variant="primary", size="lg")
|
179 |
|
180 |
+
# Output Column
|
181 |
+
with gr.Column(scale=1):
|
182 |
+
gr.Markdown("### π₯ Generated Video")
|
183 |
+
|
184 |
+
video_output = gr.Video(
|
185 |
+
label="Generated Video",
|
186 |
+
height=400
|
187 |
+
)
|
188 |
|
189 |
+
result_text = gr.Textbox(
|
190 |
+
label="Generation Log",
|
191 |
+
lines=8,
|
192 |
+
max_lines=15
|
193 |
+
)
|
194 |
+
|
195 |
+
# Examples
|
196 |
+
gr.Markdown("### π Tips for Best Results")
|
197 |
+
gr.Markdown("""
|
198 |
+
- **Image**: Use clear, front-facing photos with good lighting
|
199 |
+
- **Audio**: Ensure clean audio without background noise
|
200 |
+
- **Prompt**: Be specific about the desired talking style
|
201 |
+
- **Resolution**: Start with 480p for faster generation
|
202 |
+
- **Duration**: Shorter videos (5-10s) generally work better
|
203 |
+
""")
|
204 |
+
|
205 |
+
# Event handlers
|
206 |
+
randomize_btn.click(
|
207 |
+
fn=randomize_seed,
|
208 |
+
outputs=seed
|
209 |
+
)
|
210 |
+
|
211 |
generate_btn.click(
|
212 |
+
fn=generate_video,
|
213 |
inputs=[
|
214 |
image_input,
|
215 |
audio_input,
|
216 |
prompt_input,
|
217 |
+
resolution,
|
218 |
+
audio_cfg,
|
219 |
+
guidance_scale,
|
220 |
+
num_inference_steps,
|
221 |
+
seed,
|
222 |
+
max_duration
|
223 |
],
|
224 |
+
outputs=[video_output, result_text]
|
225 |
)
|
226 |
|
|
|
227 |
if __name__ == "__main__":
|
228 |
+
demo.launch(
|
229 |
+
share=False,
|
230 |
+
server_port=7860,
|
231 |
+
show_error=True
|
232 |
+
)
|
requirements.txt
CHANGED
@@ -1,20 +1,15 @@
|
|
|
|
1 |
torch>=2.0.0
|
2 |
torchvision
|
3 |
torchaudio
|
4 |
transformers>=4.30.0
|
5 |
-
diffusers
|
6 |
-
accelerate
|
7 |
-
|
8 |
-
opencv-python
|
|
|
9 |
numpy
|
10 |
scipy
|
11 |
-
tqdm
|
12 |
-
einops
|
13 |
-
omegaconf
|
14 |
-
huggingface-hub
|
15 |
-
moviepy
|
16 |
-
soundfile
|
17 |
librosa
|
18 |
-
|
19 |
-
|
20 |
-
pillow
|
|
|
1 |
+
gradio==4.44.1
|
2 |
torch>=2.0.0
|
3 |
torchvision
|
4 |
torchaudio
|
5 |
transformers>=4.30.0
|
6 |
+
diffusers>=0.21.0
|
7 |
+
accelerate>=0.21.0
|
8 |
+
xformers
|
9 |
+
opencv-python-headless
|
10 |
+
pillow
|
11 |
numpy
|
12 |
scipy
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
librosa
|
14 |
+
soundfile
|
15 |
+
spaces
|
|