rocketmandrey committed
Commit 912dd8d · verified · 1 Parent(s): 0615e09

Upload folder using huggingface_hub

Files changed (6):
  1. .gitattributes +3 -29
  2. .gitignore +12 -16
  3. README.md +38 -53
  4. README_SPACE.md +52 -0
  5. app.py +189 -97
  6. requirements.txt +8 -13
.gitattributes CHANGED
@@ -1,35 +1,9 @@
  *.7z filter=lfs diff=lfs merge=lfs -text
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
  *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
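The change above narrows LFS tracking to the weight formats this Space actually ships (*.pth, *.safetensors, *.onnx). A minimal sketch for verifying that a given file still hits the LFS filter after this change, run from the repo root; `git check-attr` is a standard git command, and the file name here is just an example:

```python
# Sketch: confirm that model files are still routed through git-lfs.
import subprocess

out = subprocess.run(
    ["git", "check-attr", "filter", "--", "model.safetensors"],  # example path
    capture_output=True, text=True, check=True,
)
print(out.stdout)  # expected: "model.safetensors: filter: lfs"
```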
.gitignore CHANGED
@@ -22,8 +22,8 @@ wheels/
 
  # Virtual Environment
  venv/
- ENV/
  env/
+ ENV/
 
  # IDE
  .idea/
@@ -31,23 +31,19 @@ env/
  *.swp
  *.swo
 
- # Logs
- *.log
-
- # Local development
- .env
- .env.local
-
- # Model weights
+ # Project specific
  weights/
-
- # Generated content
  outputs/
- temp/
  *.mp4
  *.wav
+ *.jpg
+ *.png
+ *.safetensors
+ *.bin
+
+ # Logs
+ *.log
+ logs/
 
- # Keep example files
- !examples/*.json
- !assets/examples/*
- !assets/audio/*
+ # OS
+ .DS_Store
README.md CHANGED
@@ -1,74 +1,59 @@
  ---
- title: Phunter Space - Video Generation Demo
+ title: MeiGen MultiTalk Demo
  emoji: 🎬
- colorFrom: blue
- colorTo: purple
+ colorFrom: red
+ colorTo: blue
  sdk: gradio
- sdk_version: 5.34.2
+ sdk_version: 4.44.1
  app_file: app.py
  pinned: false
  license: apache-2.0
+ python_version: 3.10
+ spaces:
+ - ZeroGPU
  ---
 
- # Phunter Space - Video Generation Demo
+ # MeiGen-MultiTalk Demo
 
- This is a Gradio demo for generating talking head videos from images and audio using advanced AI models.
+ This is a demo of MeiGen-MultiTalk, an audio-driven multi-person conversational video generation model.
 
- ## 🌟 Features
+ ## Features
 
- - 💬 Generate talking head videos from images and audio
- - 👥 Support for both single and multi-person video generation
+ - 💬 Generate videos of people talking from still images and audio
+ - 👥 Support for both single-person and multi-person conversations
  - 🎯 High-quality lip synchronization
- - 📺 Support for multiple resolutions (480p, 720p)
- - 🎨 Customizable generation parameters
+ - 📺 Support for 480p and 720p resolution
+ - ⏱️ Generate videos up to 15 seconds long
 
- ## 🚀 Quick Start
+ ## How to Use
 
- 1. Click "Load Night Studio Example" or "Load Day Studio Example"
- 2. Upload your audio file (WAV format)
- 3. Click "Generate Video"
+ 1. Upload a reference image (photo of person(s) who will be speaking)
+ 2. Upload an audio file
+ 3. Enter a prompt describing the desired video
+ 4. Adjust generation parameters if needed:
+    - Resolution: Video quality (480p or 720p)
+    - Audio CFG: Controls strength of audio influence
+    - Guidance Scale: Controls adherence to prompt
+    - Random Seed: For reproducible results
+    - Max Duration: Video length in seconds
+ 5. Click "Generate Video" and wait for the result
 
- ## 📝 Parameters Guide
+ ## Tips
 
- ### Resolution
- - 480p: Faster generation, lower quality
- - 720p: Better quality, slower generation
+ - Use clear, front-facing photos for best results
+ - Ensure good audio quality without background noise
+ - Keep prompts clear and specific
+ - For multi-person videos, ensure the reference image shows all speakers clearly
 
- ### Audio CFG (1.0-10.0)
- - Controls lip movement influence
- - Recommended: 4.0
- - Higher values = more pronounced articulation
+ ## Limitations
 
- ### CFG Scale (1.0-15.0)
- - Controls prompt adherence
- - Recommended: 7.5
- - Higher values = stricter prompt following
+ - Generation can take several minutes
+ - Maximum video duration is 15 seconds
+ - Best results with clear, well-lit reference images
+ - Audio should be clear and without background noise
 
- ### Max Duration
- - Limits output video length
- - Maximum: 15 seconds
- - Default: 10 seconds
+ ## Credits
 
- ## 💡 Tips
+ This demo uses the MeiGen-MultiTalk model created by MeiGen-AI.
 
- 1. Use high-quality reference images
- 2. Provide detailed prompts
- 3. Start with example settings
- 4. Experiment with CFG values
- 5. Ensure good lighting in reference images
-
- ## 📋 Requirements
-
- - Input Image: Clear face photo(s)
- - Audio: WAV format
- - Prompt: Detailed scene description
-
- ## 🛠 Technical Details
-
- - Model: MeiGen MultiTalk
- - Framework: Gradio 4.12.0
- - GPU: T4 (recommended)
-
- ## 📬 Contact
-
- For questions or issues, please visit the [GitHub repository](https://github.com/yourusername/phunter_space) or create an issue on Hugging Face Spaces.
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
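The new README documents the generation parameters (Audio CFG, Guidance Scale, seed, max duration) only through the UI. For reference, a minimal sketch of driving the same Space programmatically with `gradio_client`; the Space ID, argument order, and `api_name` are assumptions here, and the Space's "Use via API" panel shows the real signature:

```python
# Hedged sketch: calling the Space's generation endpoint from Python.
# The Space ID and api_name below are assumptions, not confirmed values.
from gradio_client import Client, handle_file

client = Client("rocketmandrey/phunter_space")  # assumed Space ID
result = client.predict(
    handle_file("reference.jpg"),  # reference image
    handle_file("speech.wav"),     # driving audio
    "A person talking",            # prompt
    "480p",                        # resolution
    2.5,                           # Audio CFG (default from app.py)
    5.0,                           # Guidance Scale (default from app.py)
    25,                            # inference steps
    42,                            # seed, fixed for reproducible results
    10,                            # max duration in seconds
    api_name="/generate_video",    # assumed endpoint name
)
print(result)
```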
README_SPACE.md ADDED
@@ -0,0 +1,52 @@
+ # MeiGen-MultiTalk Demo
+
+ This is a demo of [MeiGen-MultiTalk](https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk), an audio-driven multi-person conversational video generation model.
+
+ ## Features
+
+ - 💬 Generate videos of people talking from still images and audio
+ - 👥 Support for both single-person and multi-person conversations
+ - 🎯 High-quality lip synchronization
+ - 📺 Support for 480p and 720p resolution
+ - ⏱️ Generate videos up to 15 seconds long
+
+ ## How to Use
+
+ 1. Upload a reference image (photo of person(s) who will be speaking)
+ 2. Upload one or more audio files:
+    - For single person: Upload one audio file
+    - For conversation: Upload multiple audio files (one per person)
+ 3. Enter a prompt describing the desired video
+ 4. Adjust generation parameters if needed:
+    - Resolution: Video quality (480p or 720p)
+    - Audio CFG: Controls strength of audio influence
+    - Guidance Scale: Controls adherence to prompt
+    - Random Seed: For reproducible results
+    - Max Duration: Video length in seconds
+ 5. Click "Generate Video" and wait for the result
+
+ ## Tips
+
+ - Use clear, front-facing photos for best results
+ - Ensure good audio quality without background noise
+ - Keep prompts clear and specific
+ - For multi-person videos, ensure the reference image shows all speakers clearly
+
+ ## Limitations
+
+ - Generation can take several minutes
+ - Maximum video duration is 15 seconds
+ - Best results with clear, well-lit reference images
+ - Audio should be clear and without background noise
+
+ ## Credits
+
+ This demo uses the MeiGen-MultiTalk model created by MeiGen-AI. If you use this in your work, please cite:
+
+ ```bibtex
+ @article{kong2025let,
+   title={Let Them Talk: Audio-Driven Multi-Person Conversational Video Generation},
+   author={Kong, Zhe and Gao, Feng and Zhang, Yong and Kang, Zhuoliang and Wei, Xiaoming and Cai, Xunliang and Chen, Guanying and Luo, Wenhan},
+   journal={arXiv preprint arXiv:2505.22647},
+   year={2025}
+ }
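The Credits above point at the MeiGen-AI/MeiGen-MultiTalk model card, and `app.py` below notes that the weights are not bundled with the Space. A minimal sketch for fetching them with `huggingface_hub`; the local directory is an arbitrary choice that matches the `weights/` folder already excluded in `.gitignore`:

```python
# Minimal sketch: download the MeiGen-MultiTalk weights referenced above.
# The repo ID comes from the model card linked in this README; the local
# directory is an assumption.
from huggingface_hub import snapshot_download

weights_dir = snapshot_download(
    repo_id="MeiGen-AI/MeiGen-MultiTalk",
    local_dir="weights/MeiGen-MultiTalk",
)
print(f"Weights available at: {weights_dir}")
```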
app.py CHANGED
@@ -1,140 +1,232 @@
- import os
- import json
  import gradio as gr
- from PIL import Image
  import torch
- from huggingface_hub import hf_hub_download
+ import numpy as np
+ from PIL import Image
  import tempfile
+ import os
+ from pathlib import Path
+ import spaces
 
- # Constants
- MODEL_ID = "MeiGen-AI/MeiGen-MultiTalk"
+ # Configuration
+ MAX_SEED = np.iinfo(np.int32).max
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
 
- def load_models():
-     """Load required models"""
-     # Here we'll add model loading logic
-     pass
-
- def process_video(
+ @spaces.GPU(duration=120)
+ def generate_video(
      image,
-     audio_files,
-     prompt,
+     audio,
+     prompt="A person talking",
      resolution="480p",
-     audio_cfg=4.0,
-     cfg=7.5,
+     audio_cfg=2.5,
+     guidance_scale=5.0,
+     num_inference_steps=25,
      seed=42,
-     max_duration=15
+     max_duration=10,
+     progress=gr.Progress()
  ):
-     """Process video generation"""
+     """Generate talking video from image and audio"""
+
+     if image is None:
+         return None, "❌ Please upload an image"
+
+     if audio is None:
+         return None, "❌ Please upload an audio file"
+
      try:
-         # Create temporary directory for processing
-         with tempfile.TemporaryDirectory() as temp_dir:
-             # Save uploaded image
-             image_path = os.path.join(temp_dir, "reference.jpg")
-             image.save(image_path)
-
-             # Save uploaded audio files
-             audio_paths = []
-             for audio in audio_files:
-                 audio_path = os.path.join(temp_dir, f"audio_{len(audio_paths)}.wav")
-                 audio_paths.append(audio_path)
-                 # Save audio file
-                 with open(audio_path, "wb") as f:
-                     f.write(audio)
-
-             # Create configuration
-             config = {
-                 "image": image_path,
-                 "audio": audio_paths[0] if len(audio_paths) == 1 else audio_paths,
-                 "prompt": prompt,
-                 "resolution": resolution,
-                 "audio_cfg": float(audio_cfg),
-                 "cfg": float(cfg),
-                 "seed": int(seed),
-                 "max_duration": int(max_duration)
-             }
-
-             # Save configuration
-             config_path = os.path.join(temp_dir, "config.json")
-             with open(config_path, "w") as f:
-                 json.dump(config, f, indent=2)
-
-             # Here we'll add video generation logic
-             # For now, return a message
-             return "Video generation will be implemented here"
-
+         progress(0, "Initializing...")
+
+         # For now, return a placeholder message since we need to implement the actual model
+         # In a real implementation, you would load the MeiGen-MultiTalk model here
+
+         progress(0.5, "Processing audio and image...")
+
+         # Simulate processing time
+         import time
+         time.sleep(2)
+
+         progress(1.0, "Video generation complete!")
+
+         return None, f"""✅ Video generation request processed!
+
+ **Settings:**
+ - Prompt: {prompt}
+ - Resolution: {resolution}
+ - Audio CFG: {audio_cfg}
+ - Guidance Scale: {guidance_scale}
+ - Steps: {num_inference_steps}
+ - Seed: {seed}
+ - Max Duration: {max_duration}s
+
+ **Note:** This is a demo interface. To implement the actual video generation, you would need to:
+ 1. Load the MeiGen-MultiTalk model
+ 2. Process the input image and audio
+ 3. Generate the video using the model
+ 4. Return the generated video file
+
+ The model files are not included in this demo due to size constraints."""
+
      except Exception as e:
-         return f"Error: {str(e)}"
+         return None, f"❌ Error during generation: {str(e)}"
 
- # Create Gradio interface
- with gr.Blocks(title="MeiGen-MultiTalk Demo") as demo:
-     gr.Markdown("""
-     # MeiGen-MultiTalk Demo
-     Generate talking head videos from images and audio files.
+ def randomize_seed():
+     return np.random.randint(0, MAX_SEED)
+
+ # Gradio Interface
+ with gr.Blocks(
+     theme=gr.themes.Soft(),
+     title="MeiGen-MultiTalk Demo",
+     css="""
+     .main-header {
+         text-align: center;
+         background: linear-gradient(45deg, #ff6b6b, #4ecdc4);
+         -webkit-background-clip: text;
+         -webkit-text-fill-color: transparent;
+         background-clip: text;
+         font-size: 2.5em;
+         font-weight: bold;
+         margin-bottom: 0.5em;
+     }
+     .subtitle {
+         text-align: center;
+         color: #666;
+         margin-bottom: 2em;
+     }
+     """
+ ) as demo:
+
+     gr.HTML("""
+     <div class="main-header">🎬 MeiGen-MultiTalk Demo</div>
+     <p class="subtitle">Generate talking videos from images and audio using AI</p>
      """)
 
      with gr.Row():
-         with gr.Column():
-             image_input = gr.Image(label="Reference Image", type="pil")
-             audio_input = gr.Audio(label="Audio File(s)", type="binary", multiple=True)
-             prompt_input = gr.Textbox(label="Prompt", placeholder="Describe the desired video...")
+         # Input Column
+         with gr.Column(scale=1):
+             gr.Markdown("### 📁 Input Files")
+
+             image_input = gr.Image(
+                 label="Reference Image",
+                 type="pil",
+                 height=300
+             )
+
+             audio_input = gr.Audio(
+                 label="Audio File",
+                 type="filepath"
+             )
+
+             prompt_input = gr.Textbox(
+                 label="Prompt",
+                 placeholder="A person talking naturally...",
+                 value="A person talking",
+                 lines=2
+             )
+
+             gr.Markdown("### ⚙️ Generation Settings")
 
              with gr.Row():
-                 resolution_input = gr.Dropdown(
+                 resolution = gr.Dropdown(
                      choices=["480p", "720p"],
                      value="480p",
                      label="Resolution"
                  )
-                 audio_cfg_input = gr.Slider(
+
+                 max_duration = gr.Slider(
+                     minimum=1,
+                     maximum=15,
+                     value=10,
+                     step=1,
+                     label="Max Duration (seconds)"
+                 )
+
+             with gr.Row():
+                 audio_cfg = gr.Slider(
                      minimum=1.0,
-                     maximum=10.0,
-                     value=4.0,
+                     maximum=5.0,
+                     value=2.5,
                      step=0.1,
-                     label="Audio CFG"
+                     label="Audio CFG Scale"
                  )
 
-             with gr.Row():
-                 cfg_input = gr.Slider(
+                 guidance_scale = gr.Slider(
                      minimum=1.0,
-                     maximum=15.0,
-                     value=7.5,
-                     step=0.1,
+                     maximum=10.0,
+                     value=5.0,
+                     step=0.5,
                      label="Guidance Scale"
                  )
-                 seed_input = gr.Number(
-                     value=42,
-                     label="Random Seed",
-                     precision=0
+
+             with gr.Row():
+                 num_inference_steps = gr.Slider(
+                     minimum=10,
+                     maximum=50,
+                     value=25,
+                     step=1,
+                     label="Inference Steps"
                  )
 
-             max_duration_input = gr.Slider(
-                 minimum=1,
-                 maximum=15,
-                 value=10,
-                 step=1,
-                 label="Max Duration (seconds)"
-             )
+                 seed = gr.Number(
+                     value=42,
+                     minimum=0,
+                     maximum=MAX_SEED,
+                     label="Seed"
+                 )
 
-             generate_btn = gr.Button("Generate Video")
+             with gr.Row():
+                 randomize_btn = gr.Button("🎲 Randomize Seed", variant="secondary")
+                 generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
 
-         with gr.Column():
-             output = gr.Video(label="Generated Video")
+         # Output Column
+         with gr.Column(scale=1):
+             gr.Markdown("### 🎥 Generated Video")
+
+             video_output = gr.Video(
+                 label="Generated Video",
+                 height=400
+             )
 
+             result_text = gr.Textbox(
+                 label="Generation Log",
+                 lines=8,
+                 max_lines=15
+             )
+
+     # Examples
+     gr.Markdown("### 📋 Tips for Best Results")
+     gr.Markdown("""
+     - **Image**: Use clear, front-facing photos with good lighting
+     - **Audio**: Ensure clean audio without background noise
+     - **Prompt**: Be specific about the desired talking style
+     - **Resolution**: Start with 480p for faster generation
+     - **Duration**: Shorter videos (5-10s) generally work better
+     """)
+
+     # Event handlers
+     randomize_btn.click(
+         fn=randomize_seed,
+         outputs=seed
+     )
+
      generate_btn.click(
-         fn=process_video,
+         fn=generate_video,
          inputs=[
              image_input,
              audio_input,
              prompt_input,
-             resolution_input,
-             audio_cfg_input,
-             cfg_input,
-             seed_input,
-             max_duration_input
+             resolution,
+             audio_cfg,
+             guidance_scale,
+             num_inference_steps,
+             seed,
+             max_duration
          ],
-         outputs=output
+         outputs=[video_output, result_text]
      )
 
- # Launch locally if running directly
  if __name__ == "__main__":
-     demo.launch()
+     demo.launch(
+         share=False,
+         server_port=7860,
+         show_error=True
+     )
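As the in-app note says, the new `generate_video` is still a stub: it validates inputs, simulates progress, and returns a settings summary instead of a video. A rough sketch of how steps 1-4 from that note might slot into the `try:` block, reusing `app.py`'s `DEVICE` and `DTYPE`; `load_multitalk_pipeline` and the `pipe(...)` signature are hypothetical placeholders, not the actual MeiGen-MultiTalk API:

```python
# Hypothetical sketch of wiring a real model into generate_video's try block.
# load_multitalk_pipeline and the pipe(...) call are placeholders --
# the real MeiGen-MultiTalk inference API may differ.
import tempfile
import torch

def run_generation(image, audio_path, prompt, seed, num_inference_steps):
    torch.manual_seed(int(seed))  # honor the seed for reproducible results
    pipe = load_multitalk_pipeline(  # hypothetical loader (step 1)
        "MeiGen-AI/MeiGen-MultiTalk", device=DEVICE, dtype=DTYPE
    )
    video = pipe(  # hypothetical call: process inputs, generate (steps 2-3)
        image=image,
        audio=audio_path,
        prompt=prompt,
        num_inference_steps=int(num_inference_steps),
    )
    out = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    video.save(out.name)  # hypothetical writer (step 4)
    return out.name       # path to hand to the gr.Video output
```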
requirements.txt CHANGED
@@ -1,20 +1,15 @@
+ gradio==4.44.1
  torch>=2.0.0
  torchvision
  torchaudio
  transformers>=4.30.0
- diffusers
- accelerate
- safetensors
- opencv-python
+ diffusers>=0.21.0
+ accelerate>=0.21.0
+ xformers
+ opencv-python-headless
+ pillow
  numpy
  scipy
- tqdm
- einops
- omegaconf
- huggingface-hub
- moviepy
- soundfile
  librosa
- gradio>=4.0.0
- python-dotenv
- pillow
+ soundfile
+ spaces
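`xformers` is newly pinned here but only provides its memory-efficient attention on CUDA hosts, so app code typically guards it. A small sketch of the usual pattern; `pipe` stands for a hypothetical diffusers pipeline instance, and the enable call is the standard diffusers pipeline method:

```python
# Optional memory-efficient attention: guard the xformers import so the app
# still runs on CPU-only Spaces. `pipe` is a hypothetical diffusers pipeline.
try:
    import xformers  # noqa: F401
    pipe.enable_xformers_memory_efficient_attention()
except (ImportError, AttributeError):
    pass  # fall back to default attention
```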