rocketmandrey committed on
Commit 132fb5e · 1 Parent(s): f0b6296

Initial Space setup with MeiGen MultiTalk demo

Files changed (4):
  1. .DS_Store +0 -0
  2. README.md +54 -7
  3. app.py +140 -0
  4. requirements.txt +20 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
README.md CHANGED
@@ -1,14 +1,61 @@
  ---
- title: Phunter Space
- emoji: 🏆
+ title: MeiGen MultiTalk Demo
+ emoji: 🎬
  colorFrom: blue
- colorTo: green
+ colorTo: red
  sdk: gradio
- sdk_version: 5.34.2
+ sdk_version: 4.19.2
  app_file: app.py
  pinned: false
- license: mit
- short_description: phunter_space
+ license: apache-2.0
+ hf_oauth: true
+ models:
+ - MeiGen-AI/MeiGen-MultiTalk
+ - TencentGameMate/chinese-wav2vec2-base
+ tags:
+ - audio
+ - video
+ - image
+ - text-to-video
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # MeiGen-MultiTalk
+
+ Audio-driven multi-person conversational video generation, based on [MeiGen-AI/MeiGen-MultiTalk](https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk).
+
+ ## Features
+
+ - 💬 Realistic Conversations - supports single- and multi-person generation
+ - 👥 Interactive Character Control - direct virtual humans via text prompts
+ - 🎤 Strong Generalization - supports cartoon characters and singing
+ - 📺 Resolution Flexibility - 480p and 720p output at arbitrary aspect ratios
+ - ⏱️ Long Video Generation - supports videos up to 15 seconds
+
+ ## Setup
+
+ 1. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 2. Download the required models:
+ ```bash
+ huggingface-cli download MeiGen-AI/MeiGen-MultiTalk --local-dir ./weights/MeiGen-MultiTalk
+ huggingface-cli download TencentGameMate/chinese-wav2vec2-base --local-dir ./weights/chinese-wav2vec2-base
+ ```
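
For convenience, the same weights can also be fetched from Python with `huggingface_hub.snapshot_download` (the `huggingface-hub` package is already in requirements.txt). A minimal sketch, assuming the same `./weights` layout as the CLI commands above:

```python
# Sketch: fetch both model snapshots from Python instead of the CLI.
# The ./weights layout mirrors the huggingface-cli commands above.
from huggingface_hub import snapshot_download

for repo_id, local_dir in [
    ("MeiGen-AI/MeiGen-MultiTalk", "./weights/MeiGen-MultiTalk"),
    ("TencentGameMate/chinese-wav2vec2-base", "./weights/chinese-wav2vec2-base"),
]:
    snapshot_download(repo_id=repo_id, local_dir=local_dir)
```
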
+
+ ## Usage
+
+ See the examples directory for sample configurations:
+ - `examples/single_example.json` - single-person video generation
+ - `examples/multi_example.json` - multi-person conversation generation
+
+ ## License
+
+ This project is licensed under the Apache License 2.0 - see the LICENSE file for details.
+
+ ## Configuration Options
+
+ - `image`: Path to the reference image
+ - `audio`: Path to the audio file(s)
+ - `prompt`: Text description of the desired video
+ - `resolution`: Output resolution, `480p` or `720p`
+ - `audio_cfg`: Audio guidance scale
+ - `cfg`: Classifier-free guidance scale
+ - `seed`: Random seed
+ - `max_duration`: Maximum video length in seconds
app.py ADDED
@@ -0,0 +1,140 @@
import os
import json
import shutil
import tempfile

import gradio as gr
import torch
from PIL import Image
from huggingface_hub import hf_hub_download

# Constants
MODEL_ID = "MeiGen-AI/MeiGen-MultiTalk"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def load_models():
    """Load required models (placeholder until the weights are wired up)"""
    pass


def process_video(
    image,
    audio_files,
    prompt,
    resolution="480p",
    audio_cfg=4.0,
    cfg=7.5,
    seed=42,
    max_duration=15
):
    """Stage the inputs and build the generation config.

    Actual video generation is not implemented yet, so the function
    returns None (an empty video output) after writing config.json.
    """
    try:
        # Create a temporary directory for processing
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save the uploaded reference image (a PIL Image from gr.Image)
            image_path = os.path.join(temp_dir, "reference.jpg")
            image.save(image_path)

            # Copy the uploaded audio files; gr.File(type="filepath")
            # passes a list of local file paths
            audio_paths = []
            for i, audio in enumerate(audio_files or []):
                audio_path = os.path.join(temp_dir, f"audio_{i}.wav")
                shutil.copy(audio, audio_path)
                audio_paths.append(audio_path)

            # Create the generation configuration
            config = {
                "image": image_path,
                "audio": audio_paths[0] if len(audio_paths) == 1 else audio_paths,
                "prompt": prompt,
                "resolution": resolution,
                "audio_cfg": float(audio_cfg),
                "cfg": float(cfg),
                "seed": int(seed),
                "max_duration": int(max_duration)
            }

            # Save the configuration
            config_path = os.path.join(temp_dir, "config.json")
            with open(config_path, "w") as f:
                json.dump(config, f, indent=2)

            # Video generation logic will go here; no video to return yet
            return None

    except Exception as e:
        # Surface the failure in the UI as an error toast
        raise gr.Error(f"Error: {e}")


# Create the Gradio interface
with gr.Blocks(title="MeiGen-MultiTalk Demo") as demo:
    gr.Markdown("""
    # MeiGen-MultiTalk Demo
    Generate talking-head videos from a reference image and audio files.
    """)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="Reference Image", type="pil")
            # gr.Audio cannot accept multiple uploads, so use gr.File
            audio_input = gr.File(
                label="Audio File(s)",
                file_count="multiple",
                file_types=["audio"],
                type="filepath"
            )
            prompt_input = gr.Textbox(label="Prompt", placeholder="Describe the desired video...")

            with gr.Row():
                resolution_input = gr.Dropdown(
                    choices=["480p", "720p"],
                    value="480p",
                    label="Resolution"
                )
                audio_cfg_input = gr.Slider(
                    minimum=1.0,
                    maximum=10.0,
                    value=4.0,
                    step=0.1,
                    label="Audio CFG"
                )

            with gr.Row():
                cfg_input = gr.Slider(
                    minimum=1.0,
                    maximum=15.0,
                    value=7.5,
                    step=0.1,
                    label="Guidance Scale"
                )
                seed_input = gr.Number(
                    value=42,
                    label="Random Seed",
                    precision=0
                )

            max_duration_input = gr.Slider(
                minimum=1,
                maximum=15,
                value=10,
                step=1,
                label="Max Duration (seconds)"
            )

            generate_btn = gr.Button("Generate Video")

        with gr.Column():
            output = gr.Video(label="Generated Video")

    generate_btn.click(
        fn=process_video,
        inputs=[
            image_input,
            audio_input,
            prompt_input,
            resolution_input,
            audio_cfg_input,
            cfg_input,
            seed_input,
            max_duration_input
        ],
        outputs=output
    )

# Launch locally if running directly
if __name__ == "__main__":
    demo.launch()
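
Since generation is still stubbed out, the plumbing can be sanity-checked by calling `process_video` directly, bypassing the UI. A minimal sketch; the gray image and silent WAV are placeholder inputs generated on the spot (`numpy` and `soundfile` are already in requirements.txt):

```python
# Smoke test for process_video without launching the Gradio UI.
import numpy as np
import soundfile as sf
from PIL import Image

from app import process_video

# Placeholder inputs: a gray image and one second of silence at 16 kHz
img = Image.new("RGB", (512, 512), "gray")
sf.write("smoke_audio.wav", np.zeros(16000, dtype="float32"), 16000)

result = process_video(img, ["smoke_audio.wav"], prompt="A person speaking")
print(result)  # None until generation is implemented
```
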
requirements.txt ADDED
@@ -0,0 +1,20 @@
torch>=2.0.0
torchvision
torchaudio
transformers>=4.30.0
diffusers
accelerate
safetensors
opencv-python
numpy
scipy
tqdm
einops
omegaconf
huggingface-hub
moviepy
soundfile
librosa
gradio>=4.0.0
python-dotenv
pillow