import os
import gradio as gr
import shutil
from inference import tango
import numpy as np
SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))

examples_audio = [
    ["./datasets/cached_audio/example_male_voice_9_seconds.wav"],
    # ["./datasets/cached_audio/example_female_voice_9_seconds.wav"],
]

examples_video = [
    # ["./datasets/cached_audio/speaker8_jjRWaMCWs44_00-00-30.16_00-00-33.32.mp4"],
    # ["./datasets/cached_audio/speaker7_iuYlGRnC7J8_00-00-0.00_00-00-3.25.mp4"],
    ["./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4"],
    # ["./datasets/cached_audio/1wrQ6Msp7wM_00-00-39.69_00-00-45.68.mp4"],
    # ["./datasets/cached_audio/101099-00_18_09-00_18_19.mp4"],
]
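
# Each row below maps onto [audio_input, video_input, seed_input] in the
# cached gr.Examples block near the bottom of the interface.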
combined_examples = [
    ["./datasets/cached_audio/example_female_voice_9_seconds.wav", "./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4", 2024],
]


def tango_wrapper(audio_path, character_name, seed=2024, create_graph=False, video_folder_path=None):
    # Gradio can deliver uploaded audio as a (sample_rate, waveform) tuple with
    # int16 PCM samples; normalize to float32 in [-1.0, 1.0) before inference.
    if isinstance(audio_path, tuple):
        sample_rate, audio_waveform = audio_path
        if audio_waveform.dtype != np.float32:
            audio_waveform = audio_waveform.astype(np.float32) / 32768.0
        audio_path = (sample_rate, audio_waveform)
    return tango(audio_path, character_name, seed=seed, create_graph=create_graph, video_folder_path=video_folder_path)
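

# A minimal sketch (illustration only; never called by the app, and the helper
# name is hypothetical) of the int16 -> float32 conversion above: int16 PCM
# spans [-32768, 32767], so dividing by 32768.0 rescales samples into [-1.0, 1.0).
def _int16_to_float32_sketch():
    int16_wave = np.array([-32768, 0, 16384, 32767], dtype=np.int16)
    return int16_wave.astype(np.float32) / 32768.0  # [-1.0, 0.0, 0.5, ~0.99997]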


def make_demo():
    with gr.Blocks(analytics_enabled=False) as Interface:
        gr.Markdown(
            """
            <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
                <div>
                    <h1>TANGO</h1>
                    <span>Generating full-body talking videos from audio and reference video</span>
                    <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
                        <a href='https://h-liu1997.github.io/'>Haiyang Liu</a>, \
                        <a href='https://yangxingchao.github.io/'>Xingchao Yang</a>, \
                        <a href=''>Tomoya Akiyama</a>, \
                        <a href='https://sky24h.github.io/'>Yuantian Huang</a>, \
                        <a href=''>Qiaoge Li</a>, \
                        <a href='https://www.tut.ac.jp/english/university/faculty/cs/164.html'>Shigeru Kuriyama</a>, \
                        <a href='https://taketomitakafumi.sakura.ne.jp/web/en/'>Takafumi Taketomi</a>\
                    </h2>
                    <br>
                    <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
                        <a href="https://arxiv.org/abs/2410.04221"><img src="https://img.shields.io/badge/arXiv-2410.04221-blue"></a>
                        <a href="https://pantomatrix.github.io/TANGO/"><img src="https://img.shields.io/badge/Project_Page-TANGO-orange" alt="Project Page"></a>
                        <a href="https://github.com/CyberAgentAILab/TANGO"><img src="https://img.shields.io/badge/Github-Code-green"></a>
                        <a href="https://github.com/CyberAgentAILab/TANGO"><img src="https://img.shields.io/github/stars/CyberAgentAILab/TANGO"></a>
                    </div>
                </div>
            </div>
            """
        )
        # Create a gallery with two rows of cached demo videos (ten in total)
        with gr.Row():
            gr.Video(value="./datasets/cached_audio/demo1.mp4", label="Demo 0")
            gr.Video(value="./datasets/cached_audio/demo2.mp4", label="Demo 1")
            gr.Video(value="./datasets/cached_audio/demo3.mp4", label="Demo 2")
            gr.Video(value="./datasets/cached_audio/demo4.mp4", label="Demo 3")
            gr.Video(value="./datasets/cached_audio/demo5.mp4", label="Demo 4")
        with gr.Row():
            gr.Video(value="./datasets/cached_audio/demo6.mp4", label="Demo 5")
            gr.Video(value="./datasets/cached_audio/demo0.mp4", label="Demo 6")
            gr.Video(value="./datasets/cached_audio/demo7.mp4", label="Demo 7")
            gr.Video(value="./datasets/cached_audio/demo8.mp4", label="Demo 8")
            gr.Video(value="./datasets/cached_audio/demo9.mp4", label="Demo 9")
        with gr.Row():
            gr.Markdown(
                """
                <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
                This open-source project runs locally and operates in low-quality mode; the results shown above were generated in high-quality mode.
                <br>
                News:
                <br>
                [10/15]: Added a watermark, fixed custom-character bugs by downgrading to Python 3.9, and fixed support for audio shorter than 4 s.
                </div>
                """
            )
        with gr.Row():
            with gr.Column(scale=4):
                video_output_1 = gr.Video(
                    label="Generated video - 1",
                    interactive=False,
                    autoplay=False,
                    loop=False,
                    show_share_button=True,
                )
            with gr.Column(scale=4):
                video_output_2 = gr.Video(
                    label="Generated video - 2",
                    interactive=False,
                    autoplay=False,
                    loop=False,
                    show_share_button=True,
                )
            with gr.Column(scale=1):
                file_output_1 = gr.File(label="Download 3D Motion and Visualize in Blender")
                file_output_2 = gr.File(label="Download 3D Motion and Visualize in Blender")
        gr.Markdown(
            """
            <div style="display: flex; justify-content: center; align-items: center; text-align: left;">
            Details of the low-quality mode:
            <br>
            1. Lower resolution: videos are resized to 512 px on the long side, keeping the aspect ratio.
            <br>
            2. A subgraph is used instead of the full graph, causing noticeable "frame jumps".
            <br>
            3. Only the first 8 s of the input audio are used.
            <br>
            4. Only the first 20 s of the input video are used for a custom character. With a custom character, only one video is generated, without "smoothing", to save time.
            <br>
            5. Open-source tools such as the SMPLerX-s-model, Wav2Lip, and FiLM are used for faster processing.
            <br>
            <br>
            Feel free to open an issue on GitHub or contact the authors if this does not meet your needs.
            </div>
            """
        )
        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(label="Upload your audio")
                seed_input = gr.Number(label="Seed", value=2024, interactive=True)
            with gr.Column(scale=2):
                gr.Examples(
                    examples=examples_audio,
                    inputs=[audio_input],
                    outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
                    label="Select existing Audio examples",
                    cache_examples=False,
                )
            with gr.Column(scale=1):
                video_input = gr.Video(
                    label="Default Character",
                    value="./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4",
                    interactive=False,
                    elem_classes="video",
                )
                gr.Markdown(
                    """
                    Custom character upload is not supported in Gradio 5.x (Python 3.10).
                    <br>
                    To use it, download the project to a local machine and set up a Python 3.9 environment for SMPLerX and mmcv.
                    """
                )
            with gr.Column(scale=2):
                gr.Markdown(
                    """
                    The character is fixed to the default one on the left.
                    """
                )
        # Fourth row: the Generate Video button
        with gr.Row():
            run_button = gr.Button("Generate Video")

        # Define button click behavior
        run_button.click(
            fn=tango_wrapper,
            inputs=[audio_input, video_input, seed_input],
            outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
        )
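        # tango() is expected to populate the four outputs above: two rendered
        # videos and two downloadable 3D-motion files.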
        with gr.Row():
            with gr.Column(scale=4):
                gr.Examples(
                    examples=combined_examples,
                    inputs=[audio_input, video_input, seed_input],  # audio, video, and seed as inputs
                    outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
                    fn=tango_wrapper,  # function that processes the combined inputs
                    label="Select Combined Audio and Video Examples (Cached)",
                    cache_examples=True,
                )
    return Interface


if __name__ == "__main__":
    # MASTER_ADDR/MASTER_PORT are the standard torch.distributed rendezvous
    # variables, presumably read by the inference code.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "8675"
    demo = make_demo()
    demo.launch(share=True)
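    # Alternatively, standard Gradio launch options allow binding to a fixed
    # host/port without creating a public share link, e.g.:
    # demo.launch(server_name="0.0.0.0", server_port=7860, share=False)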