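"""Gradio demo for TANGO: generating full-body talking videos from audio and a reference video.

Paper: https://arxiv.org/abs/2410.04221
Project page: https://pantomatrix.github.io/TANGO/
"""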
import os

import gradio as gr
import numpy as np

from inference import tango

# Absolute directory of this script.
SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
examples_audio = [
["./datasets/cached_audio/example_male_voice_9_seconds.wav"],
# ["./datasets/cached_audio/example_female_voice_9_seconds.wav"],
]
examples_video = [
# ["./datasets/cached_audio/speaker8_jjRWaMCWs44_00-00-30.16_00-00-33.32.mp4"],
# ["./datasets/cached_audio/speaker7_iuYlGRnC7J8_00-00-0.00_00-00-3.25.mp4"],
["./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4"],
# ["./datasets/cached_audio/1wrQ6Msp7wM_00-00-39.69_00-00-45.68.mp4"],
# ["./datasets/cached_audio/101099-00_18_09-00_18_19.mp4"],
]
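# Each combined example is [audio_path, reference_video_path, seed].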
combined_examples = [
["./datasets/cached_audio/example_female_voice_9_seconds.wav", "./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4", 2024],
]

def tango_wrapper(audio_path, character_name, seed=2024, create_graph=False, video_folder_path=None):
    """Normalize the Gradio audio input, then run TANGO inference."""
    # Gradio can deliver uploaded/recorded audio as a (sample_rate, waveform) tuple.
    if isinstance(audio_path, tuple):
        sample_rate, audio_waveform = audio_path
        if audio_waveform.dtype != np.float32:
            # Assume 16-bit PCM samples; rescale to float32 in [-1.0, 1.0].
            audio_waveform = audio_waveform.astype(np.float32) / 32768.0
        audio_path = (sample_rate, audio_waveform)
    return tango(audio_path, character_name, seed=seed, create_graph=create_graph, video_folder_path=video_folder_path)
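
# Hypothetical invocation mirroring what the "Generate Video" button does; the
# returned values feed two video players and two motion-file download slots:
# tango_wrapper("./datasets/cached_audio/example_female_voice_9_seconds.wav",
#               "./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4",
#               seed=2024)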
def make_demo():
with gr.Blocks(analytics_enabled=False) as Interface:
gr.Markdown(
"""
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
<div>
<h1>TANGO</h1>
                <span>Generating full-body talking videos from audio and a reference video</span>
<h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
<a href='https://h-liu1997.github.io/'>Haiyang Liu</a>, \
<a href='https://yangxingchao.github.io/'>Xingchao Yang</a>, \
<a href=''>Tomoya Akiyama</a>, \
<a href='https://sky24h.github.io/'> Yuantian Huang</a>, \
<a href=''>Qiaoge Li</a>, \
<a href='https://www.tut.ac.jp/english/university/faculty/cs/164.html'>Shigeru Kuriyama</a>, \
<a href='https://taketomitakafumi.sakura.ne.jp/web/en/'>Takafumi Taketomi</a>\
</h2>
<br>
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
<a href="https://arxiv.org/abs/2410.04221"><img src="https://img.shields.io/badge/arXiv-2410.04221-blue"></a>
&nbsp;
<a href="https://pantomatrix.github.io/TANGO/"><img src="https://img.shields.io/badge/Project_Page-TANGO-orange" alt="Project Page"></a>
&nbsp;
<a href="https://github.com/CyberAgentAILab/TANGO"><img src="https://img.shields.io/badge/Github-Code-green"></a>
&nbsp;
<a href="https://github.com/CyberAgentAILab/TANGO"><img src="https://img.shields.io/github/stars/CyberAgentAILab/TANGO
"></a>
</div>
</div>
</div>
"""
)
        # Demo gallery: two rows of five cached result videos.
with gr.Row():
gr.Video(value="./datasets/cached_audio/demo1.mp4", label="Demo 0")
gr.Video(value="./datasets/cached_audio/demo2.mp4", label="Demo 1")
gr.Video(value="./datasets/cached_audio/demo3.mp4", label="Demo 2")
gr.Video(value="./datasets/cached_audio/demo4.mp4", label="Demo 3")
gr.Video(value="./datasets/cached_audio/demo5.mp4", label="Demo 4")
with gr.Row():
gr.Video(value="./datasets/cached_audio/demo6.mp4", label="Demo 5")
gr.Video(value="./datasets/cached_audio/demo0.mp4", label="Demo 6")
gr.Video(value="./datasets/cached_audio/demo7.mp4", label="Demo 7")
gr.Video(value="./datasets/cached_audio/demo8.mp4", label="Demo 8")
gr.Video(value="./datasets/cached_audio/demo9.mp4", label="Demo 9")
        with gr.Row():
            gr.Markdown(
                """
                <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
                This open-source project runs locally in low-quality mode. Some results generated in high-quality mode are shown above.
                <br>
                News:
                <br>
                [10/15]: Added a watermark, fixed custom-character bugs by downgrading to Python 3.9, and fixed support for audio shorter than 4 seconds.
                </div>
                """
            )
with gr.Row():
with gr.Column(scale=4):
video_output_1 = gr.Video(
label="Generated video - 1",
interactive=False,
autoplay=False,
loop=False,
show_share_button=True,
)
with gr.Column(scale=4):
video_output_2 = gr.Video(
label="Generated video - 2",
interactive=False,
autoplay=False,
loop=False,
show_share_button=True,
)
with gr.Column(scale=1):
file_output_1 = gr.File(label="Download 3D Motion and Visualize in Blender")
file_output_2 = gr.File(label="Download 3D Motion and Visualize in Blender")
gr.Markdown("""
<div style="display: flex; justify-content: center; align-items: center; text-align: left;">
Details of the low-quality mode:
<br>
1. lower resolution, video resized as long-side 512 and keep aspect ratio.
<br>
2. subgraph instead of full-graph, causing noticeable "frame jumps".
<br>
3. only use the first 8s of your input audio.
<br>
4. only use the first 20s of your input video for custom character. if you custom character, it will only generate one video result without "smoothing" for saving time.
<br>
5. use open-source tools like SMPLerX-s-model, Wav2Lip, and FiLM for faster processing.
<br>
<br>
Feel free to open an issue on GitHub or contact the authors if this does not meet your needs.
</div>
""")
with gr.Row():
with gr.Column(scale=1):
audio_input = gr.Audio(label="Upload your audio")
seed_input = gr.Number(label="Seed", value=2024, interactive=True)
with gr.Column(scale=2):
gr.Examples(
examples=examples_audio,
inputs=[audio_input],
outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
label="Select existing Audio examples",
cache_examples=False,
)
with gr.Column(scale=1):
video_input = gr.Video(label="Default Character", value="./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4", interactive=False, elem_classes="video")
                gr.Markdown(
                    """
                    Custom character upload is not supported with Gradio 5.x (Python 3.10).
                    <br>
                    To use it, run the project locally and set up a Python 3.9 environment for SMPLer-X and mmcv.
                    """
                )
with gr.Column(scale=2):
gr.Markdown(
"""
The character is fixed to the default one on the left.
"""
)
# Fourth row: Generate video button
with gr.Row():
run_button = gr.Button("Generate Video")
# Define button click behavior
run_button.click(
fn=tango_wrapper,
inputs=[audio_input, video_input, seed_input],
outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
)
with gr.Row():
with gr.Column(scale=4):
gr.Examples(
examples=combined_examples,
inputs=[audio_input, video_input, seed_input], # Both audio and video as inputs
outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
fn=tango_wrapper, # Function that processes both audio and video inputs
label="Select Combined Audio and Video Examples (Cached)",
cache_examples=True,
)
return Interface
if __name__ == "__main__":
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "8675"
demo = make_demo()
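    # share=True also exposes the app through a temporary public Gradio link.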
demo.launch(share=True)