"""Download a small per-speaker sample of the simon3000/genshin-voice dataset.

Dependencies:
    pip install datasets soundfile huggingface_hub librosa

Run as a script to download up to 5 Chinese dialog clips per speaker, then
(optionally) drive a locally running IndexTTS Gradio server with one of the
downloaded prompts.
"""
import io
import os
from collections import defaultdict

from datasets import load_dataset
import soundfile as sf


def download_voices_with_dynamic_counting(output_folder='genshin_voices_sample_5',
                                          max_files_per_speaker=5):
    """Stream the dataset and save audio + transcription per speaker.

    Counts speakers dynamically while streaming; for each speaker at most
    ``max_files_per_speaker`` clips are written, as
    ``<output_folder>/<speaker>/<speaker>_NNNNN.wav`` plus a matching
    ``.txt`` transcription file. Handles the ``bytes`` audio format; the
    ``path`` variant is kept below as deliberately disabled code.

    Args:
        output_folder: Root directory for the downloaded samples.
        max_files_per_speaker: Cap on successfully saved clips per speaker.
    """
    # Streaming mode: iterate the dataset without downloading it whole.
    dataset = load_dataset('simon3000/genshin-voice', split='train', streaming=True)

    # Filter: Chinese language, non-empty transcription, dialog type only.
    filtered_data = dataset.filter(
        lambda x: (
            x['language'] == 'Chinese'
            and x['transcription'] != ''
            and x['type'] == 'Dialog'
        )
    )

    # One counter of *successful* saves per speaker. (The original kept two
    # counters — an eagerly incremented one with rollback on failure, and a
    # success-only index — which were always equal at observation points.)
    speaker_counts = defaultdict(int)
    os.makedirs(output_folder, exist_ok=True)

    for voice in filtered_data:
        speaker = voice['speaker']
        # Skip speakers that already have enough files.
        if speaker_counts[speaker] >= max_files_per_speaker:
            continue

        # File numbers start at 00001; a failed attempt reuses its number.
        file_num = str(speaker_counts[speaker] + 1).zfill(5)

        # Per-speaker subfolder.
        speaker_folder = os.path.join(output_folder, speaker)
        os.makedirs(speaker_folder, exist_ok=True)

        audio_path = os.path.join(speaker_folder, f'{speaker}_{file_num}.wav')
        transcription_path = os.path.join(speaker_folder, f'{speaker}_{file_num}.txt')

        # Audio payload: the streamed record carries raw bytes (or a path).
        audio_data = voice['audio']
        try:
            if 'bytes' in audio_data and audio_data['bytes'] is not None:
                # Decode directly from the in-memory bytes, re-encode as WAV.
                with io.BytesIO(audio_data['bytes']) as audio_bytes:
                    data, samplerate = sf.read(audio_bytes)
                sf.write(audio_path, data, samplerate)
            # NOTE: path-based variant intentionally disabled (was wrapped in
            # a stray ''' ... ''' pair in the original, which broke parsing):
            # elif 'path' in audio_data and os.path.exists(audio_data['path']):
            #     data, samplerate = sf.read(audio_data['path'])
            #     sf.write(audio_path, data, samplerate)
            else:
                print(f"警告: {speaker}的音频数据格式不支持,跳过")
                continue
        except Exception as e:
            print(f"处理{speaker}的音频时出错: {str(e)}")
            continue

        # Save the transcription next to the audio.
        with open(transcription_path, 'w', encoding='utf-8') as f:
            f.write(voice['transcription'])

        # Count only after both files were written successfully.
        speaker_counts[speaker] += 1
        print(
            f"[下载进度] {speaker}_{file_num} | "
            f"进度: {speaker_counts[speaker]}/{max_files_per_speaker}"
        )

    # Final summary.
    print("\n=== 下载结果 ===")
    for speaker, count in speaker_counts.items():
        print(f"{speaker}: {count}个文件")


if __name__ == '__main__':
    download_voices_with_dynamic_counting()

    # Demo: call a locally running IndexTTS Gradio server with one of the
    # downloaded prompts, then copy the generated file next to this script.
    # Guarded by __main__ (the original ran this at import time).
    from gradio_client import Client, handle_file
    from shutil import copy2

    client = Client("http://localhost:7860")
    result = client.predict(
        prompt=handle_file('genshin_voices_sample_5/Ahangar/Ahangar_00001.wav'),
        text="偷窃者没有好下场",
        api_name="/gen_single"
    )
    print(result)
    copy2(result["value"], result["value"].split("/")[-1])


# ---------------------------------------------------------------------------
# Disabled: IndexTTS Gradio server (runs on the Hugging Face Space, not here).
# Kept verbatim for reference inside a """-delimited string so that the
# inner ``gr.HTML(''' ... ''')`` triple quotes no longer terminate the
# surrounding comment block early (the original used ''' for both, which
# made the file unparseable).
# ---------------------------------------------------------------------------
_INDEX_TTS_APP_REFERENCE = """
import spaces
import os
import shutil
import threading
import time
import sys
from huggingface_hub import snapshot_download

current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
sys.path.append(os.path.join(current_dir, "indextts"))

import gradio as gr
from indextts.infer import IndexTTS
from tools.i18n.i18n import I18nAuto

i18n = I18nAuto(language="zh_CN")
MODE = 'local'
snapshot_download("IndexTeam/IndexTTS-1.5", local_dir="checkpoints",)
tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")

os.makedirs("outputs/tasks", exist_ok=True)
os.makedirs("prompts", exist_ok=True)


@spaces.GPU
def infer(voice, text, output_path=None):
    if not tts:
        raise Exception("Model not loaded")
    if not output_path:
        output_path = os.path.join("outputs", f"spk_{int(time.time())}.wav")
    tts.infer(voice, text, output_path)
    return output_path


def tts_api(voice, text):
    try:
        output_path = infer(voice, text)
        with open(output_path, "rb") as f:
            audio_bytes = f.read()
        return (200, {}, audio_bytes)
    except Exception as e:
        return (500, {"error": str(e)}, None)


def gen_single(prompt, text):
    output_path = infer(prompt, text)
    return gr.update(value=output_path, visible=True)


def update_prompt_audio():
    update_button = gr.update(interactive=True)
    return update_button


with gr.Blocks() as demo:
    mutex = threading.Lock()
    gr.HTML('''
    IndexTTS: An Industrial-Level Controllable and Efficient Zero-Shot
    Text-To-Speech System
    ''')
    with gr.Tab("音频生成"):
        with gr.Row():
            os.makedirs("prompts", exist_ok=True)
            prompt_audio = gr.Audio(label="请上传参考音频", key="prompt_audio",
                                    sources=["upload", "microphone"],
                                    type="filepath")
            prompt_list = os.listdir("prompts")
            default = ''
            if prompt_list:
                default = prompt_list[0]
            input_text_single = gr.Textbox(label="请输入目标文本",
                                           key="input_text_single")
            gen_button = gr.Button("生成语音", key="gen_button", interactive=True)
            output_audio = gr.Audio(label="生成结果", visible=False,
                                    key="output_audio")

    prompt_audio.upload(update_prompt_audio, inputs=[], outputs=[gen_button])
    gen_button.click(gen_single,
                     inputs=[prompt_audio, input_text_single],
                     outputs=[output_audio])

# UI and API share the single Blocks demo; call it via POST /run/predict.
# (Interface duplication, add_api_route and mount_gradio_app removed —
# not supported on Spaces.)


def main():
    tts.load_normalizer()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)


if __name__ == "__main__":
    main()
"""