from typing import cast

import gradio as gr
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

from speecht5_openjtalk_tokenizer import SpeechT5OpenjtalkTokenizer
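
# Expose the custom OpenJTalk tokenizer as an attribute of the transformers module so
# that SpeechT5Processor.from_pretrained can resolve the tokenizer_class name used below.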
setattr(transformers, SpeechT5OpenjtalkTokenizer.__name__, SpeechT5OpenjtalkTokenizer)


class SpeechT5OpenjtalkProcessor(SpeechT5Processor):
    tokenizer_class = SpeechT5OpenjtalkTokenizer.__name__
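
# Load the Japanese SpeechT5 TTS checkpoint, its OpenJTalk-based processor, and the
# HiFi-GAN vocoder; move the networks to the GPU when one is available.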
model = SpeechT5ForTextToSpeech.from_pretrained("esnya/japanese_speecht5_tts")
assert isinstance(model, SpeechT5ForTextToSpeech)

processor = SpeechT5OpenjtalkProcessor.from_pretrained("esnya/japanese_speecht5_tts")
assert isinstance(processor, SpeechT5OpenjtalkProcessor)

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
assert isinstance(vocoder, SpeechT5HifiGan)

if torch.cuda.is_available():
    model = model.cuda()
    vocoder = vocoder.cuda()
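
# Scale and clip a float32 waveform in [-1.0, 1.0] to 16-bit PCM for the Gradio
# Audio output.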
def convert_float32_to_int16(wav: np.ndarray) -> np.ndarray:
    assert wav.dtype == np.float32
    return np.clip(wav * 32768.0, -32768.0, 32767.0).astype(np.int16)
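
# Synthesize speech from Japanese text using a freshly drawn random speaker embedding,
# returning both the waveform and the embedding values for plotting.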
def text_to_speech(
    text: str,
    threshold: float = 0.5,
    minlenratio: float = 0.0,
    maxlenratio: float = 10.0,
):
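    # Draw a random speaker embedding uniformly in [-1, 1); every call therefore
    # produces a different voice.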
    speaker_embeddings = (
        torch.rand(
            (1, model.config.speaker_embedding_dim),
            dtype=torch.float32,
            device=model.device,
        )
        * 2
        - 1
    )
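
    # Tokenize the text with the OpenJTalk-based processor and move the IDs to the
    # model device.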
    input_ids = processor(text=text, return_tensors="pt")
    assert input_ids is not None
    input_ids = input_ids.input_ids.to(model.device)

    speaker_embeddings = cast(torch.FloatTensor, speaker_embeddings)
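
    # Generate a spectrogram from the tokens and render it to a waveform with the
    # HiFi-GAN vocoder; threshold, minlenratio and maxlenratio control when decoding stops.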
    wav = model.generate_speech(
        input_ids,
        speaker_embeddings,
        threshold=threshold,
        minlenratio=minlenratio,
        maxlenratio=maxlenratio,
        vocoder=vocoder,
    )
    wav = cast(torch.FloatTensor, wav)
    wav = convert_float32_to_int16(wav.reshape(-1).cpu().float().numpy())
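
    # The (sample_rate, int16 samples) tuple feeds the Audio component; the DataFrame
    # feeds the BarPlot of the speaker embedding.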
    return [
        (vocoder.config.sampling_rate, wav),
        pd.DataFrame(
            {
                "dim": range(speaker_embeddings.shape[-1]),
                "value": speaker_embeddings[0].cpu().float().numpy(),
            }
        ),
    ]
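
# Gradio UI: a text box and decoding-control sliders as inputs; audio plus a bar plot
# of the randomly generated speaker embedding as outputs.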
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        "text",
        gr.Slider(0, 0.5, 0.5, label="threshold"),
        gr.Slider(0, 100, 0, label="minlenratio"),
        gr.Slider(0, 100, 10, label="maxlenratio"),
    ],
    outputs=[
        "audio",
        gr.BarPlot(
            label="speaker_embedding (randomly generated)",
            x="dim",
            y="value",
            y_lim=[-1, 1],
        ),
    ],
)

demo.launch()