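"""Hugging Face Space: Spanish text-to-speech demo.

Exposes several TTS voices (Facebook MMS VITS and Microsoft SpeechT5)
behind a Gradio UI and API, returning base64-encoded WAV audio plus
timing metadata, with a small FIFO response cache.
"""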
import io
import base64
import time

import gradio as gr
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    VitsTokenizer,
    VitsModel,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    set_seed
)
from scipy.io import wavfile
# Optimization settings
set_seed(42)
torch.set_grad_enabled(False)  # Disable gradients for inference
torch.set_num_threads(4)       # Limit threads to avoid oversubscription

# Cache for frequent requests
CACHE_SIZE = 50  # Maximum number of cached responses
cache = {}
# Available model configurations
MODELS = {
    "facebook_mms": {
        "name": "Facebook MMS (Español)",
        "model_id": "facebook/mms-tts-spa",
        "type": "vits"
    },
    "facebook_fast": {
        # NOTE: this checkpoint may not be available on the Hub in a
        # VITS-compatible format; loading it can fail.
        "name": "Facebook FastSpeech2 (Español)",
        "model_id": "facebook/fastspeech2-es-cv",
        "type": "vits"
    },
    "microsoft_female": {
        # SpeechT5 was trained on English speech, so Spanish quality is limited.
        "name": "Microsoft SpeechT5 (Femenina)",
        "model_id": "microsoft/speecht5_tts",
        # Source of the x-vector speaker embeddings (see get_speaker_embeddings)
        "speaker_embeddings": "Matthijs/cmu-arctic-xvectors",
        "speaker_id": "es_female",
        "type": "speecht5"
    },
    "microsoft_male": {
        "name": "Microsoft SpeechT5 (Masculino)",
        "model_id": "microsoft/speecht5_tts",
        "speaker_embeddings": "Matthijs/cmu-arctic-xvectors",
        "speaker_id": "es_male",
        "type": "speecht5"
    }
}
# Lazily loaded model instances
models = {}
tokenizers = {}
processors = {}
vocoders = {}

def load_model(model_key):
    """Load a model only the first time it is needed."""
    if model_key not in models:
        start_time = time.time()
        model_info = MODELS[model_key]
        if model_info["type"] == "vits":
            tokenizers[model_key] = VitsTokenizer.from_pretrained(model_info["model_id"])
            models[model_key] = VitsModel.from_pretrained(model_info["model_id"])
        elif model_info["type"] == "speecht5":
            processors[model_key] = SpeechT5Processor.from_pretrained(model_info["model_id"])
            models[model_key] = SpeechT5ForTextToSpeech.from_pretrained(model_info["model_id"])
            vocoders[model_key] = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        load_time = time.time() - start_time
        print(f"Model {model_key} loaded in {load_time:.2f} seconds")
# Build the cache key for a request
def get_cache_key(text, model_key):
    return f"{text}_{model_key}"
def text_to_speech(text, model_key="facebook_mms"):
    """
    Convert Spanish text to audio using the selected model.
    """
    if not text or text.strip() == "":
        return {
            "audio": None,
            "error": "El texto no puede estar vacío",
            "model": model_key
        }

    # Check the cache first
    cache_key = get_cache_key(text, model_key)
    if cache_key in cache:
        print(f"Cache hit for: {text[:20]}...")
        return cache[cache_key]
    try:
        # Load the model if it is not already in memory
        load_model(model_key)
        model_info = MODELS[model_key]
        start_time = time.time()

        if model_info["type"] == "vits":
            # VITS model (Facebook MMS / FastSpeech2)
            inputs = tokenizers[model_key](text=text, return_tensors="pt")
            with torch.no_grad():
                output = models[model_key](**inputs).waveform
            audio_data = output.squeeze().numpy()
            sample_rate = models[model_key].config.sampling_rate
        elif model_info["type"] == "speecht5":
            # SpeechT5 model (Microsoft)
            processor = processors[model_key]
            model = models[model_key]
            vocoder = vocoders[model_key]
            # Preprocess the text
            inputs = processor(text=text, return_tensors="pt")
            # Look up the speaker embedding for the requested voice
            speaker_embeddings = get_speaker_embeddings(model_info["speaker_id"])
            # Generate the waveform, letting generate_speech run the vocoder
            with torch.no_grad():
                speech = model.generate_speech(
                    inputs["input_ids"], speaker_embeddings, vocoder=vocoder
                )
            audio_data = speech.numpy()
            sample_rate = 16000
        # Normalize audio to avoid clipping distortion
        audio_data = np.clip(audio_data, -1.0, 1.0)

        # If float, convert to int16 for the WAV file
        if audio_data.dtype in [np.float32, np.float64]:
            audio_data = (audio_data * 32767).astype(np.int16)

        # Write to an in-memory WAV file
        buffer = io.BytesIO()
        wavfile.write(buffer, sample_rate, audio_data)
        buffer.seek(0)

        # Base64-encode for the API
        audio_base64 = base64.b64encode(buffer.read()).decode('utf-8')
        # Build the result
        result = {
            "audio": audio_base64,
            "format": "wav",
            "sampling_rate": sample_rate,
            "model": model_key,
            "processing_time": round(time.time() - start_time, 2)
        }

        # Store in the cache, evicting the oldest entry once the limit is hit
        if len(cache) >= CACHE_SIZE:
            oldest_key = next(iter(cache))
            del cache[oldest_key]
        cache[cache_key] = result

        return result
    except Exception as e:
        return {
            "audio": None,
            "error": str(e),
            "model": model_key
        }
# Model choices for the interface
model_choices = [info["name"] for info in MODELS.values()]
model_keys = list(MODELS.keys())
# Build the Gradio interface
with gr.Blocks(title="Text-to-Speech en Español - Múltiples Voces") as demo:
    gr.Markdown("# Conversor de Texto a Voz en Español")
    gr.Markdown("### Genere audio en español con diferentes voces")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Texto a convertir en voz",
                lines=5,
                placeholder="Escribe aquí el texto en español que quieres convertir a voz..."
            )
            model_dropdown = gr.Dropdown(
                choices=model_choices,
                value=model_choices[0],
                label="Selecciona la voz"
            )
            generate_btn = gr.Button("Generar audio")
        with gr.Column():
            audio_output = gr.Audio(label="Audio generado", type="numpy")
            json_output = gr.JSON(label="Respuesta API (para desarrolladores)")
    # Handler for the graphical interface
    def generate_for_ui(text, model_choice):
        # Map the dropdown selection back to its model key
        selected_idx = model_choices.index(model_choice)
        model_key = model_keys[selected_idx]

        # Run generation
        result = text_to_speech(text, model_key)

        # If there is an error, surface it
        if not result.get("audio") or "error" in result:
            return None, result

        # Decode the audio for the interface
        try:
            audio_bytes = base64.b64decode(result["audio"])
            buffer = io.BytesIO(audio_bytes)
            sr, audio_array = wavfile.read(buffer)
            # Convert to float for Gradio if needed
            if audio_array.dtype == np.int16:
                audio_array = audio_array.astype(np.float32) / 32768.0
            return (sr, audio_array), result
        except Exception as e:
            return None, {"error": str(e), "model": model_key}
    # Wire up events
    generate_btn.click(
        fn=generate_for_ui,
        inputs=[text_input, model_dropdown],
        outputs=[audio_output, json_output]
    )
    # Example inputs (kept in Spanish: they are fed to the Spanish TTS models)
    gr.Examples(
        examples=[
            ["Hola, este es un ejemplo de texto a voz en español usando inteligencia artificial.", model_choices[0]],
            ["La inteligencia artificial ha avanzado significativamente en los últimos años.", model_choices[1]],
            ["Me gustaría reservar una mesa para dos personas en el restaurante esta noche.", model_choices[2]],
            ["El tiempo estará soleado mañana, con temperaturas máximas de 25 grados.", model_choices[3]]
        ],
        inputs=[text_input, model_dropdown]
    )
# Serve the UI and the API with a small request queue
demo.queue(max_size=10).launch(debug=True, share=False, show_api=True)
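
# --- Usage sketch -------------------------------------------------------------
# Minimal example of calling text_to_speech directly and writing the decoded
# audio to disk (hypothetical filename; run from another module, since
# launch() above blocks):
#
#     result = text_to_speech("Hola, ¿cómo estás?", "facebook_mms")
#     if result.get("audio"):
#         with open("salida.wav", "wb") as f:
#             f.write(base64.b64decode(result["audio"]))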