Spaces:

alex16052G
/

abi

Paused

App Files Files Community

alex16052G committed on Jan 22

Commit

c761f75

verified ·

1 Parent(s): a8028fe

Update chat_ai.py

Browse files

Files changed (1) hide show

chat_ai.py +96 -27

chat_ai.py CHANGED Viewed

@@ -3,6 +3,7 @@
 # ruff: noqa: E402
 # Above allows ruff to ignore E402: module level import not at top of file
 import tempfile
 import os
@@ -12,10 +13,7 @@ import gradio as gr
 import soundfile as sf
 import torchaudio
 from cached_path import cached_path
-from transformers import (
-    WhisperProcessor,
-    WhisperForConditionalGeneration,
-)
 try:
     import spaces
@@ -33,6 +31,7 @@ from f5_tts.model import DiT
 from f5_tts.infer.utils_infer import (
     load_vocoder,
     load_model,
     infer_process,
     remove_silence_for_generated_wav,
     save_spectrogram,
@@ -47,7 +46,7 @@ F5TTS_ema_model = load_model(
     DiT, F5TTS_model_cfg, str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors"))
 )
-# Cargar el modelo Whisper para transcripción (si decides usarlo en el futuro)
 whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
 whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
 whisper_model.eval()
@@ -56,21 +55,33 @@ if torch.cuda.is_available():
 @gpu_decorator
 def infer(
-    gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1
 ):
-    """Genera el audio sintetizado a partir del texto"""
     try:
-        # El texto ingresado se usa directamente sin modificaciones
         input_text = gen_text
         print(f"Texto para generar audio: {input_text}")  # Debug: Verificar el texto
         final_wave, final_sample_rate, combined_spectrogram = infer_process(
-            ref_audio_orig=None,  # No se utiliza audio de referencia
-            ref_text=None,        # No se utiliza texto de referencia
-            gen_text=input_text,
-            model=model,
-            remove_silence=remove_silence,
             cross_fade_duration=cross_fade_duration,
             speed=speed,
             progress=gr.Progress(),
@@ -95,9 +106,50 @@ def infer(
         print(f"Error en infer: {e}")
         return None, None
 @gpu_decorator
-def generate_audio(text, model_choice, remove_silence):
-    """Genera el audio a partir del texto ingresado"""
     try:
         if not text.strip():
             return None, "Por favor, ingresa un texto para generar el audio."
@@ -105,27 +157,37 @@ def generate_audio(text, model_choice, remove_silence):
         # Debug: Verificar el texto ingresado
         print(f"Texto ingresado para TTS: {text}")
-        # Usar directamente el texto ingresado sin limpieza
         input_text = text
         print(f"Texto final para inferencia: {input_text}")  # Debug
         audio_result, spectrogram_path = infer(
             gen_text=input_text,
             model=model_choice,
             remove_silence=remove_silence,
             cross_fade_duration=0.15,
             speed=1.0,
         )
         if audio_result is None:
             return None, "Error al generar el audio."
         sample_rate, waveform = audio_result
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             sf.write(f.name, waveform, sample_rate)
             audio_path = f.name
         return audio_path, "Audio generado exitosamente."
     except Exception as e:
         print(f"Error en generate_audio: {e}")
@@ -134,12 +196,19 @@ def generate_audio(text, model_choice, remove_silence):
 with gr.Blocks() as app:
     gr.Markdown(
         """
-# Conversor de Texto a Voz
-Escribe un texto y la IA lo convertirá a voz.
         """
     )
     with gr.Row():
         with gr.Column():
             model_choice = gr.Radio(
                 choices=["F5-TTS"],
@@ -161,12 +230,12 @@ Escribe un texto y la IA lo convertirá a voz.
     with gr.Row():
         audio_output = gr.Audio(label="Audio Generado", autoplay=True)
     status = gr.Textbox(label="Estado", interactive=False)
     generate_btn.click(
         generate_audio,
-        inputs=[text_input, model_choice, remove_silence],
         outputs=[audio_output, status],
     )
@@ -183,11 +252,11 @@ Escribe un texto y la IA lo convertirá a voz.
 @click.option("--api", "-a", default=True, is_flag=True, help="Permitir acceso a la API")
 def main(port, host, share, api):
     """Función principal para lanzar la aplicación Gradio de Texto a Voz."""
-    print("Iniciando la aplicación de Texto a Voz...")
     app.queue(api_open=api).launch(
-        server_name=host,
-        server_port=port,
-        share=share,
         show_api=api
     )

 # ruff: noqa: E402
 # Above allows ruff to ignore E402: module level import not at top of file
+import re
 import tempfile
 import os
 import soundfile as sf
 import torchaudio
 from cached_path import cached_path
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 try:
     import spaces
 from f5_tts.infer.utils_infer import (
     load_vocoder,
     load_model,
+    preprocess_ref_audio_text,
     infer_process,
     remove_silence_for_generated_wav,
     save_spectrogram,
     DiT, F5TTS_model_cfg, str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors"))
 )
+# Cargar el modelo Whisper para transcripción
 whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
 whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
 whisper_model.eval()
 @gpu_decorator
 def infer(
+    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1
 ):
+    """Genera el audio sintetizado a partir del texto utilizando la voz de referencia."""
     try:
+        # Preprocesar el audio de referencia y el texto de referencia
+        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text)
+        ema_model = F5TTS_ema_model
+        # Asegurar que el texto a generar esté correctamente formateado
+        if not gen_text.startswith(" "):
+            gen_text = " " + gen_text
+        if not gen_text.endswith(". "):
+            gen_text += ". "
+        # El texto ingresado por el usuario se utiliza directamente sin modificaciones
         input_text = gen_text
         print(f"Texto para generar audio: {input_text}")  # Debug: Verificar el texto
+        # Procesar la inferencia para generar el audio
         final_wave, final_sample_rate, combined_spectrogram = infer_process(
+            ref_audio,
+            ref_text,
+            input_text,
+            ema_model,
+            vocoder,
             cross_fade_duration=cross_fade_duration,
             speed=speed,
             progress=gr.Progress(),
         print(f"Error en infer: {e}")
         return None, None
+def transcribe_audio(audio_path):
+    """Transcribe el audio de referencia usando el modelo Whisper en español."""
+    # Transcribe the reference audio file at `audio_path` to Spanish text with
+    # the module-level Whisper processor/model.
+    #
+    # Returns the decoded transcription string on success. Any exception raised
+    # inside the `try` (including the FileNotFoundError raised below) is caught,
+    # printed, and turned into a `None` return value, so callers must check for
+    # `None` rather than expect an exception.
+    try:
+        # Fail early with a descriptive message; this is caught by the broad
+        # `except` at the bottom and reported as a `None` result.
+        if not os.path.exists(audio_path):
+            raise FileNotFoundError(f"Archivo de audio no encontrado: {audio_path}")
+        # Load the audio
+        audio, rate = torchaudio.load(audio_path)
+        # Resample to 16 kHz if needed (the processor below is called with
+        # sampling_rate=16000, so the waveform has to match that rate)
+        if rate != 16000:
+            resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
+            audio = resampler(audio)
+        # Make sure the audio is one-dimensional by averaging over dim 0
+        # (presumably the channel axis of torchaudio.load's output, i.e. a
+        # mono downmix -- TODO confirm)
+        if audio.ndim > 1:
+            audio = torch.mean(audio, dim=0)
+        # Run the audio through the Whisper processor
+        inputs = whisper_processor(audio.cpu().numpy(), sampling_rate=16000, return_tensors="pt")
+        # NOTE(review): inputs are moved to CUDA whenever it is available; this
+        # assumes whisper_model was also moved to CUDA elsewhere in the file --
+        # confirm, otherwise generate() would see mismatched devices.
+        if torch.cuda.is_available():
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+        # Force the language to Spanish (using the English language name)
+        forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language="spanish", task="transcribe")
+        # Generate the transcription
+        predicted_ids = whisper_model.generate(
+            inputs["input_features"],
+            forced_decoder_ids=forced_decoder_ids
+        )
+        # Only the first generated sequence is decoded (a single clip is passed in)
+        transcription = whisper_processor.decode(predicted_ids[0], skip_special_tokens=True)
+        print(f"Transcripción: {transcription}")  # Debug: check the transcription
+        return transcription
+    except Exception as e:
+        # Best-effort: log the failure and signal it to the caller with None
+        print(f"Error en transcribe_audio: {e}")
+        return None
 @gpu_decorator
+def generate_audio(text, ref_audio, ref_text, model_choice, remove_silence):
+    """Genera el audio a partir del texto ingresado utilizando la voz de referencia."""
     try:
         if not text.strip():
             return None, "Por favor, ingresa un texto para generar el audio."
         # Debug: Verificar el texto ingresado
         print(f"Texto ingresado para TTS: {text}")
+        # Si se proporciona audio de referencia y no se proporciona texto de referencia, transcribir el audio
+        if ref_audio and not ref_text.strip():
+            ref_text = transcribe_audio(ref_audio)
+            if ref_text is None:
+                return None, "Error al transcribir el audio de referencia."
+            print(f"Texto de referencia transcrito: {ref_text}")  # Debug
+        # Usar directamente el texto ingresado para generar el audio
         input_text = text
         print(f"Texto final para inferencia: {input_text}")  # Debug
+        # Generar el audio utilizando la función infer
         audio_result, spectrogram_path = infer(
+            ref_audio_orig=ref_audio,
+            ref_text=ref_text,
             gen_text=input_text,
             model=model_choice,
             remove_silence=remove_silence,
             cross_fade_duration=0.15,
             speed=1.0,
         )
         if audio_result is None:
             return None, "Error al generar el audio."
         sample_rate, waveform = audio_result
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             sf.write(f.name, waveform, sample_rate)
             audio_path = f.name
         return audio_path, "Audio generado exitosamente."
     except Exception as e:
         print(f"Error en generate_audio: {e}")
 with gr.Blocks() as app:
     gr.Markdown(
         """
+# Conversor de Texto a Voz con Clonación de Voz
+Sube un audio de referencia para clonar la voz y luego escribe el texto que deseas convertir a voz.
         """
     )
     with gr.Row():
+        with gr.Column():
+            ref_audio = gr.Audio(label="Audio de Referencia (Clonación de Voz)", type="filepath")
+            ref_text = gr.Textbox(
+                label="Texto de Referencia (Opcional)",
+                info="Opcional: Deja en blanco para transcribir automáticamente el audio de referencia",
+                lines=2,
+            )
         with gr.Column():
             model_choice = gr.Radio(
                 choices=["F5-TTS"],
     with gr.Row():
         audio_output = gr.Audio(label="Audio Generado", autoplay=True)
     status = gr.Textbox(label="Estado", interactive=False)
     generate_btn.click(
         generate_audio,
+        inputs=[text_input, ref_audio, ref_text, model_choice, remove_silence],
         outputs=[audio_output, status],
     )
 @click.option("--api", "-a", default=True, is_flag=True, help="Permitir acceso a la API")
 def main(port, host, share, api):
     """Función principal para lanzar la aplicación Gradio de Texto a Voz."""
+    print("Iniciando la aplicación de Texto a Voz con Clonación de Voz...")
     app.queue(api_open=api).launch(
+        server_name=host,
+        server_port=port,
+        share=share,
         show_api=api
     )