naishwarya committed
Commit 4f18cf9 · 1 parent: e551362

fix tts with gtts

Files changed (2):
  1. app_merlin_ai_coach.py  (+20 -11)
  2. requirements.txt  (+2 -1)
app_merlin_ai_coach.py CHANGED

@@ -21,7 +21,8 @@ import numpy as np
 import soundfile as sf
 import whisper
 # from TTS.api import TTS
-# from TTS.utils.manage import ModelManager # <-- Add this import
+from gtts import gTTS
+import io
 
 # Load environment variables from .env if present
 load_dotenv()
@@ -592,7 +593,6 @@ def build_merlin_graph():
 
 # --- Load models (smallest variants for speed) ---
 whisper_model = whisper.load_model("base")
-#tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=torch.cuda.is_available())
 
 def transcribe_audio(audio):
     """
@@ -608,16 +608,25 @@ def transcribe_audio(audio):
 
 def synthesize_speech(text):
     """
-    Synthesize speech from text using Coqui TTS.
+    Synthesize speech from text using gTTS.
     Returns a (sample_rate, numpy array) tuple.
     """
-    return None
-    # if not text:
-    #     return None
-    # wav = tts_model.tts(text)
-    # # Ensure output is a numpy array
-    # wav_np = np.array(wav, dtype=np.float32)
-    # return (22050, wav_np)
+    if not text:
+        return None
+    tts = gTTS(text)
+    buf = io.BytesIO()
+    tts.write_to_fp(buf)
+    buf.seek(0)
+    # Read mp3 from buffer and convert to numpy array (mono, 22050Hz)
+    import tempfile
+    import numpy as np
+    import soundfile as sf
+    import librosa
+    with tempfile.NamedTemporaryFile(suffix=".mp3") as tmp:
+        tmp.write(buf.read())
+        tmp.flush()
+        wav, sr = librosa.load(tmp.name, sr=22050, mono=True)
+    return (sr, wav.astype(np.float32))
 
 def get_task_dropdown_choices():
     """
@@ -755,7 +764,7 @@ with gr.Blocks(title="🧙 Merlin AI Coach") as demo:
     checklist_str = show_checklist()
     chat_history = chat_history + [[user_message, assistant_display]]
     # Synthesize assistant reply to audio only if TTS is enabled
-    audio_reply = synthesize_speech(assistant_display) if False else None
+    audio_reply = synthesize_speech(assistant_display) if tts_enabled else None
     # Always keep conversation group visible
     return chat_history, notes_str, checklist_str, "", tasks_str, state_plan_val, gr.update(visible=False), audio_reply, gr.update(visible=True)
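A note on the new synthesize_speech hunk above: gTTS only produces MP3 bytes, so the function writes them to a temporary file and lets librosa decode that into a mono 22050 Hz float32 array, the (sample_rate, numpy array) shape the rest of the app expects. One portability caveat: on Windows a NamedTemporaryFile cannot be reopened by name while it is still open, so calling librosa.load inside the with block can fail there. Below is a minimal sketch of a variant that sidesteps this with delete=False; synthesize_speech_portable is an illustrative name, not part of the commit.

import io
import os
import tempfile

import librosa
import numpy as np
from gtts import gTTS

def synthesize_speech_portable(text):
    # Same gTTS -> MP3 -> librosa pipeline as the hunk above, but the
    # temp file is closed before librosa reopens it by name, which also
    # works on Windows. Returns (sample_rate, float32 waveform) or None.
    if not text:
        return None
    buf = io.BytesIO()
    gTTS(text).write_to_fp(buf)  # stream the MP3 bytes into memory
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    try:
        tmp.write(buf.getvalue())
        tmp.close()  # release the handle before librosa opens the path
        wav, sr = librosa.load(tmp.name, sr=22050, mono=True)
    finally:
        if not tmp.closed:
            tmp.close()
        os.unlink(tmp.name)  # delete=False means we clean up ourselves
    return (sr, wav.astype(np.float32))

Note also that gTTS calls Google Translate's TTS endpoint over HTTP, so synthesis needs network access and raises gTTSError when the service is unreachable.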
 
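The hard-coded "if False" guard in the last hunk becomes a real tts_enabled flag, which is what actually switches the gTTS path on. The diff does not show where tts_enabled comes from; a plausible wiring, assuming it is the value of a Gradio checkbox passed as an extra input to the handler and that synthesize_speech from above is in scope (all other names below are hypothetical, not identifiers from this commit):

import gradio as gr

# Hypothetical sketch: a Checkbox value travels into the submit callback
# as a plain bool, gating speech synthesis per user preference.
with gr.Blocks() as demo:
    tts_checkbox = gr.Checkbox(label="Speak replies aloud", value=False)
    msg = gr.Textbox(label="Message")
    chat = gr.Chatbot()
    audio_out = gr.Audio(label="Spoken reply", type="numpy")

    def respond(user_message, chat_history, tts_enabled):
        assistant_display = f"Echo: {user_message}"  # stand-in for the real reply
        chat_history = chat_history + [[user_message, assistant_display]]
        audio_reply = synthesize_speech(assistant_display) if tts_enabled else None
        return chat_history, audio_reply

    msg.submit(respond, [msg, chat, tts_checkbox], [chat, audio_out])

The (sample_rate, numpy array) tuple from synthesize_speech is accepted directly by gr.Audio with type="numpy", so nothing extra is needed on the UI side.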
 
requirements.txt CHANGED

@@ -29,10 +29,11 @@ soundfile==0.13.1
 openai-whisper
 
 # For text-to-speech (TTS)
-#TTS==0.22.0
+gTTS==2.5.1
 
 # For audio processing
 torch==2.7.1
 numpy==1.26.4
+librosa==0.10.1
 
 # llama-index-llms-openllm==0.4.1
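One thing the version pins do not capture: librosa decodes MP3 through a backend (soundfile with an mp3-capable libsndfile, or audioread plus ffmpeg), so it is worth smoke-testing the new dependencies in the target environment. A minimal check, assuming network access since gTTS calls out to Google's endpoint:

import os
import tempfile

import librosa
from gtts import gTTS

# Render a short phrase and confirm it decodes to a non-empty waveform.
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
    gTTS("dependency check").write_to_fp(tmp)
wav, sr = librosa.load(tmp.name, sr=22050, mono=True)
os.unlink(tmp.name)
print(f"OK: {len(wav)} samples at {sr} Hz")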