naishwarya committed
Commit 4f18cf9 · 1 parent: e551362

fix tts with gtts

Files changed (2):
  1. app_merlin_ai_coach.py  (+20 -11)
  2. requirements.txt  (+2 -1)
app_merlin_ai_coach.py CHANGED

@@ -21,7 +21,8 @@ import numpy as np
 import soundfile as sf
 import whisper
 # from TTS.api import TTS
-# from TTS.utils.manage import ModelManager # <-- Add this import
+from gtts import gTTS
+import io
 
 # Load environment variables from .env if present
 load_dotenv()
@@ -592,7 +593,6 @@ def build_merlin_graph():
 
 # --- Load models (smallest variants for speed) ---
 whisper_model = whisper.load_model("base")
-#tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=torch.cuda.is_available())
 
 def transcribe_audio(audio):
     """
@@ -608,16 +608,25 @@ def transcribe_audio(audio):
 
 def synthesize_speech(text):
     """
-    Synthesize speech from text using Coqui TTS.
+    Synthesize speech from text using gTTS.
     Returns a (sample_rate, numpy array) tuple.
     """
-    return None
-    # if not text:
-    #     return None
-    # wav = tts_model.tts(text)
-    # # Ensure output is a numpy array
-    # wav_np = np.array(wav, dtype=np.float32)
-    # return (22050, wav_np)
+    if not text:
+        return None
+    tts = gTTS(text)
+    buf = io.BytesIO()
+    tts.write_to_fp(buf)
+    buf.seek(0)
+    # Read mp3 from buffer and convert to numpy array (mono, 22050Hz)
+    import tempfile
+    import numpy as np
+    import soundfile as sf
+    import librosa
+    with tempfile.NamedTemporaryFile(suffix=".mp3") as tmp:
+        tmp.write(buf.read())
+        tmp.flush()
+        wav, sr = librosa.load(tmp.name, sr=22050, mono=True)
+    return (sr, wav.astype(np.float32))
 
 def get_task_dropdown_choices():
     """
@@ -755,7 +764,7 @@ with gr.Blocks(title="🧙 Merlin AI Coach") as demo:
     checklist_str = show_checklist()
     chat_history = chat_history + [[user_message, assistant_display]]
     # Synthesize assistant reply to audio only if TTS is enabled
-    audio_reply = synthesize_speech(assistant_display) if False else None
+    audio_reply = synthesize_speech(assistant_display) if tts_enabled else None
     # Always keep conversation group visible
     return chat_history, notes_str, checklist_str, "", tasks_str, state_plan_val, gr.update(visible=False), audio_reply, gr.update(visible=True)
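A note on the new synthesize_speech hunk above: gTTS only produces MP3 bytes, so the function writes them to a temporary file and lets librosa decode that into a mono 22050 Hz float32 array, the (sample_rate, numpy array) shape the rest of the app expects. One portability caveat: on Windows a NamedTemporaryFile cannot be reopened by name while it is still open, so calling librosa.load inside the with block can fail there. Below is a minimal sketch of a variant that sidesteps this with delete=False; synthesize_speech_portable is an illustrative name, not part of the commit.

import io
import os
import tempfile

import librosa
import numpy as np
from gtts import gTTS

def synthesize_speech_portable(text):
    # Same gTTS -> MP3 -> librosa pipeline as the hunk above, but the
    # temp file is closed before librosa reopens it by name, which also
    # works on Windows. Returns (sample_rate, float32 waveform) or None.
    if not text:
        return None
    buf = io.BytesIO()
    gTTS(text).write_to_fp(buf)  # stream the MP3 bytes into memory
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    try:
        tmp.write(buf.getvalue())
        tmp.close()  # release the handle before librosa opens the path
        wav, sr = librosa.load(tmp.name, sr=22050, mono=True)
    finally:
        if not tmp.closed:
            tmp.close()
        os.unlink(tmp.name)  # delete=False means we clean up ourselves
    return (sr, wav.astype(np.float32))

Note also that gTTS calls Google Translate's TTS endpoint over HTTP, so synthesis needs network access and raises gTTSError when the service is unreachable.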
 
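The hard-coded "if False" guard in the last hunk becomes a real tts_enabled flag, which is what actually switches the gTTS path on. The diff does not show where tts_enabled comes from; a plausible wiring, assuming it is the value of a Gradio checkbox passed as an extra input to the handler and that synthesize_speech from above is in scope (all other names below are hypothetical, not identifiers from this commit):

import gradio as gr

# Hypothetical sketch: a Checkbox value travels into the submit callback
# as a plain bool, gating speech synthesis per user preference.
with gr.Blocks() as demo:
    tts_checkbox = gr.Checkbox(label="Speak replies aloud", value=False)
    msg = gr.Textbox(label="Message")
    chat = gr.Chatbot()
    audio_out = gr.Audio(label="Spoken reply", type="numpy")

    def respond(user_message, chat_history, tts_enabled):
        assistant_display = f"Echo: {user_message}"  # stand-in for the real reply
        chat_history = chat_history + [[user_message, assistant_display]]
        audio_reply = synthesize_speech(assistant_display) if tts_enabled else None
        return chat_history, audio_reply

    msg.submit(respond, [msg, chat, tts_checkbox], [chat, audio_out])

The (sample_rate, numpy array) tuple from synthesize_speech is accepted directly by gr.Audio with type="numpy", so nothing extra is needed on the UI side.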
 
requirements.txt CHANGED

@@ -29,10 +29,11 @@ soundfile==0.13.1
 openai-whisper
 
 # For text-to-speech (TTS)
-#TTS==0.22.0
+gTTS==2.5.1
 
 # For audio processing
 torch==2.7.1
 numpy==1.26.4
+librosa==0.10.1
 
 # llama-index-llms-openllm==0.4.1
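One thing the version pins do not capture: librosa decodes MP3 through a backend (soundfile with an mp3-capable libsndfile, or audioread plus ffmpeg), so it is worth smoke-testing the new dependencies in the target environment. A minimal check, assuming network access since gTTS calls out to Google's endpoint:

import os
import tempfile

import librosa
from gtts import gTTS

# Render a short phrase and confirm it decodes to a non-empty waveform.
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
    gTTS("dependency check").write_to_fp(tmp)
wav, sr = librosa.load(tmp.name, sr=22050, mono=True)
os.unlink(tmp.name)
print(f"OK: {len(wav)} samples at {sr} Hz")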