Commit 4f18cf9
1 Parent(s): e551362

fix tts with gts

Files changed:
- app_merlin_ai_coach.py  +20 -11
- requirements.txt  +2 -1
app_merlin_ai_coach.py CHANGED

@@ -21,7 +21,8 @@ import numpy as np
 import soundfile as sf
 import whisper
 # from TTS.api import TTS
-
+from gtts import gTTS
+import io
 
 # Load environment variables from .env if present
 load_dotenv()

@@ -592,7 +593,6 @@ def build_merlin_graph():
 
 # --- Load models (smallest variants for speed) ---
 whisper_model = whisper.load_model("base")
-#tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=torch.cuda.is_available())
 
 def transcribe_audio(audio):
     """

@@ -608,16 +608,25 @@ def transcribe_audio(audio):
 
 def synthesize_speech(text):
     """
-    Synthesize speech from text using
+    Synthesize speech from text using gTTS.
     Returns a (sample_rate, numpy array) tuple.
     """
-
-
-
-
-
-
-    #
+    if not text:
+        return None
+    tts = gTTS(text)
+    buf = io.BytesIO()
+    tts.write_to_fp(buf)
+    buf.seek(0)
+    # Read mp3 from buffer and convert to numpy array (mono, 22050Hz)
+    import tempfile
+    import numpy as np
+    import soundfile as sf
+    import librosa
+    with tempfile.NamedTemporaryFile(suffix=".mp3") as tmp:
+        tmp.write(buf.read())
+        tmp.flush()
+        wav, sr = librosa.load(tmp.name, sr=22050, mono=True)
+    return (sr, wav.astype(np.float32))
 
 def get_task_dropdown_choices():
     """

@@ -755,7 +764,7 @@ with gr.Blocks(title="🧙 Merlin AI Coach") as demo:
         checklist_str = show_checklist()
         chat_history = chat_history + [[user_message, assistant_display]]
         # Synthesize assistant reply to audio only if TTS is enabled
-        audio_reply = synthesize_speech(assistant_display) if
+        audio_reply = synthesize_speech(assistant_display) if tts_enabled else None
         # Always keep conversation group visible
         return chat_history, notes_str, checklist_str, "", tasks_str, state_plan_val, gr.update(visible=False), audio_reply, gr.update(visible=True)
 
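For reference, the (sample_rate, float32 array) tuple returned by the new synthesize_speech is the numpy format Gradio's Audio component renders directly, which is why the handler above can pass audio_reply straight to the UI. Below is a minimal standalone sketch of that wiring, assuming synthesize_speech is defined exactly as in the diff; the component names and layout are illustrative, not taken from the app:

import gradio as gr

# Hypothetical demo wiring (not part of the commit).
# Assumes synthesize_speech() from the diff above is in scope.
with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Text to speak")
    # gr.Audio accepts a (sample_rate, numpy array) tuple as output
    audio_out = gr.Audio(label="Spoken reply")
    gr.Button("Speak").click(fn=synthesize_speech, inputs=text_in, outputs=audio_out)

demo.launch()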
requirements.txt CHANGED

@@ -29,10 +29,11 @@ soundfile==0.13.1
 openai-whisper
 
 # For text-to-speech (TTS)
-
+gTTS==2.5.1
 
 # For audio processing
 torch==2.7.1
 numpy==1.26.4
+librosa==0.10.1
 
 # llama-index-llms-openllm==0.4.1
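The two new pins correspond directly to the function above: gTTS generates the mp3 and librosa decodes it to a numpy array. After installing from the updated requirements.txt, a quick check that both imports resolve (a suggestion, not part of the commit):

pip install gTTS==2.5.1 librosa==0.10.1
python -c "from gtts import gTTS; import librosa; print(librosa.__version__)"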