Spaces:
Sleeping
Sleeping
import json
import logging
import os
import tempfile
from datetime import datetime, timezone
from typing import Any, Dict, Optional, Tuple

import whisper
from gtts import gTTS
from pydub.utils import mediainfo
# === Load Whisper model once (at import time) ===
# Choose: "tiny", "base", "small", "medium", "large"
try:
    model = whisper.load_model("base")
except Exception as e:
    # Chain the original exception so the underlying cause stays visible
    # in the traceback instead of being reduced to its message.
    raise RuntimeError(f"❌ Failed to load Whisper model: {e}") from e

# JSONL file in the system temp directory that log_audio_interaction
# appends one record per line to.
AUDIO_LOG_FILE = os.path.join(tempfile.gettempdir(), "audio_transcription_log.jsonl")
def transcribe_audio(
    file_path: str, language: Optional[str] = None
) -> Tuple[str, Optional[str], float]:
    """
    Transcribe an audio file with the module-level Whisper model.

    Args:
        file_path: Path of the audio file to transcribe.
        language: Language code forwarded to ``model.transcribe``. ``None`` is
            forwarded as-is (per Whisper's documentation this lets the model
            detect the language itself).

    Returns:
        ``(text, detected_language, duration_sec)``.
        If transcription fails, the function does not raise; it returns
        ``("Error during transcription: <reason>", None, 0.0)``.
        ``duration_sec`` is ``0.0`` when the duration could not be determined.
    """
    # The duration is auxiliary metadata. Probe it separately so that a
    # missing/unparseable duration (or a failing probe) does not throw away
    # a transcription that would otherwise succeed.
    try:
        duration = float(mediainfo(file_path).get("duration", 0))
    except Exception:
        duration = 0.0

    try:
        result = model.transcribe(file_path, language=language)
        text = result.get("text", "").strip()
        # Fall back to the caller-supplied language if the result has none.
        detected_lang = result.get("language", language)
    except Exception as e:
        # Callers rely on this sentinel tuple rather than on an exception.
        return f"Error during transcription: {e}", None, 0.0
    return text, detected_lang, duration
def text_to_speech(text: str, lang: str = "en") -> str:
    """
    Convert text to an MP3 file using gTTS and return the file path.

    The file is created in the system temp directory with a name of the form
    ``tts_<UTC timestamp>_<random>.mp3``. The random component guarantees that
    two calls within the same second do not overwrite each other's output.

    Args:
        text: Text to synthesise.
        lang: Language code passed to gTTS.

    Returns:
        Path of the generated MP3 file. The caller owns the file and is
        responsible for deleting it.

    Raises:
        RuntimeError: If the file cannot be created or synthesis/saving fails.
            The original exception is attached as ``__cause__``.
    """
    output_path = None
    try:
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        # mkstemp atomically creates a uniquely named file in the temp dir.
        fd, output_path = tempfile.mkstemp(prefix=f"tts_{timestamp}_", suffix=".mp3")
        # Only the path is needed; save() below is handed the path, not the fd.
        os.close(fd)
        gTTS(text=text, lang=lang).save(output_path)
    except Exception as e:
        # Do not leave an empty or partial placeholder file behind.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise RuntimeError(f"Text-to-Speech conversion failed: {e}") from e
    return output_path
def log_audio_interaction(log: Dict[str, Any]) -> None:
    """
    Append one interaction record to the JSONL file at ``AUDIO_LOG_FILE``.

    Logging is best-effort: a failure to serialise or write the record never
    propagates to the caller, but it is reported as a warning instead of being
    dropped silently.

    Args:
        log: JSON-serialisable mapping describing the interaction.
    """
    try:
        # Serialise first so a non-serialisable record does not touch the file.
        line = json.dumps(log)
        with open(AUDIO_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except Exception:
        # Deliberately broad: this is a best-effort boundary and must not
        # break the audio pipeline, but the failure should still be visible.
        logging.getLogger(__name__).warning(
            "Could not write audio interaction log to %s",
            AUDIO_LOG_FILE,
            exc_info=True,
        )
def process_audio_to_speech(file_path: str, lang: Optional[str] = None) -> Dict[str, Any]:
    """
    Full pipeline: transcribe -> build a response -> convert it to speech.

    Args:
        file_path: Path of the audio file to process.
        lang: Optional language code forwarded to ``transcribe_audio``.

    Returns:
        On success, a dict with the keys ``transcription``, ``response``,
        ``language``, ``duration_sec`` and ``tts_audio_path``.
        If transcription failed, a dict with the keys ``error``, ``language``
        and ``duration_sec``.

    Raises:
        RuntimeError: Propagated from ``text_to_speech`` when synthesis fails.
    """
    text, detected_lang, duration = transcribe_audio(file_path, language=lang)

    # transcribe_audio signals failure with the tuple
    # ("Error during transcription: ...", None, 0.0). Match that sentinel
    # precisely: checking only for a leading "Error" would misreport genuine
    # speech that happens to start with that word as a failure.
    if detected_lang is None and text.startswith("Error during transcription:"):
        return {
            "error": text,
            "language": detected_lang,
            "duration_sec": duration,
        }

    # Customize AI response
    response_text = f"You said: {text}"
    tts_path = text_to_speech(response_text, lang=detected_lang or "en")

    log_audio_interaction({
        # Naive-UTC ISO string, the same format the deprecated
        # datetime.utcnow().isoformat() produced.
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "original_text": text,
        "response_text": response_text,
        "detected_language": detected_lang,
        "duration_sec": duration,
        "tts_path": tts_path,
    })

    return {
        "transcription": text,
        "response": response_text,
        "language": detected_lang,
        "duration_sec": duration,
        "tts_audio_path": tts_path,
    }
def cleanup_audio_files(*file_paths: str) -> None:
    """
    Delete the given temp files, ignoring ones that do not exist.

    Cleanup is best-effort and never raises. Falsy entries (``None``, ``""``)
    are skipped, a missing file is not an error, and any other failure (for
    example the path is a directory or is not removable) is logged as a
    warning so leaked temp files can be noticed.

    Args:
        *file_paths: Paths of the files to delete.
    """
    for path in file_paths:
        if not path:
            continue
        try:
            # Remove directly instead of checking existence first: avoids the
            # check-then-delete race and a redundant filesystem call.
            os.remove(path)
        except FileNotFoundError:
            continue
        except Exception:
            logging.getLogger(__name__).warning(
                "Could not delete temp file %r", path, exc_info=True
            )