Spaces:
Sleeping
Sleeping
File size: 3,262 Bytes
293ab16 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import json
import logging
import os
import tempfile
from datetime import datetime, timezone
from typing import Any, Dict, Optional, Tuple

import whisper
from gtts import gTTS
from pydub.utils import mediainfo
# === Load Whisper model once ===
# Loaded a single time at import; transcribe_audio() reuses this instance.
try:
    model = whisper.load_model("base")  # Choose: "tiny", "base", "small", "medium", "large"
except Exception as e:
    # Chain the original exception so its traceback is not lost behind the
    # friendlier message.
    raise RuntimeError(f"❌ Failed to load Whisper model: {e}") from e

# === JSONL file that log_audio_interaction() appends one record per call to ===
AUDIO_LOG_FILE = os.path.join(tempfile.gettempdir(), "audio_transcription_log.jsonl")
def _probe_duration(file_path: str) -> float:
    """Return the duration reported by mediainfo() in seconds, or 0.0 if unusable."""
    try:
        info = mediainfo(file_path)
        return float(info.get("duration", 0))
    except (OSError, TypeError, ValueError):
        # Duration is metadata only; a probe failure or a non-numeric value
        # must not discard an otherwise usable transcript.
        return 0.0


def transcribe_audio(file_path: str, language: Optional[str] = None) -> Tuple[str, Optional[str], float]:
    """
    Transcribes audio. Returns (text, detected_language, duration_sec)

    Args:
        file_path: Path of the audio file handed to the Whisper model.
        language: Language hint forwarded to ``model.transcribe``; ``None``
            leaves the choice to the model.

    Returns:
        ``(text, detected_language, duration_sec)`` on success. ``text`` is
        stripped of surrounding whitespace, ``detected_language`` falls back
        to ``language`` when the model result carries none, and
        ``duration_sec`` is 0.0 when the duration could not be read.

        On failure no exception is raised; the tuple is
        ``("Error during transcription: <reason>", None, 0.0)``.
    """
    try:
        duration = _probe_duration(file_path)
        result = model.transcribe(file_path, language=language)
        text = result.get("text", "").strip()
        detected_lang = result.get("language", language)
        return text, detected_lang, duration
    except Exception as e:
        # Callers detect failure from this in-band message, so its wording
        # and the (None, 0.0) tail must stay stable.
        return f"Error during transcription: {e}", None, 0.0
def text_to_speech(text: str, lang: str = "en") -> str:
    """
    Converts text to MP3 using gTTS and returns the file path.

    The file is created in the system temp directory with a name of the form
    ``tts_<UTC timestamp>_<random>.mp3``. The random part guarantees that two
    calls within the same second never write to the same file.

    Args:
        text: Text to synthesize.
        lang: Language code passed to gTTS.

    Returns:
        Path of the generated MP3 file. The caller owns the file and is
        responsible for deleting it.

    Raises:
        RuntimeError: If the temp file cannot be created or gTTS fails.
    """
    output_path = None
    try:
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        # mkstemp creates the file atomically under a unique name.
        fd, output_path = tempfile.mkstemp(prefix=f"tts_{timestamp}_", suffix=".mp3")
        os.close(fd)  # gTTS reopens the path itself; only the name is needed.
        tts = gTTS(text=text, lang=lang)
        tts.save(output_path)
        return output_path
    except Exception as e:
        # Do not leave an empty or partial MP3 behind when synthesis fails.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise RuntimeError(f"Text-to-Speech conversion failed: {e}") from e
def log_audio_interaction(log: Dict[str, Any]) -> None:
    """
    Logs audio interaction to a JSONL file in temp dir.

    The record is appended as one JSON line to ``AUDIO_LOG_FILE``. Logging is
    best effort: a failure (unserializable value, unwritable file, ...) is
    reported as a warning through the ``logging`` module and never propagates
    to the caller.

    Args:
        log: JSON-serializable mapping describing the interaction.
    """
    try:
        # Serialize first so a bad record never opens or touches the file.
        line = json.dumps(log) + "\n"
        with open(AUDIO_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line)
    except Exception as e:
        logging.getLogger(__name__).warning("Could not write audio log entry: %s", e)
def process_audio_to_speech(file_path: str, lang: Optional[str] = None) -> Dict[str, Any]:
    """
    Full pipeline: transcribe -> respond -> convert to TTS.
    Returns dict with text, detected language, tts_audio_path, duration.

    Args:
        file_path: Path of the audio file to transcribe.
        lang: Optional language hint passed to ``transcribe_audio``.

    Returns:
        On success a dict with keys ``transcription``, ``response``,
        ``language``, ``duration_sec`` and ``tts_audio_path``.
        When transcription fails, a dict with keys ``error``, ``language``
        and ``duration_sec`` instead.

    Raises:
        RuntimeError: Propagated from ``text_to_speech`` when speech
            synthesis fails.
    """
    text, detected_lang, duration = transcribe_audio(file_path, language=lang)

    # transcribe_audio() reports failure in-band as
    # ("Error during transcription: ...", None, 0.0). Match that exact shape
    # so speech that merely begins with the word "Error" is still processed.
    if detected_lang is None and text.startswith("Error during transcription:"):
        return {
            "error": text,
            "language": detected_lang,
            "duration_sec": duration,
        }

    # Customize AI response
    response_text = f"You said: {text}"
    tts_path = text_to_speech(response_text, lang=detected_lang or "en")

    log_audio_interaction({
        # Naive-UTC ISO format, identical to what datetime.utcnow() produced,
        # so new log lines match existing ones.
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "original_text": text,
        "response_text": response_text,
        "detected_language": detected_lang,
        "duration_sec": duration,
        "tts_path": tts_path,
    })

    return {
        "transcription": text,
        "response": response_text,
        "language": detected_lang,
        "duration_sec": duration,
        "tts_audio_path": tts_path,
    }
def cleanup_audio_files(*file_paths: str) -> None:
    """
    Deletes temp files if they exist.

    Best effort: empty/``None`` entries and files that are already gone are
    skipped silently; any other failure (permissions, path is a directory,
    invalid path value) is reported as a warning through the ``logging``
    module and the remaining paths are still processed. Never raises.

    Args:
        *file_paths: Paths of the files to delete.
    """
    logger = logging.getLogger(__name__)
    for path in file_paths:
        if not path:
            continue
        try:
            # Remove directly instead of checking exists() first, which would
            # race with another process deleting the same file.
            os.remove(path)
        except FileNotFoundError:
            continue
        except (OSError, TypeError, ValueError) as e:
            logger.warning("Could not delete temp file %r: %s", path, e)
|