# llama-models/app/audio_tool.py
import os
import tempfile
import whisper
from gtts import gTTS
from typing import Optional, Tuple, Dict, Any
from datetime import datetime
from pydub.utils import mediainfo
import json
# === Load Whisper model once ===
try:
    model = whisper.load_model("base")  # Choose: "tiny", "base", "small", "medium", "large"
except Exception as e:
    raise RuntimeError(f"❌ Failed to load Whisper model: {e}")
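
# Note: whisper.load_model() downloads the checkpoint on first use and runs on a
# GPU automatically when PyTorch detects one; otherwise it falls back to CPU.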
# === Log file for all transcriptions (JSONL, written to the temp dir) ===
AUDIO_LOG_FILE = os.path.join(tempfile.gettempdir(), "audio_transcription_log.jsonl")

def transcribe_audio(file_path: str, language: Optional[str] = None) -> Tuple[str, Optional[str], float]:
    """
    Transcribes audio. Returns (text, detected_language, duration_sec).
    """
    try:
        info = mediainfo(file_path)
        duration = float(info.get("duration", 0))
        result = model.transcribe(file_path, language=language)
        text = result.get("text", "").strip()
        detected_lang = result.get("language", language)
        return text, detected_lang, duration
    except Exception as e:
        return f"Error during transcription: {e}", None, 0.0

def text_to_speech(text: str, lang: str = "en") -> str:
    """
    Converts text to MP3 using gTTS and returns the file path.
    """
    try:
        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        filename = f"tts_{timestamp}.mp3"
        output_path = os.path.join(tempfile.gettempdir(), filename)
        tts = gTTS(text=text, lang=lang)
        tts.save(output_path)
        return output_path
    except Exception as e:
        raise RuntimeError(f"Text-to-Speech conversion failed: {e}")
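
# Note: gTTS calls Google's online text-to-speech service, so text_to_speech()
# needs network access and raises a RuntimeError when the machine is offline.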

def log_audio_interaction(log: Dict[str, Any]) -> None:
    """
    Appends an audio interaction record to the JSONL log file in the temp dir.
    """
    try:
        with open(AUDIO_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(json.dumps(log) + "\n")
    except Exception:
        pass  # Logging is best-effort; never break the pipeline over it.
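
# Each log line is a single JSON object; an illustrative record looks like:
# {"timestamp": "...", "original_text": "...", "response_text": "...",
#  "detected_language": "en", "duration_sec": 3.2, "tts_path": "/tmp/tts_....mp3"}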

def process_audio_to_speech(file_path: str, lang: Optional[str] = None) -> Dict[str, Any]:
    """
    Full pipeline: transcribe -> build a reply -> convert the reply to speech.
    Returns a dict with transcription, response, language, duration, and TTS audio path.
    """
    text, detected_lang, duration = transcribe_audio(file_path, language=lang)
    if text.startswith("Error"):
        return {
            "error": text,
            "language": detected_lang,
            "duration_sec": duration
        }
    # Customize the AI response here (currently just echoes the transcription).
    response_text = f"You said: {text}"
    tts_path = text_to_speech(response_text, lang=detected_lang or "en")
    log_audio_interaction({
        "timestamp": datetime.utcnow().isoformat(),
        "original_text": text,
        "response_text": response_text,
        "detected_language": detected_lang,
        "duration_sec": duration,
        "tts_path": tts_path
    })
    return {
        "transcription": text,
        "response": response_text,
        "language": detected_lang,
        "duration_sec": duration,
        "tts_audio_path": tts_path
    }

def cleanup_audio_files(*file_paths: str):
    """
    Deletes temp files if they exist.
    """
    for path in file_paths:
        try:
            if os.path.exists(path):
                os.remove(path)
        except Exception:
            pass
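

if __name__ == "__main__":
    # Minimal usage sketch; "sample.wav" is a placeholder path, point it at a real recording.
    result = process_audio_to_speech("sample.wav")
    print(json.dumps(result, indent=2))
    # Delete the generated MP3 once it is no longer needed.
    if "tts_audio_path" in result:
        cleanup_audio_files(result["tts_audio_path"])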