Spaces:

deniskiplimo816
/

llama-models

Sleeping

File size: 3,262 Bytes

293ab16

import os
import tempfile
import whisper
from gtts import gTTS
from typing import Optional, Tuple, Dict, Any
from datetime import datetime
from pydub.utils import mediainfo
import json

# === Load Whisper model once ===
# Loaded at import time so that transcribe_audio() reuses this single instance
# on every call instead of loading the model again.
try:
    model = whisper.load_model("base")  # Choose: "tiny", "base", "small", "medium", "large"
except Exception as e:
    # Chain the original exception so the root cause stays visible in the traceback.
    raise RuntimeError(f"❌ Failed to load Whisper model: {e}") from e

# === Transcription log ===
# JSONL file in the system temp directory; log_audio_interaction() appends
# one JSON object per line to it.
AUDIO_LOG_FILE = os.path.join(tempfile.gettempdir(), "audio_transcription_log.jsonl")

def transcribe_audio(file_path: str, language: Optional[str] = None) -> Tuple[str, Optional[str], float]:
    """
    Transcribe an audio file with the module-level Whisper model.

    Args:
        file_path: Path of the audio file to transcribe.
        language: Language hint forwarded to ``model.transcribe``. ``None``
            is passed through unchanged.

    Returns:
        ``(text, detected_language, duration_sec)`` on success.
        ``duration_sec`` is ``0.0`` when the duration could not be determined.
        If transcription itself fails, the failure is reported through the
        return value as ``("Error during transcription: <reason>", None, 0.0)``
        rather than by raising.
    """
    # Duration is auxiliary metadata: a failed probe (tool unavailable,
    # missing or non-numeric "duration" field) must not throw away an
    # otherwise valid transcription, so it is handled separately and
    # degrades to 0.0.
    try:
        duration = float(mediainfo(file_path).get("duration", 0))
    except Exception:
        duration = 0.0

    try:
        result = model.transcribe(file_path, language=language)
        text = result.get("text", "").strip()
        # Fall back to the caller's hint when the result carries no language.
        detected_lang = result.get("language", language)
    except Exception as e:
        return f"Error during transcription: {e}", None, 0.0

    return text, detected_lang, duration

def text_to_speech(text: str, lang: str = "en") -> str:
    """
    Convert text to an MP3 file using gTTS and return the file path.

    Args:
        text: Text to synthesize.
        lang: Language code passed to gTTS (defaults to ``"en"``).

    Returns:
        Path of the generated MP3 inside the system temp directory. The name
        has the form ``tts_<UTC timestamp>_<random>.mp3``.

    Raises:
        RuntimeError: If gTTS construction, synthesis or saving fails. The
            original exception is attached as the cause.
    """
    try:
        # Build the gTTS object first so invalid input fails before any file
        # is created on disk.
        tts = gTTS(text=text, lang=lang)

        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        # A timestamp with one-second resolution is not unique: two calls in
        # the same second would overwrite each other's output. mkstemp
        # reserves a unique file name atomically.
        fd, output_path = tempfile.mkstemp(prefix=f"tts_{timestamp}_", suffix=".mp3")
        os.close(fd)

        try:
            tts.save(output_path)
        except Exception:
            # Do not leave an empty or partial MP3 behind on failure.
            if os.path.exists(output_path):
                os.remove(output_path)
            raise

        return output_path
    except Exception as e:
        raise RuntimeError(f"Text-to-Speech conversion failed: {e}") from e

def log_audio_interaction(log: Dict[str, Any]) -> None:
    """
    Append one audio interaction record to the JSONL log file.

    The record is serialized with ``json.dumps`` and written as a single line
    to ``AUDIO_LOG_FILE``. Logging is best-effort: a failure never propagates
    to the caller, but it is reported as a warning instead of being silently
    discarded.

    Args:
        log: JSON-serializable mapping describing the interaction.
    """
    import logging  # local import: keeps this block independent of the file's import list

    try:
        # Serialize before opening the file so a non-serializable record
        # does not touch the log file at all.
        line = json.dumps(log)
        with open(AUDIO_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except Exception as e:
        logging.getLogger(__name__).warning(
            "Could not write audio interaction log: %s", e
        )

def process_audio_to_speech(file_path: str, lang: Optional[str] = None) -> Dict[str, Any]:
    """
    Full pipeline: transcribe -> respond -> convert the response to speech.

    Args:
        file_path: Path of the audio file to transcribe.
        lang: Optional language hint forwarded to ``transcribe_audio``.

    Returns:
        On success, a dict with the keys ``transcription``, ``response``,
        ``language``, ``duration_sec`` and ``tts_audio_path``.
        If transcription failed, a dict with the keys ``error``,
        ``language`` and ``duration_sec`` instead.

    Raises:
        RuntimeError: Propagated from ``text_to_speech`` when speech
            synthesis fails.
    """
    text, detected_lang, duration = transcribe_audio(file_path, language=lang)

    # transcribe_audio() reports failure as
    # ("Error during transcription: ...", None, 0.0). Match that exact
    # sentinel: a bare startswith("Error") would also reject a genuine
    # transcription that merely begins with the word "Error".
    transcription_failed = (
        detected_lang is None
        and text.startswith("Error during transcription:")
    )
    if transcription_failed:
        return {
            "error": text,
            "language": detected_lang,
            "duration_sec": duration
        }

    # Customize AI response
    response_text = f"You said: {text}"
    # Speak in the detected language, falling back to English when none is known.
    tts_path = text_to_speech(response_text, lang=detected_lang or "en")

    log_audio_interaction({
        "timestamp": datetime.utcnow().isoformat(),
        "original_text": text,
        "response_text": response_text,
        "detected_language": detected_lang,
        "duration_sec": duration,
        "tts_path": tts_path
    })

    return {
        "transcription": text,
        "response": response_text,
        "language": detected_lang,
        "duration_sec": duration,
        "tts_audio_path": tts_path
    }

def cleanup_audio_files(*file_paths: str):
    """
    Remove each of the given files, ignoring any that are absent.

    Removal is best-effort: an error raised while checking or deleting one
    path is ignored and processing continues with the next path.
    """
    for candidate in file_paths:
        try:
            if not os.path.exists(candidate):
                continue
            os.remove(candidate)
        except Exception:
            continue