File size: 3,262 Bytes
293ab16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import json
import logging
import os
import tempfile
from datetime import datetime, timezone
from typing import Optional, Tuple, Dict, Any

import whisper
from gtts import gTTS
from pydub.utils import mediainfo

# === Load Whisper model once ===
# Loaded at import time so every transcribe_audio() call reuses the same model.
try:
    model = whisper.load_model("base")  # Choose: "tiny", "base", "small", "medium", "large"
except Exception as e:
    # Chain the original exception so the real cause stays visible in the traceback.
    raise RuntimeError(f"❌ Failed to load Whisper model: {e}") from e

# === Transcription log: JSONL file in the system temp dir, appended to by log_audio_interaction() ===
AUDIO_LOG_FILE = os.path.join(tempfile.gettempdir(), "audio_transcription_log.jsonl")

def transcribe_audio(file_path: str, language: Optional[str] = None) -> Tuple[str, Optional[str], float]:
    """
    Transcribe an audio file with the module-level Whisper model.

    Args:
        file_path: Path of the audio file to transcribe.
        language: Optional language code passed to Whisper; ``None`` leaves
            the choice to the model.

    Returns:
        ``(text, detected_language, duration_sec)``.

        This function never raises. On failure it returns
        ``("Error during transcription: <reason>", None, 0.0)``; callers
        detect failure through that sentinel.
    """
    try:
        # Duration is auxiliary metadata: a failed probe or an unparsable
        # value (e.g. "N/A") must not abort the transcription itself.
        try:
            duration = float(mediainfo(file_path).get("duration", 0))
        except (OSError, TypeError, ValueError):
            duration = 0.0

        result = model.transcribe(file_path, language=language)
        text = result.get("text", "").strip()
        # Fall back to the caller-supplied language if the result has none.
        detected_lang = result.get("language", language)

        return text, detected_lang, duration
    except Exception as e:
        # Errors are reported in-band (see "Returns") to keep the existing contract.
        return f"Error during transcription: {e}", None, 0.0

def text_to_speech(text: str, lang: str = "en") -> str:
    """
    Convert text to an MP3 file using gTTS and return the file path.

    The file is created in the system temp directory with a name of the form
    ``tts_<UTC timestamp>_<random>.mp3``. The random part keeps two calls made
    within the same second from overwriting each other's output.

    Args:
        text: Text to synthesize.
        lang: Language code passed to gTTS.

    Returns:
        Path of the generated MP3 file. The caller is responsible for
        deleting it.

    Raises:
        RuntimeError: If synthesis or saving fails; the original exception is
            chained as ``__cause__``.
    """
    output_path = None
    try:
        # Build the gTTS object first so a failure here leaves no file behind.
        tts = gTTS(text=text, lang=lang)

        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        # mkstemp creates the file atomically under a unique name.
        fd, output_path = tempfile.mkstemp(prefix=f"tts_{timestamp}_", suffix=".mp3")
        os.close(fd)  # gTTS reopens the file by path.

        tts.save(output_path)

        return output_path
    except Exception as e:
        if output_path is not None:
            # Best effort: do not leave an empty or partial MP3 behind.
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise RuntimeError(f"Text-to-Speech conversion failed: {e}") from e

def log_audio_interaction(log: Dict[str, Any]) -> None:
    """
    Append one audio interaction record to the JSONL log file.

    Logging is best-effort: a failure to serialize or write the record is
    reported as a warning through the ``logging`` module and never raised
    to the caller.

    Args:
        log: Record to write as a single JSON line to ``AUDIO_LOG_FILE``.
    """
    try:
        # Serialize before opening so a bad record does not touch the file.
        record = json.dumps(log)
        with open(AUDIO_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(record + "\n")
    except Exception:
        # Keep the pipeline running, but leave a trace instead of failing silently.
        logging.getLogger(__name__).warning(
            "Failed to write audio interaction log entry", exc_info=True
        )

def process_audio_to_speech(file_path: str, lang: Optional[str] = None) -> Dict[str, Any]:
    """
    Full pipeline: transcribe -> respond -> convert to TTS.

    Args:
        file_path: Path of the audio file to transcribe.
        lang: Optional language code forwarded to ``transcribe_audio``.

    Returns:
        On success, a dict with ``transcription``, ``response``, ``language``,
        ``duration_sec`` and ``tts_audio_path``.
        On transcription failure, a dict with ``error``, ``language`` and
        ``duration_sec``.

    Raises:
        RuntimeError: Propagated from ``text_to_speech`` when synthesis fails.
    """
    text, detected_lang, duration = transcribe_audio(file_path, language=lang)

    # transcribe_audio reports failure as
    # ("Error during transcription: ...", None, 0.0). Match that exact sentinel:
    # a bare startswith("Error") would also reject genuine speech that merely
    # begins with the word "Error".
    if detected_lang is None and text.startswith("Error during transcription: "):
        return {
            "error": text,
            "language": detected_lang,
            "duration_sec": duration
        }

    # Customize AI response
    response_text = f"You said: {text}"
    tts_path = text_to_speech(response_text, lang=detected_lang or "en")

    log_audio_interaction({
        # Naive-UTC ISO string, same format as earlier log records, produced
        # without the deprecated datetime.utcnow().
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "original_text": text,
        "response_text": response_text,
        "detected_language": detected_lang,
        "duration_sec": duration,
        "tts_path": tts_path
    })

    return {
        "transcription": text,
        "response": response_text,
        "language": detected_lang,
        "duration_sec": duration,
        "tts_audio_path": tts_path
    }

def cleanup_audio_files(*file_paths: str) -> None:
    """
    Delete the given temp files, ignoring those that do not exist.

    Cleanup is best-effort: this function never raises. ``None`` or empty
    paths are skipped, a missing file is not an error, and any other failure
    is reported as a warning through the ``logging`` module.

    Args:
        *file_paths: Paths of the files to delete.
    """
    for path in file_paths:
        if not path:
            # Nothing to delete (e.g. a pipeline result without a TTS path).
            continue
        try:
            # Remove directly rather than checking existence first, which
            # would leave a window between the check and the delete.
            os.remove(path)
        except FileNotFoundError:
            continue
        except Exception:
            logging.getLogger(__name__).warning(
                "Could not delete temp file %r", path, exc_info=True
            )