# web/app.py
# Gradio-based web UI for VocalPrint AI (Refactored to use shared CLI logic)

import os
import sys
import tempfile

import gradio as gr
import whisper

# Make sure the project root is on sys.path so core.processor can be imported
# when this file is run directly (e.g. `python web/app.py`)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from core.processor import (
    download_video,
    extract_audio,
    transcribe,
    classify_accent,
    compute_fluency
)

# Load the Whisper model once at import time so every request reuses it.
# "small" is a reasonable accuracy/memory trade-off; "base" or "medium"
# can be substituted depending on available resources.
whisper_model = whisper.load_model("small")

def process_video(url):
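    """Run the full VocalPrint pipeline on a public video URL.

    Returns a tuple matching the Gradio outputs: detected accent, confidence,
    fluency score, detected language, a transcript sample, and the top-3
    accent table. On failure, an error row is returned instead of raising.
    """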
    try:
        temp_dir = tempfile.mkdtemp()
        audio_path = os.path.join(temp_dir, "audio.wav")

        # Download the video, then locate the resulting .mp4 in the temp dir
        download_video(url, temp_dir)
        video_file = next((f for f in os.listdir(temp_dir) if f.endswith(".mp4")), None)
        if not video_file:
            raise FileNotFoundError("No .mp4 file found in download directory")

        # Run the shared CLI pipeline: audio extraction, transcription,
        # accent classification, and fluency scoring
        extract_audio(os.path.join(temp_dir, video_file), audio_path)
        transcript, segments, language = transcribe(audio_path, whisper_model)
        top_accent, confidence, top3 = classify_accent(audio_path)
        fluency = compute_fluency(segments)

        # Format the top3 for the dataframe display
        top3_formatted = [[item["accent"], f"{item['confidence']}%"] for item in top3]

        return (
            top_accent,
            f"{confidence}%",
            fluency,
            language,
            transcript[:500],
            top3_formatted
        )
    except Exception as e:
        return ("Error", "-", "-", "-", str(e), [])

iface = gr.Interface(
    fn=process_video,
    inputs=gr.Textbox(label="Public Video URL (YouTube, Loom, MP4)", placeholder="https://..."),
    outputs=[
        gr.Textbox(label="Detected Accent"),
        gr.Textbox(label="Confidence (%)"),
        gr.Textbox(label="Fluency Score (0–100)"),
        gr.Textbox(label="Language Detected by Whisper"),
        gr.Textbox(label="Transcript Sample (first 500 chars)"),
        gr.Dataframe(headers=["Accent", "Confidence"], label="Top 3 Accent Predictions")
    ],
    title="VocalPrint AI",
    description="Analyze English speech from a public video link to detect accent, fluency, and transcription.",
    allow_flagging="never",
    theme="default"
)

if __name__ == "__main__":
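    # Optional: to expose the app beyond localhost, standard Gradio launch
    # options can be passed here, e.g.
    #   iface.launch(server_name="0.0.0.0", server_port=7860, share=True)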
    iface.launch()