Spaces:
Sleeping
Sleeping
File size: 2,320 Bytes
39ec667 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# web/app.py
# Gradio-based web UI for VocalPrint AI (Refactored to use shared CLI logic)
import gradio as gr
import os
import tempfile
import whisper
import torch
import json
import sys
# Append the parent of this file's directory to sys.path so the `core` package
# imported below can be resolved when this file is run directly as a script
# (assumes `core/` lives one level above this file — TODO confirm repo layout).
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from core.processor import (
download_video,
extract_audio,
transcribe,
classify_accent,
compute_fluency
)
# Load the "small" Whisper model once at import time; process_video() hands
# this shared instance to transcribe() on every call instead of reloading it.
whisper_model = whisper.load_model("small")
def process_video(url):
    """Run the accent / fluency analysis pipeline for a single video URL.

    The video is downloaded into a scratch directory, its audio is extracted
    to ``audio.wav``, and that audio is transcribed, accent-classified and
    scored for fluency. The scratch directory is always removed before the
    function returns, whether the pipeline succeeded or failed.

    Args:
        url: Public video URL typed into the UI.

    Returns:
        A 6-tuple in the order expected by the Gradio outputs:
        ``(top_accent, "<confidence>%", fluency, language,
        transcript[:500], top3_rows)`` where ``top3_rows`` is a list of
        ``[accent, "<confidence>%"]`` rows. On any failure the tuple is
        ``("Error", "-", "-", "-", <error message>, [])`` so the UI can
        show the problem instead of crashing.
    """
    # Function-scope import: shutil is only needed here for scratch cleanup.
    import shutil

    # Reject a blank submission up front with a readable message rather than
    # letting the downloader fail with whatever error it happens to raise.
    if not isinstance(url, str) or not url.strip():
        return ("Error", "-", "-", "-", "Please provide a video URL.", [])

    temp_dir = None
    try:
        temp_dir = tempfile.mkdtemp()
        audio_path = os.path.join(temp_dir, "audio.wav")

        download_video(url, temp_dir)
        # The output file name is not known here, so pick up the first .mp4
        # that appeared in the scratch directory.
        video_file = next(
            (f for f in os.listdir(temp_dir) if f.endswith(".mp4")), None
        )
        if not video_file:
            raise FileNotFoundError("No .mp4 file found")

        extract_audio(os.path.join(temp_dir, video_file), audio_path)
        transcript, segments, language = transcribe(audio_path, whisper_model)
        top_accent, confidence, top3 = classify_accent(audio_path)
        fluency = compute_fluency(segments)

        # Format the top3 for the dataframe display
        top3_formatted = [
            [item["accent"], f"{item['confidence']}%"] for item in top3
        ]
        return (
            top_accent,
            f"{confidence}%",
            fluency,
            language,
            transcript[:500],
            top3_formatted,
        )
    except Exception as e:
        # UI boundary: surface the failure in the output fields.
        return ("Error", "-", "-", "-", str(e), [])
    finally:
        # mkdtemp() never deletes what it creates; without this every request
        # would leave a downloaded video and WAV file behind. Cleanup is
        # best-effort so it can never mask the real result or error.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
# Gradio UI definition: one URL textbox in, five text fields plus a table out.
# The output components are listed in the same order as the tuple returned by
# process_video().
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Textbox(
        label="Public Video URL (YouTube, Loom, MP4)",
        placeholder="https://...",
    ),
    outputs=[
        # The five plain-text result fields differ only by their label.
        *[
            gr.Textbox(label=caption)
            for caption in (
                "Detected Accent",
                "Confidence (%)",
                "Fluency Score (0–100)",
                "Language Detected by Whisper",
                "Transcript Sample (first 500 chars)",
            )
        ],
        gr.Dataframe(
            headers=["Accent", "Confidence"],
            label="Top 3 Accent Predictions",
        ),
    ],
    title="VocalPrint AI",
    description="Analyze English speech from a public video link to detect accent, fluency, and transcription.",
    allow_flagging="never",
    theme="default",
)

# Launch the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|