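"""Streamlit app for detecting a speaker's English accent.

Pipeline (as implemented below): accept an uploaded video file or a YouTube URL,
extract and trim the audio with the helpers in utils/ (which rely on FFmpeg being
on PATH), transcribe it with the Whisper model from models.model_loader to confirm
the speech is English, then classify the accent with the wav2vec2-based classifier
and report a confidence score. Model loading, trimming behaviour, and the accent
label mapping live in the imported helper modules, not in this file.
"""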
import streamlit as st
import tempfile
import shutil
import psutil
import torch
import torchaudio
from utils.audio_processing import trim_audio, download_audio_as_wav
from utils.video_processing import trim_video
from models.model_loader import load_accent_model, load_whisper
from utils.accent_analysis import analyze_accent
from utils.session_utils import initialize_session_state, display_memory_once, reset_session_state_except_model
from models.custom_interface import CustomEncoderWav2vec2Classifier
st.title("🎙️ English Accent Audio Detector")
# Initialize session state
initialize_session_state()
# Load models once and cache them in session state so reruns don't reload them
if 'classifier' not in st.session_state:
    st.session_state.classifier = load_accent_model()
if 'whisper' not in st.session_state:
    st.session_state.whisper = load_whisper()
# Memory info
display_memory_once()
# Reset state for a new analysis
if st.button("🔄 Analyze new video"):
    reset_session_state_except_model()
    st.rerun()
# Check for ffmpeg
if not shutil.which("ffmpeg"):
    raise EnvironmentError("FFmpeg not found. Please install or add it to PATH.")
# Input options
option = st.radio("Choose input method:", ["Upload video file", "Enter Video URL"])
if option == "Upload video file":
    uploaded_video = st.file_uploader("Upload your video", type=["mp4", "mov", "avi", "mkv"])
    if uploaded_video is not None:
        # Persist the upload to a temporary file so it can be read from disk
        temp_video_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        with open(temp_video_path.name, "wb") as f:
            f.write(uploaded_video.read())
        audio_path = trim_video(temp_video_path.name)
        st.success("✅ Video uploaded successfully.")
        st.session_state.audio_path = audio_path
elif option == "Enter Video URL":
    yt_url = st.text_input("Paste YouTube URL")
    if st.button("Download Video"):
        with st.spinner("Downloading video..."):
            audio_path = download_audio_as_wav(yt_url)
            audio_path = trim_audio(audio_path)
        if audio_path:
            st.success("✅ Video downloaded successfully.")
            st.session_state.audio_path = audio_path
# Transcription and Accent Analysis
if st.session_state.audio_path and not st.session_state.transcription:
    if st.button("🎧 Extract Audio"):
        st.session_state.audio_ready = True
        st.audio(st.session_state.audio_path, format='audio/wav')
        mem = psutil.virtual_memory()
        st.write(f"📊 Memory used: {mem.percent}%")
        # Detect the spoken language and filter out non-English audio before analysis
        with st.spinner("Transcribing audio..."):
            segments, info = st.session_state.whisper.transcribe(st.session_state.audio_path, beam_size=1)
            # Convert segments (a generator) into a single transcription string
            st.session_state.transcription = " ".join([segment.text for segment in segments])
        if info.language != "en":
            st.error("❌ This video does not appear to be in English. Please provide a clear English video.")
        else:
            # Show a transcript preview for the extracted audio
            st.markdown("Transcript Preview")
            st.markdown(st.session_state.transcription)
            st.success("🎵 Audio extracted and ready for analysis!")
            mem = psutil.virtual_memory()
            st.write(f"📊 Memory used: {mem.percent}%")
if st.session_state.transcription:
    if st.button("🗣️ Analyze Accent"):
        with st.spinner("🔍 Analyzing accent..."):
            try:
                mem = psutil.virtual_memory()
                st.write(f"📊 Memory used: {mem.percent}%")
                waveform, sample_rate = torchaudio.load(st.session_state.audio_path)
                readable_accent, confidence = analyze_accent(waveform, sample_rate, st.session_state.classifier)
                if readable_accent:
                    st.success(f"✅ Accent Detected: **{readable_accent}**")
                    st.info(f"📊 Confidence: {confidence}%")
                else:
                    st.warning("Could not determine accent.")
            except Exception as e:
                st.error("❌ Failed to analyze accent.")
                st.code(str(e))
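# To launch the app locally (assuming this file is the Streamlit entry point, e.g. app.py):
#   streamlit run app.py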