import streamlit as st
import tempfile
import shutil
import psutil
import torch
import torchaudio
import sys
import os
import time
from utils.audio_processing import trim_audio, download_audio_as_wav
from utils.video_processing import trim_video
from models.model_loader import load_accent_model, load_whisper, unload_model
from utils.accent_analysis import analyze_accent
from utils.session_utils import initialize_session_state, display_memory_once, reset_session_state_except_model
from speechbrain.pretrained.interfaces import foreign_class
from faster_whisper import WhisperModel
from models.custom_interface import CustomEncoderWav2vec2Classifier
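
# Note: foreign_class, WhisperModel and CustomEncoderWav2vec2Classifier are not referenced
# directly below; they appear to be imported so the model-loading helpers can resolve the
# SpeechBrain custom interface and the faster-whisper backend (an assumption, not verified here).
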
st.title("English Accent Audio Detector")
# Initialize session state
initialize_session_state()
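# initialize_session_state() is expected to create the keys read below
# (audio_path, transcription, audio_ready) so they can be checked before any input is provided.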
# Load models once when there is no transcription
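# load_whisper() presumably returns a faster_whisper.WhisperModel; the code below relies on
# its transcribe() API, roughly:
#   segments, info = model.transcribe(path, beam_size=1)  # segments is a lazy generator
#   info.language                                         # detected language code, e.g. "en"
# The model is reloaded on each rerun until a transcript exists, then unloaded after use.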
if "transcription" not in st.session_state or not st.session_state.transcription:
    st.session_state.whisper = load_whisper()
# Reset state for a new analysis
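# reset_session_state_except_model() presumably clears audio_path/transcription while keeping
# the loaded models; st.rerun() then restarts the script so the input widgets render fresh.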
if st.button("Analyze new video"):
    reset_session_state_except_model()
    st.rerun()
# Check for ffmpeg
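# ffmpeg is presumably required by the trimming/download helpers, so fail fast if it is missing.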
if not shutil.which("ffmpeg"):
    raise EnvironmentError("FFmpeg not found. Please install it or add it to your PATH.")
# Input options
option = st.radio("Choose input method:", ["Upload video file", "Enter video URL"])
if option == "Upload video file":
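    # The upload is written to a temporary .mp4 on disk; trim_video() is assumed to extract
    # (and trim) the audio track and return a WAV path that the rest of the app consumes.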
    uploaded_video = st.file_uploader("Upload your video", type=["mp4", "mov", "avi", "mkv"])
    if uploaded_video is not None:
        temp_video_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        with open(temp_video_path.name, "wb") as f:
            f.write(uploaded_video.read())
        audio_path = trim_video(temp_video_path.name)
        st.success("Video uploaded successfully.")
        st.session_state.audio_path = audio_path
elif option == "Enter video URL":
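    # download_audio_as_wav() is assumed to fetch the video's audio track as a WAV file and
    # trim_audio() to shorten it to a clip suitable for transcription and accent analysis.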
    yt_url = st.text_input("Paste YouTube URL")
    if st.button("Download Video"):
        with st.spinner("Downloading video..."):
            audio_path = download_audio_as_wav(yt_url)
            audio_path = trim_audio(audio_path)
        if audio_path:
            st.success("Video downloaded successfully.")
            st.session_state.audio_path = audio_path
# Transcription and Accent Analysis
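# Runs once per clip: transcribe with faster-whisper, use its detected language to reject
# non-English audio, and show a transcript preview before the accent model is involved.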
if st.session_state.audio_path and not st.session_state.transcription:
    if st.button("Extract Audio"):
        st.session_state.audio_ready = True
        st.audio(st.session_state.audio_path, format='audio/wav')
        time.sleep(5)
        # Detect the language and filter out non-English audio before analysis
        segments, info = st.session_state.whisper.transcribe(st.session_state.audio_path, beam_size=1)
        # Convert segments (a generator) into the full transcription string
        st.session_state.transcription = " ".join([segment.text for segment in segments])
        if info.language != "en":
            st.error("This video does not appear to be in English. Please provide a clear English video.")
        else:
            # Show the transcript preview
            with st.spinner("Transcribing audio..."):
                st.markdown("**Transcript Preview**")
                st.markdown(st.session_state.transcription)
            st.success("Audio extracted and ready for analysis!")
        unload_model(st.session_state.whisper)
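
# Load the accent classifier lazily and cache it in session state; load_accent_model()
# presumably wraps the SpeechBrain CustomEncoderWav2vec2Classifier imported above.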
if 'classifier' not in st.session_state:
    st.session_state.classifier = load_accent_model()
if st.session_state.transcription:
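    # analyze_accent() is assumed to return a human-readable accent label plus a confidence
    # score in percent (or a falsy label when the prediction cannot be made).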
    if st.button("Analyze Accent"):
        with st.spinner("Analyzing accent..."):
            try:
                waveform, sample_rate = torchaudio.load(st.session_state.audio_path)
                readable_accent, confidence = analyze_accent(waveform, sample_rate, st.session_state.classifier)
                if readable_accent:
                    st.audio(st.session_state.audio_path, format='audio/wav')
                    st.markdown("**Transcript Preview**")
                    st.markdown(st.session_state.transcription)
                    st.success(f"Accent Detected: **{readable_accent}**")
                    st.info(f"Confidence: {confidence}%")
                else:
                    st.warning("Could not determine accent.")
                unload_model(st.session_state.classifier)
            except Exception as e:
                st.error("Failed to analyze accent.")
                st.code(str(e))