import streamlit as st
import tempfile
import shutil
import psutil
import torch
import torchaudio
import sys
import os

from utils.audio_processing import trim_audio, download_audio_as_wav
from utils.video_processing import trim_video
from models.model_loader import load_accent_model, load_whisper, unload_model
from utils.accent_analysis import analyze_accent
from utils.session_utils import initialize_session_state, display_memory_once, reset_session_state_except_model
from speechbrain.pretrained.interfaces import foreign_class
from faster_whisper import WhisperModel
from models.custom_interface import CustomEncoderWav2vec2Classifier
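# Note: foreign_class, WhisperModel and CustomEncoderWav2vec2Classifier are not referenced
# directly in this file; they are assumed to be imported so the model_loader helpers can
# resolve these classes when loading the Whisper and accent-classification models.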

st.title("English Accent Audio Detector")

# Initialize session state
initialize_session_state()
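# initialize_session_state is assumed to create the keys read below
# (audio_path, transcription, audio_ready) so later checks never hit a missing key.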

# Load the Whisper model once, while no transcription exists yet
if 'whisper' not in st.session_state and not st.session_state.transcription:
    with st.spinner("Loading Whisper model..."):
        st.session_state.whisper = load_whisper()

# Memory info
display_memory_once()

# Reset state for a new analysis
if st.button("Analyze new video"):
    reset_session_state_except_model()
    st.rerun()

# Check for ffmpeg
if not shutil.which("ffmpeg"):
    st.error("FFmpeg not found. Please install it or add it to PATH.")
    st.stop()

# Input options
option = st.radio("Choose input method:", ["Upload video file", "Enter video URL"])

if option == "Upload video file":
    uploaded_video = st.file_uploader("Upload your video", type=["mp4", "mov", "avi", "mkv"])
    if uploaded_video is not None:
        # Persist the upload to a temporary file so it can be processed from disk
        temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        with open(temp_video.name, "wb") as f:
            f.write(uploaded_video.read())
        audio_path = trim_video(temp_video.name)
        st.success("Video uploaded successfully.")
        st.session_state.audio_path = audio_path
        

elif option == "Enter video URL":
    yt_url = st.text_input("Paste YouTube URL")
    if st.button("Download Video"):
        with st.spinner("Downloading video..."):
            audio_path = download_audio_as_wav(yt_url)
            audio_path = trim_audio(audio_path)
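        # download_audio_as_wav / trim_audio are assumed to return a WAV file path,
        # or a falsy value if the download or trimming fails.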
        if audio_path:
            st.success("Video downloaded successfully.")
            st.session_state.audio_path = audio_path
            

# Transcription and Accent Analysis
if st.session_state.audio_path and not st.session_state.transcription:
    if st.button("Extract Audio"):
        st.session_state.audio_ready = True
        st.audio(st.session_state.audio_path, format='audio/wav')
        
        mem = psutil.virtual_memory()
        st.write(f"Memory used: {mem.percent}%")    
        #Detect Language AND FILTER OUT NON-ENGLISH AUDIOS FOR ANALYSIS
        segments, info = st.session_state.whisper.transcribe(st.session_state.audio_path, beam_size=5)
            
        # Convert segments (generator) to full transcription string
        st.session_state.transcription = " ".join([segment.text for segment in segments])
                  
        if info.language != "en":
                    
            st.error("This video does not appear to be in English. Please provide a clear English video.")
        else:    
            # Show transcription for audio
            with st.spinner("Transcribing audio..."):
                st.markdown(" Transcript Preview")
                st.markdown(st.session_state.transcription)
                st.success("Audio extracted and ready for analysis!")
                unload_model(st.session_state.whisper)
            
                if 'classifier' not in st.session_state:
                    st.session_state.classifier = load_accent_model()
                mem = psutil.virtual_memory()
                st.write(f"Memory used: {mem.percent}%")

       

if st.session_state.transcription:
    if st.button("Analyze Accent"):
        with st.spinner("Analyzing accent..."):
            try:
                mem = psutil.virtual_memory()
                st.write(f"Memory used: {mem.percent}%")
                waveform, sample_rate = torchaudio.load(st.session_state.audio_path)
                readable_accent, confidence = analyze_accent(waveform, sample_rate, st.session_state.classifier)
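                # analyze_accent is assumed to return a human-readable accent label plus a
                # confidence percentage, with an empty label when the accent can't be determined.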

                if readable_accent:
                    st.success(f"Accent Detected: **{readable_accent}**")
                    st.info(f"Confidence: {confidence}%")
                   
                else:
                    st.warning("Could not determine accent.")
                    
                unload_model(st.session_state.classifier)        
            except Exception as e:
                st.error("Failed to analyze accent.")
                st.code(str(e))