Spaces:

TaahKay
/

English_accent_detection

Running

App Files Files Community

TaahKay committed 29 days ago

Commit

4a9476d

verified ·

1 Parent(s): 66b3183

Upload 13 files

Browse files

Files changed (13) hide show

.streamlit/config.toml +3 -0
LICENSE +21 -0
app.py +109 -0
models/__init__.py +1 -0
models/custom_interface.py +146 -0
models/model_loader.py +28 -0
packages.txt +1 -0
requirements.txt +29 -3
utils/__init__.py +1 -0
utils/accent_analysis.py +51 -0
utils/audio_processing.py +109 -0
utils/session_utils.py +30 -0
utils/video_processing.py +47 -0

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,3 @@

+# Streamlit server settings for this app.
+[server]
+# Disable the source-file watcher.
+fileWatcherType = "none"
+# Upload size cap; presumably in megabytes, mirroring the 70 MB default of
+# download_audio_as_wav in utils/audio_processing.py — confirm against the Streamlit docs.
+maxUploadSize = 70

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Ryan Kembo
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import streamlit as st
+import tempfile
+import shutil
+import psutil
+import torch
+import torchaudio
+from utils.audio_processing import trim_audio, download_audio_as_wav
+from utils.video_processing import trim_video
+from models.model_loader import load_accent_model, load_whisper
+from utils.accent_analysis import analyze_accent
+from utils.session_utils import initialize_session_state, display_memory_once, reset_session_state_except_model
+from models.custom_interface import CustomEncoderWav2vec2Classifier
+# Streamlit entry point: obtain a short audio clip (uploaded video or URL),
+# transcribe it with Whisper to confirm it is English, then classify the accent.
+st.title("🎙️ English Accent Audio Detector")
+# Initialize session state
+initialize_session_state()
+# Load models once and keep them in the session
+if 'classifier' not in st.session_state:
+    st.session_state.classifier = load_accent_model()
+if 'whisper' not in st.session_state:
+    st.session_state.whisper = load_whisper()
+# Memory info
+display_memory_once()
+# Reset state for a new analysis
+if st.button("🔄 Analyze new video"):
+    reset_session_state_except_model()
+    st.rerun()
+# Check for ffmpeg
+if not shutil.which("ffmpeg"):
+    raise EnvironmentError("FFmpeg not found. Please install or add it to PATH.")
+# Input options
+option = st.radio("Choose input method:", ["Upload video file", "Enter Video Url"])
+if option == "Upload video file":
+    uploaded_video = st.file_uploader("Upload your video", type=["mp4", "mov", "avi", "mkv"])
+    if uploaded_video is not None:
+        # Write through the handle that created the temp file so it is closed
+        # (and flushed) before trim_video hands the path to ffmpeg.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
+            temp_video.write(uploaded_video.read())
+        audio_path = trim_video(temp_video.name)
+        # trim_video returns None on failure and reports the error itself,
+        # so only announce success when a path actually came back.
+        if audio_path:
+            st.success("✅ Video uploaded successfully.")
+        st.session_state.audio_path = audio_path
+elif option == "Enter Video Url":
+    yt_url = st.text_input("Paste YouTube URL")
+    if st.button("Download Video"):
+        with st.spinner("Downloading video..."):
+            audio_path = download_audio_as_wav(yt_url)
+            # A failed download yields None; trimming it would only stack a
+            # second, misleading error on top of the first one.
+            if audio_path:
+                audio_path = trim_audio(audio_path)
+        if audio_path:
+            st.success("✅ Video downloaded successfully.")
+            st.session_state.audio_path = audio_path
+# Transcription and Accent Analysis
+if st.session_state.audio_path and not st.session_state.transcription:
+    if st.button("🎧 Extract Audio"):
+        st.session_state.audio_ready = True
+        st.audio(st.session_state.audio_path, format='audio/wav')
+        mem = psutil.virtual_memory()
+        st.write(f"🔍 Memory used: {mem.percent}%")
+        # Detect the language and filter out non-English audio before analysis.
+        # The spinner covers the actual transcription work, not just the display.
+        transcript = ""
+        with st.spinner("Transcribing audio..."):
+            segments, info = st.session_state.whisper.transcribe(st.session_state.audio_path, beam_size=1)
+            is_english = info.language == "en"
+            if is_english:
+                # segments is a generator; joining it produces the full transcription string
+                transcript = " ".join(segment.text for segment in segments)
+        if not is_english:
+            # Do not store a transcript here: a stored transcript is what
+            # unlocks the accent-analysis step below.
+            st.error("❌ This video does not appear to be in English. Please provide a clear English video.")
+        elif not transcript.strip():
+            st.warning("No speech could be transcribed from this audio.")
+        else:
+            st.session_state.transcription = transcript
+            # Show transcription for audio
+            st.markdown(" Transcript Preview")
+            st.markdown(st.session_state.transcription)
+            st.success("🎵 Audio extracted and ready for analysis!")
+            mem = psutil.virtual_memory()
+            st.write(f"🔍 Memory used: {mem.percent}%")
+if st.session_state.transcription:
+    if st.button("🗣️ Analyze Accent"):
+        with st.spinner("🔍 Analyzing accent..."):
+            try:
+                mem = psutil.virtual_memory()
+                st.write(f"🔍 Memory used: {mem.percent}%")
+                waveform, sample_rate = torchaudio.load(st.session_state.audio_path)
+                readable_accent, confidence = analyze_accent(waveform, sample_rate, st.session_state.classifier)
+                if readable_accent:
+                    st.success(f"✅ Accent Detected: **{readable_accent}**")
+                    st.info(f"📊 Confidence: {confidence}%")
+                else:
+                    st.warning("Could not determine accent.")
+            except Exception as e:
+                # Top-level UI boundary: surface the failure instead of crashing the page
+                st.error("❌ Failed to analyze accent.")
+                st.code(str(e))

models/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

models/custom_interface.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import torch
+from speechbrain.pretrained import Pretrained
+class CustomEncoderWav2vec2Classifier(Pretrained):
+    """A ready-to-use class for utterance-level classification (e.g., speaker-id,
+    language-id, emotion recognition, keyword spotting, etc.).
+    The class assumes that a self-supervised encoder like wav2vec2/hubert and a
+    classifier model are defined in the yaml file. The methods below use the
+    modules ``wav2vec2``, ``avg_pool`` and ``output_mlp`` and the hparams
+    ``softmax`` and ``label_encoder``.
+    The class can be used either to run only the encoder (encode_batch()) to
+    extract embeddings or to run a classification step (classify_batch()).
+
+    Example
+    -------
+    The example uses speechbrain's ``EncoderClassifier``.
+    NOTE(review): presumably carried over from the upstream interface; confirm
+    it applies unchanged to this class.
+
+    >>> import torchaudio
+    >>> from speechbrain.pretrained import EncoderClassifier
+    >>> # Model is downloaded from the speechbrain HuggingFace repo
+    >>> tmpdir = getfixture("tmpdir")
+    >>> classifier = EncoderClassifier.from_hparams(
+    ...     source="speechbrain/spkrec-ecapa-voxceleb",
+    ...     savedir=tmpdir,
+    ... )
+    >>> # Compute embeddings
+    >>> signal, fs = torchaudio.load("samples/audio_samples/example1.wav")
+    >>> embeddings =  classifier.encode_batch(signal)
+    >>> # Classification
+    >>> prediction =  classifier .classify_batch(signal)
+    """
+    def __init__(self, *args, **kwargs):
+        # Pure pass-through to the Pretrained constructor; no extra state is set here.
+        super().__init__(*args, **kwargs)
+    def encode_batch(self, wavs, wav_lens=None, normalize=False):
+        """Encodes the input audio into a single vector embedding.
+        The waveforms should already be in the model's desired format.
+        You can call:
+        ``normalized = <this>.normalizer(signal, sample_rate)``
+        to get a correctly converted signal in most cases.
+        Arguments
+        ---------
+        wavs : torch.tensor
+            Batch of waveforms [batch, time, channels] or [batch, time]
+            depending on the model. Make sure the sample rate is fs=16000 Hz.
+            A 1-D tensor is treated as a single waveform and gets a batch
+            dimension added.
+        wav_lens : torch.tensor
+            Lengths of the waveforms relative to the longest one in the
+            batch, tensor of shape [batch]. The longest one should have
+            relative length 1.0 and others len(waveform) / max_length.
+            Used for ignoring padding. Defaults to all ones when omitted.
+        normalize : bool
+            Not read anywhere in this method's body; passing True has no
+            effect in this implementation.
+        Returns
+        -------
+        torch.tensor
+            The encoded batch, flattened to [batch, features].
+        """
+        # Manage single waveforms in input
+        if len(wavs.shape) == 1:
+            wavs = wavs.unsqueeze(0)
+        # Assign full length if wav_lens is not assigned
+        if wav_lens is None:
+            wav_lens = torch.ones(wavs.shape[0], device=self.device)
+        # Storing waveform in the specified device
+        wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
+        wavs = wavs.float()
+        # Computing features and embeddings
+        outputs = self.mods.wav2vec2(wavs)
+        # Pool the encoder output using the relative lengths
+        outputs = self.mods.avg_pool(outputs, wav_lens)
+        # Collapse everything after the batch dimension into one feature axis
+        outputs = outputs.view(outputs.shape[0], -1)
+        return outputs
+    def classify_batch(self, wavs, wav_lens=None):
+        """Performs classification on top of the encoded features.
+        It returns the classifier output, the best score and index, and the
+        text label decoded by ``hparams.label_encoder``.
+        Arguments
+        ---------
+        wavs : torch.tensor
+            Batch of waveforms [batch, time, channels] or [batch, time]
+            depending on the model. Make sure the sample rate is fs=16000 Hz.
+        wav_lens : torch.tensor
+            Lengths of the waveforms relative to the longest one in the
+            batch, tensor of shape [batch]. The longest one should have
+            relative length 1.0 and others len(waveform) / max_length.
+            Used for ignoring padding.
+        Returns
+        -------
+        out_prob
+            Output of ``hparams.softmax`` for each class ([batch, N_class]).
+            NOTE(review): described upstream as log posteriors; whether these
+            are log or plain probabilities depends on the yaml config — confirm.
+        score:
+            The maximum of out_prob over the last dimension ([batch,])
+        index
+            The indexes of the best class ([batch,])
+        text_lab:
+            List with the text labels corresponding to the indexes.
+            (label encoder should be provided).
+        """
+        outputs = self.encode_batch(wavs, wav_lens)
+        outputs = self.mods.output_mlp(outputs)
+        out_prob = self.hparams.softmax(outputs)
+        score, index = torch.max(out_prob, dim=-1)
+        text_lab = self.hparams.label_encoder.decode_torch(index)
+        return out_prob, score, index, text_lab
+    def classify_file(self, path):
+        """Classifies the given audio file into the given set of labels.
+        Arguments
+        ---------
+        path : str
+            Path to audio file to classify.
+        Returns
+        -------
+        out_prob
+            Output of ``hparams.softmax`` for each class ([batch, N_class]).
+            NOTE(review): log vs. plain probabilities depends on the yaml
+            config — confirm.
+        score:
+            The maximum of out_prob over the last dimension ([batch,])
+        index
+            The indexes of the best class ([batch,])
+        text_lab:
+            List with the text labels corresponding to the indexes.
+            (label encoder should be provided).
+        """
+        waveform = self.load_audio(path)
+        # Fake a batch:
+        batch = waveform.unsqueeze(0)
+        rel_length = torch.tensor([1.0])
+        outputs = self.encode_batch(batch, rel_length)
+        # Unlike classify_batch, dimension 1 of the MLP output is squeezed here
+        outputs = self.mods.output_mlp(outputs).squeeze(1)
+        out_prob = self.hparams.softmax(outputs)
+        score, index = torch.max(out_prob, dim=-1)
+        text_lab = self.hparams.label_encoder.decode_torch(index)
+        return out_prob, score, index, text_lab
+    def forward(self, wavs, wav_lens=None, normalize=False):
+        # Calling the module directly yields embeddings, not class predictions.
+        return self.encode_batch(
+            wavs=wavs, wav_lens=wav_lens, normalize=normalize
+        )

models/model_loader.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import os
+import streamlit as st
+from speechbrain.pretrained.interfaces import foreign_class
+from faster_whisper import WhisperModel
+# -------------------------------
+# Load Model (Cached)
+# -------------------------------
+@st.cache_resource(show_spinner="Loading model...")  # cached so the model is only loaded once per app instance
+def load_accent_model():
+    """Load the custom accent classification model.
+
+    Shows an error and stops the script when the HF_TOKEN environment variable
+    is missing or when the model cannot be loaded.
+    """
+    token_available = bool(os.getenv("HF_TOKEN"))
+    if not token_available:
+        st.error("Hugging Face token not found.")
+        st.stop()
+    model_spec = {
+        "source": "Jzuluaga/accent-id-commonaccent_xlsr-en-english",
+        "pymodule_file": "custom_interface.py",
+        "classname": "CustomEncoderWav2vec2Classifier",
+    }
+    try:
+        classifier = foreign_class(**model_spec)
+    except Exception as e:
+        st.error(f"❌ Error loading model: {e}")
+        st.stop()
+    else:
+        return classifier
+@st.cache_resource(show_spinner="Loading Whisper...")
+def load_whisper():
+    """Build the "tiny" Whisper model on CPU with int8_float32 compute."""
+    whisper_options = {"device": "cpu", "compute_type": "int8_float32"}
+    return WhisperModel("tiny", **whisper_options)

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt CHANGED Viewed

@@ -1,3 +1,29 @@
-altair
-pandas
-streamlit

+streamlit
+moviepy
+ffmpeg-python
+requests
+speechbrain==0.5.14
+faster-whisper
+transformers==4.25.1
+numpy==1.23.5
+numba==0.56.4
+datasets==2.8.0
+librosa==0.9.2
+numba==0.56.4
+scikit-learn==1.3.2
+ipdb>=0.13.9
+pandas>=1.5.3
+huggingface_hub>=0.7.0
+hyperpyyaml>=0.0.1
+joblib>=0.14.1
+packaging
+pre-commit>=2.3.0
+sentencepiece>=0.1.91
+psutil
+SoundFile>=0.10.2
+torch==1.11.0
+torchaudio==0.11.0
+torchvision==0.12.0
+tqdm>=4.42.0
+yt-dlp
+pydub

utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

utils/accent_analysis.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import torch
+import torchaudio
+import streamlit as st
+import traceback
+import psutil
+# Maps the classifier's raw label strings to the names shown to the user.
+# NOTE(review): "southatlandtic" presumably mirrors the model's own label
+# spelling — confirm before "correcting" it.
+ACCENT_LABELS = {
+    "us": "American Accent",
+    "england": "British Accent",
+    "australia": "Australian Accent",
+    "indian": "Indian Accent",
+    "canada": "Canadian Accent",
+    "bermuda": "Bermudian Accent",
+    "scotland": "Scottish Accent",
+    "african": "African Accent",
+    "ireland": "Irish Accent",
+    "newzealand": "New Zealand Accent",
+    "wales": "Welsh Accent",
+    "malaysia": "Malaysian Accent",
+    "philippines": "Philippine Accent",
+    "singapore": "Singaporean Accent",
+    "hongkong": "Hong Kong Accent",
+    "southatlandtic": "South Atlantic Accent"
+}
+def analyze_accent(audio_tensor, sample_rate, model):
+    """Classify a waveform and return ``(readable_accent, confidence_percent)``.
+
+    The signal is averaged over its first dimension when that dimension is
+    larger than one, cast to float32, resampled to 16 kHz if needed and passed
+    to ``model.classify_batch``. On any exception the error is shown in the
+    Streamlit UI and ``(None, None)`` is returned.
+    """
+    target_rate = 16000
+    try:
+        signal = audio_tensor
+        # Down-mix to a single channel when several are present
+        has_multiple_channels = signal.shape[0] > 1
+        if has_multiple_channels:
+            signal = signal.mean(dim=0, keepdim=True)
+        signal = signal.squeeze(0).unsqueeze(0).to(torch.float32)
+        # Bring the signal to the target sample rate
+        if sample_rate != target_rate:
+            to_target_rate = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
+            signal = to_target_rate(signal)
+        signal = signal.to("cpu")
+        # Inference only: no gradients needed
+        with torch.no_grad():
+            _out_prob, score, _index, text_lab = model.classify_batch(signal)
+        raw_label = text_lab[0]
+        if raw_label in ACCENT_LABELS:
+            readable = ACCENT_LABELS[raw_label]
+        else:
+            # Unknown label: fall back to a title-cased version of the raw label
+            readable = raw_label.title() + " accent"
+        confidence = round(score[0].item() * 100, 2)
+        return readable, confidence
+    except Exception:
+        st.error("❌ Error during classification.")
+        st.code(traceback.format_exc())
+        return None, None

utils/audio_processing.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import os
+import tempfile
+import subprocess
+import streamlit as st
+from pydub import AudioSegment
+import shutil
+AudioSegment.converter = shutil.which("ffmpeg")
+# -------------------------------
+# Utility Function: Download audio from a Video url
+# -------------------------------
+def download_audio_as_wav(url, max_filesize_mb=70):
+    """
+    Downloads audio from a URL using yt-dlp, then converts it to WAV using ffmpeg.
+    Supports fallback formats (.m4a, .webm, .opus) if .mp3 not found.
+    The download directory is temporary and removed on exit; the WAV file is
+    created outside it so it persists for the caller, who owns its cleanup.
+
+    Arguments:
+        url: Media URL supplied by the user. It is passed to yt-dlp after a
+            ``--`` separator so it can never be parsed as a command-line option.
+        max_filesize_mb: Upper bound, in megabytes, used in the yt-dlp format filter.
+
+    Returns path to .wav file or None on failure (the error is shown in the UI).
+    """
+    wav_path = None
+    try:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            max_bytes = max_filesize_mb * 1024 * 1024
+            output_template = os.path.join(temp_dir, "audio.%(ext)s")
+            # yt-dlp download command
+            download_cmd = [
+                "yt-dlp",
+                "-f", f"bestaudio[filesize<={max_bytes}]",
+                "--extract-audio",
+                "--audio-format", "mp3",
+                "--no-playlist",
+                "--no-cache-dir",
+                "--restrict-filenames",
+                "-o", output_template,
+                "--",  # end of options: the URL is untrusted user input
+                url
+            ]
+            subprocess.run(download_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+            # Try to locate audio file (mp3 first, then fallbacks in order of preference)
+            audio_path = None
+            downloaded = os.listdir(temp_dir)
+            for ext in (".mp3", ".m4a", ".webm", ".opus"):
+                matches = [f for f in downloaded if f.endswith(ext)]
+                if matches:
+                    audio_path = os.path.join(temp_dir, matches[0])
+                    break
+            if not audio_path or not os.path.exists(audio_path):
+                st.error("❌ No supported audio file found after download.")
+                return None
+            # Reserve a WAV path outside temp_dir so it persists, and close the
+            # handle right away so no descriptor leaks and ffmpeg can write to it.
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav:
+                wav_path = temp_wav.name
+            convert_cmd = ["ffmpeg", "-y", "-i", audio_path, wav_path]
+            subprocess.run(convert_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+            # Return WAV file path; temp_dir and downloaded audio cleaned automatically
+            return wav_path
+    except subprocess.CalledProcessError as e:
+        # stderr is captured as bytes; fall back to the exception text if it is empty
+        error_msg = e.stderr.decode(errors="replace") if e.stderr else str(e)
+        st.error("❌ Audio download or conversion failed.")
+        st.code(error_msg)
+    except Exception as e:
+        st.error("❌ Unexpected error occurred.")
+        st.code(str(e))
+    # Only failure paths reach this point: drop a partially written WAV, if any
+    if wav_path is not None and os.path.exists(wav_path):
+        os.remove(wav_path)
+    return None
+# --------------------------
+# Utility: Trim audios to 2 minutes
+# --------------------------
+def trim_audio(input_wav_path, max_duration_sec=120):
+    """
+    Trims the input .wav file to the first `max_duration_sec` seconds.
+
+    Arguments:
+        input_wav_path: Path of the .wav file to read.
+        max_duration_sec: Maximum length of the result, in seconds.
+
+    Returns the path to the trimmed .wav file (a new temporary file owned by
+    the caller), or None on failure after showing the error in the UI.
+    """
+    # Initialised before the try so the error handler can always refer to it,
+    # even when loading the input fails before any output file exists.
+    trimmed_path = None
+    try:
+        # Load audio using pydub
+        audio = AudioSegment.from_wav(input_wav_path)
+        # Trim to max_duration_sec
+        trimmed_audio = audio[:int(max_duration_sec * 1000)]  # pydub uses milliseconds
+        # Reserve a new temporary .wav path and close the handle immediately,
+        # so no descriptor is leaked and export can open the file itself
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as trimmed_file:
+            trimmed_path = trimmed_file.name
+        trimmed_audio.export(trimmed_path, format="wav")
+        return trimmed_path
+    except Exception as e:
+        st.error(f"❌ Error trimming audio: {e}")
+        if trimmed_path is not None and os.path.exists(trimmed_path):
+            os.remove(trimmed_path)
+        return None

utils/session_utils.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import streamlit as st
+import psutil
+# -------------------------------
+# Manage session state variables
+# -------------------------------
+def initialize_session_state():
+    """Seed st.session_state with default values for any keys not yet present."""
+    default_items = (
+        ("audio_path", None),
+        ("audio_ready", False),
+        ("transcription", ""),
+    )
+    for name, initial in default_items:
+        if name in st.session_state:
+            # Keep whatever an earlier run already stored
+            continue
+        st.session_state[name] = initial
+# 🔍 Show memory usage a single time per session
+def display_memory_once():
+    """Render the current memory usage unless this session has already shown it."""
+    if 'memory_logged' in st.session_state:
+        return
+    usage = psutil.virtual_memory()
+    st.markdown(f"🧠 **Memory Used:** {usage.percent}%")
+    # Flag stored in the session so later reruns skip the readout
+    st.session_state.memory_logged = True
+# Reset the app
+def reset_session_state_except_model():
+    """Delete every session-state entry except the loaded models."""
+    preserved = {"classifier", "whisper"}
+    # Snapshot the keys first: entries are deleted while walking over them
+    stale_keys = [name for name in list(st.session_state.keys()) if name not in preserved]
+    for name in stale_keys:
+        del st.session_state[name]

utils/video_processing.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import tempfile
+import subprocess
+import os
+from moviepy.editor import VideoFileClip
+import streamlit as st
+import traceback
+import shutil
+# --------------------------
+# Utility: Trim videos to 2 minutes
+# --------------------------
+def _remove_if_exists(path):
+    """Best-effort removal of a file; a missing path or OS error is ignored."""
+    if path and os.path.exists(path):
+        try:
+            os.remove(path)
+        except OSError:
+            pass  # Cleanup must never mask the original outcome
+def trim_video(video_path, max_duration=120):
+    """Trims video to max_duration (in seconds) and extracts audio.
+
+    The audio is written by ffmpeg as 16 kHz mono 16-bit PCM into a new
+    temporary .wav file.
+
+    Arguments:
+        video_path: Path of the input video. It is deleted afterwards when the
+            path contains "tmp" (heuristic for "this was a temp file").
+        max_duration: Maximum number of seconds of audio to keep.
+
+    Returns the path of the .wav file, or None on failure after showing the
+    error in the UI.
+    """
+    # Initialised before the try so the error handler can always refer to it,
+    # even when opening the video fails before any output file exists.
+    audio_path = None
+    try:
+        video = VideoFileClip(video_path)
+        try:
+            duration = video.duration
+        finally:
+            # Release the reader even if reading the duration fails
+            video.close()
+        # Fall back to the cap when the duration is unavailable
+        clip_seconds = max_duration if duration is None else min(duration, max_duration)
+        # Reserve the output path and close the handle so ffmpeg can write to it
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
+            audio_path = wav_file.name
+        command = [
+            "ffmpeg", "-i", video_path,
+            "-t", str(clip_seconds),
+            "-ar", "16000", "-ac", "1",
+            "-acodec", "pcm_s16le", "-y", audio_path
+        ]
+        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        if result.returncode != 0:
+            st.error("❌ ffmpeg audio extraction failed.")
+            _remove_if_exists(audio_path)  # Clean up failed temp file
+            st.code(result.stderr.decode(errors="replace"))
+            return None
+        return audio_path
+    except Exception as e:
+        st.error(f"❌ Error trimming video: {e}")
+        _remove_if_exists(audio_path)
+        st.code(traceback.format_exc())
+        return None
+    finally:
+        # Clean up input video if it was a temp file
+        if "tmp" in video_path:
+            _remove_if_exists(video_path)