File size: 3,245 Bytes
1d33b47
0381907
 
61bf2df
 
 
0381907
 
 
c48241c
 
0381907
 
 
 
b25a75c
0381907
61bf2df
0381907
61bf2df
 
 
 
 
 
 
 
0381907
 
 
61bf2df
0381907
61bf2df
0381907
61bf2df
 
 
 
 
 
0381907
 
 
b25a75c
0381907
 
b25a75c
0381907
 
b25a75c
 
0381907
 
 
 
 
 
 
b25a75c
 
 
0381907
61bf2df
 
 
 
 
 
 
0381907
61bf2df
0381907
 
8265a8e
0381907
 
61bf2df
 
0381907
61bf2df
 
0381907
 
 
61bf2df
b25a75c
 
 
61bf2df
2dd9c4b
1fd173a
4aab7b8
 
61bf2df
 
0381907
 
 
 
61bf2df
0381907
 
61bf2df
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import streamlit as st
import os
from transformers import pipeline
from transformers.utils import logging
import numpy as np 
import pandas as pd
import yt_dlp
import torchaudio
import ffmpeg

# Emit transformers progress/info messages (e.g. model download status).
logging.set_verbosity_info()

# Target sample rate (Hz) the audio is resampled to for the classifier.
RATE_HZ = 16000
# Duration of each analysis window, in seconds.
MAX_SECONDS = 1
# Window length in samples (RATE_HZ * MAX_SECONDS); used as the slice size in split_audio.
MAX_LENGTH = RATE_HZ * MAX_SECONDS
# Hard cap on the number of windows processed per clip.
MAX_SEGMENTS = 250

def download_video(url, output_path="video.mp4"):
    """Download the video at *url* with yt-dlp and save it to *output_path*.

    Returns the path the file was written to.
    """
    # Smallest mp4 video stream plus best m4a audio keeps downloads light;
    # falls back to best audio-only when no mp4 video stream exists.
    options = {
        'format': 'worstvideo[ext=mp4]+bestaudio[ext=m4a]/bestaudio',
        'outtmpl': output_path,
        'merge_output_format': 'mp4',
        'quiet': True,
        'noplaylist': True,
        'nocheckcertificate': True,
        'retries': 3,
    }
    downloader = yt_dlp.YoutubeDL(options)
    with downloader:
        downloader.download([url])
    return output_path

def extract_audio(input_path, output_path="audio.mp3"):
    """Extract the audio track from *input_path* into an mp3 at *output_path*.

    Returns the path the mp3 was written to.
    """
    stream = ffmpeg.input(input_path)
    stream = stream.output(output_path, format='mp3', acodec='libmp3lame', audio_bitrate='192k')
    # Overwrite any leftover file from a previous run; suppress ffmpeg chatter.
    stream.overwrite_output().run(quiet=True)
    return output_path

def split_audio(file):
    """Slice an audio file into fixed-size windows and return them as one 1-D array.

    Loads *file* with torchaudio, cuts the first channel into up to
    MAX_SEGMENTS windows of MAX_LENGTH samples each, resamples every window
    to RATE_HZ, and concatenates the results into a single float array.

    Returns an empty float32 ndarray when the clip is shorter than one
    window or when loading/processing fails, so callers always receive an
    ndarray (the original returned a bare list on error and raised
    ValueError from np.concatenate([]) on short clips).
    """
    segments = []
    try:
        audio, rate = torchaudio.load(str(file))
        # NOTE(review): windows are MAX_LENGTH samples at the *source* rate,
        # so each covers RATE_HZ/rate seconds of audio before resampling —
        # confirm this is intended rather than resampling the whole clip first.
        transform = torchaudio.transforms.Resample(rate, RATE_HZ)
        channel = audio[0]
        num_segments = min(len(channel) // MAX_LENGTH, MAX_SEGMENTS)
        for i in range(num_segments):
            start = i * MAX_LENGTH
            segment = channel[start:start + MAX_LENGTH]
            segments.append(transform(segment).squeeze(0).numpy().reshape(-1))
    except Exception as e:
        # Best-effort: report and fall through to the empty-array return.
        print(f"Error processing file: {e}")
    if not segments:
        return np.array([], dtype=np.float32)
    return np.concatenate(segments)

# Map the classifier's raw label strings to human-readable accent names.
accent_mapping = dict(
    us='American',
    canada='Canadian',
    england='British',
    indian='Indian',
    australia='Australian',
)

# --- Streamlit UI: URL in, accent label + confidence out -------------------
st.set_page_config(page_title="Accent Classifier", layout="centered")
# Bug fix: title emoji was mojibake ("๐ŸŽ™๏ธ") from a bad encoding round-trip.
st.title("🎙️ English Accent Classifier")
st.markdown("Upload a video link and get the English accent with confidence.")
video_url = st.text_input("Paste a public video URL (Loom, or MP4):")

if st.button("Analyze"):
    if not video_url.strip():
        st.warning("Please enter a valid URL.")
    else:
        video_path = None
        audio_path = None
        try:
            with st.spinner("Downloading video..."):
                video_path = download_video(video_url)

            with st.spinner("Extracting audio..."):
                audio_path = extract_audio(video_path)

            with st.spinner("Extracting Waves..."):
                waves = split_audio(audio_path)

            with st.spinner("Classifying accent..."):
                model_name = "dima806/english_accents_classification"
                # device=0 assumes a CUDA device is available — TODO confirm,
                # or fall back to CPU (device=-1) on GPU-less hosts.
                pipe = pipeline('audio-classification', model=model_name, device=0)
                accent_data = pipe(waves)[0]
                # Bug fix: an unmapped label previously produced accent=None;
                # fall back to showing the raw label instead.
                label = accent_data.get("label", "us")
                accent = accent_mapping.get(label, label)
                confidence = accent_data.get("score", 0)

            st.success("Analysis Complete!")
            st.markdown(f"**Accent:** {accent}")
            # Bug fix: score is in [0, 1]; scale to percent before adding '%'.
            st.markdown(f"**Confidence Score:** {confidence * 100:.2f}%")
        finally:
            # Bug fix: cleanup previously ran only on full success, leaving
            # temp files behind whenever any step raised.
            for path in (video_path, audio_path):
                if path and os.path.exists(path):
                    os.remove(path)