"""Gradio app that labels the accent of the speech in an uploaded video.

The audio track of the uploaded file is converted to a temporary WAV file,
loaded as a 16 kHz mono waveform, and passed to a pretrained SpeechBrain
``EncoderClassifier``. The predicted label and its score are shown as text.
"""

import contextlib
import os
import tempfile
import uuid

import gradio as gr
import librosa
import torch
from pydub import AudioSegment
from speechbrain.inference.classifiers import EncoderClassifier

# Load the model once at import time so every request reuses the same instance.
classifier = EncoderClassifier.from_hparams(
    source="Jzuluaga/accent-id-commonaccent_ecapa",
    savedir="pretrained_models/accent-id-commonaccent_ecapa",
)


def classify_accent(video):
    """Return a human-readable accent prediction for an uploaded video.

    Args:
        video: Path of the uploaded video file as handed over by the
            ``gr.Video`` input. A falsy value (nothing uploaded) is answered
            with a short message instead of an exception.

    Returns:
        A string of the form ``"Accent: <label> (Confidence: <score>)"``, or
        a prompt to upload a file when ``video`` is empty.

    Raises:
        Whatever pydub, librosa or the classifier raise for unreadable or
        unsupported input; the temporary WAV file is removed in that case too.
    """
    if not video:
        return "No video provided. Please upload an MP4 file."

    # Unique name in the platform's temp directory (not a hard-coded /tmp),
    # so concurrent requests never write to the same file.
    temp_wav = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")

    try:
        # Extract the audio track and write it out as WAV.
        audio = AudioSegment.from_file(video, format="mp4")
        audio.export(temp_wav, format="wav")

        # Resample to 16 kHz mono; presumably the rate the pretrained model
        # was trained on -- confirm against the model card.
        waveform, _sr = librosa.load(temp_wav, sr=16000, mono=True)
    finally:
        # Clean up even when conversion or loading fails. The file may not
        # exist yet if the export itself failed, so do not let a missing
        # file mask the original exception.
        with contextlib.suppress(FileNotFoundError):
            os.remove(temp_wav)

    # Add a leading batch dimension of size 1 for classify_batch.
    waveform_tensor = torch.tensor(waveform).unsqueeze(0)

    # classify_batch's result is unpacked as a 4-tuple; only the score and
    # the text label are used here.
    _, score, _, text_lab = classifier.classify_batch(waveform_tensor)

    return f"Accent: {text_lab[0]} (Confidence: {score.item():.2f})"


app = gr.Interface(
    fn=classify_accent,
    inputs=gr.Video(label="Upload an MP4"),
    outputs=gr.Text(label="Prediction"),
    title="English Accent Classifier",
    description="Upload a short MP4 video of spoken English to detect accent.",
)

if __name__ == "__main__":
    app.launch()