import gradio as gr
import torch
import numpy as np
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

# --------------------------------------------------
# Load Your Fine-Tuned Model for Fluency Prediction
# --------------------------------------------------
# This model was fine-tuned with labels remapped from [3..10] to [0..7].
# Ensure that MODEL_REPO is your correct repo.
MODEL_REPO = "Yilin0601/wav2vec2-fluency-checkpoints"

# Sampling rate the audio is resampled to before feature extraction.
TARGET_SAMPLE_RATE = 16000

# Offset added to the predicted class index to recover the original score.
LABEL_OFFSET = 3

model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)

# Switch to inference mode once at load time instead of on every request.
model.eval()


# --------------------------------------------------
# Prediction Function
# --------------------------------------------------
def _to_mono_float(audio_data):
    """Return *audio_data* as a 1-D floating-point waveform.

    Integer PCM samples are rescaled to roughly [-1, 1]; float32/float64
    input keeps its dtype. 2-D input is treated as (samples, channels) and
    averaged over the channel axis, which also flattens a single-channel
    (n, 1) array.
    """
    samples = np.asarray(audio_data)

    if np.issubdtype(samples.dtype, np.integer):
        # A bare cast would leave values in the integer range (e.g. +/-32768
        # for int16); downstream audio code expects floats around [-1, 1].
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    elif samples.dtype not in (np.float32, np.float64):
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    return samples


def predict(audio):
    """Predict a fluency level for one audio clip.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple as produced by
            a Gradio audio input configured with ``type="numpy"``.

    Returns:
        A human-readable string: either the predicted fluency level or
        ``"No audio provided."`` when there is nothing to score.
    """
    if audio is None:
        return "No audio provided."

    # Gradio returns audio as (sample_rate, np.array)
    sample_rate, audio_data = audio

    waveform = _to_mono_float(audio_data)
    if waveform.size == 0:
        # An empty recording cannot be resampled or scored.
        return "No audio provided."

    # Resample to the target rate if necessary
    if sample_rate != TARGET_SAMPLE_RATE:
        waveform = librosa.resample(
            waveform, orig_sr=sample_rate, target_sr=TARGET_SAMPLE_RATE
        )

    # Extract features using the feature extractor
    inputs = feature_extractor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )

    # Model inference (no gradients needed)
    with torch.no_grad():
        logits = model(**inputs).logits

    # The model outputs an 8-class prediction (0..7), corresponding to
    # original fluency scores [3..10]
    pred_class = torch.argmax(logits, dim=-1).item()
    predicted_level = pred_class + LABEL_OFFSET  # Map back to [3..10]
    return f"Predicted Fluency Level: {predicted_level}"


# --------------------------------------------------
# Gradio Interface
# --------------------------------------------------
# Web UI: one audio input (record or upload) wired to predict(), text output.
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="numpy", label="Record or Upload Audio"),
    outputs="text",
    title="L2 English Fluency Predictor",
    # NOTE(review): predict() maps classes back to levels 3..10, while this
    # text advertises a 0 to 10 scale -- confirm the wording is intended.
    description=(
        # "\n\n" pieces are blank-line separators between paragraphs.
        "\n\n"
        "This demo predicts your English fluency level on a scale from 0 to 10. "
        "It uses a fine-tuned facebook/wav2vec2-base-960h model trained on the "
        "DynamicSuperb/L2EnglishAccuracy_speechocean762-Scoring dataset, which contains "
        "745 labeled audio recordings of non-native English speakers. "
        "To get your fluency score, simply record or upload an audio file. "
        "\n\n"
        "Note: This prediction is for demo purposes and should be interpreted with caution. "
        "\n\n"
    ),
    allow_flagging="never",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()