import gradio as gr
import torch
import numpy as np
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
# --------------------------------------------------
# Load Your Fine-Tuned Model for Fluency Prediction
# --------------------------------------------------
# This model was fine-tuned with labels remapped from [3..10] to [0..7].
# Ensure that "Yilin0601/wav2vec2-fluency-checkpoints" is your correct repo.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "Yilin0601/wav2vec2-fluency-checkpoints"
)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "Yilin0601/wav2vec2-fluency-checkpoints"
)
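# Optional sanity check (illustrative addition, not in the original script):
# the remapping described above implies 8 output classes, so the loaded config
# should report num_labels == 8. Uncomment to verify after loading.
# assert model.config.num_labels == 8, f"Unexpected num_labels: {model.config.num_labels}"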
# --------------------------------------------------
# Prediction Function
# --------------------------------------------------
def predict(audio):
    if audio is None:
        return "No audio provided."

    # Gradio returns audio as (sample_rate, np.array)
    sample_rate, audio_data = audio

    # Ensure the audio is floating-point (librosa expects float32/float64);
    # integer PCM (e.g. int16 from the microphone) is scaled to [-1, 1].
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    elif audio_data.dtype not in (np.float32, np.float64):
        audio_data = audio_data.astype(np.float32)

    # Convert stereo to mono if needed
    if audio_data.ndim > 1 and audio_data.shape[1] > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Resample to the 16 kHz rate the wav2vec2 feature extractor expects
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

    # Extract features using the feature extractor
    inputs = feature_extractor(
        audio_data,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    )

    # Model inference
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # The model outputs an 8-class prediction (0..7), corresponding to the
    # original fluency scores [3..10]
    pred_class = torch.argmax(logits, dim=-1).item()
    predicted_level = pred_class + 3  # Map back to [3..10]
    return f"Predicted Fluency Level: {predicted_level}"
# --------------------------------------------------
# Gradio Interface
# --------------------------------------------------
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="numpy", label="Record or Upload Audio"),
    outputs="text",
    title="L2 English Fluency Predictor",
    description=(
        "<p style='font-size:16px;'>"
        "This demo predicts your English fluency level on a scale from 3 to 10 "
        "(the range of scores present in the training labels). "
        "It uses a fine-tuned <b>facebook/wav2vec2-base-960h</b> model trained on the "
        "<b>DynamicSuperb/L2EnglishAccuracy_speechocean762-Scoring</b> dataset, which contains "
        "745 labeled audio recordings of non-native English speakers. "
        "To get your fluency score, simply record or upload an audio file. "
        "<br><br>"
        "<b>Note:</b> This prediction is for demo purposes and should be interpreted with caution."
        "</p>"
    ),
    allow_flagging="never"
)
if __name__ == "__main__":
    iface.launch()
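# To run locally (assuming gradio, torch, librosa, numpy, and transformers are
# installed): `python app.py`, then open the URL Gradio prints
# (http://127.0.0.1:7860 by default).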