import gradio as gr
import torch
import numpy as np
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

# --------------------------------------------------
# Load Your Fine-Tuned Model for Fluency Prediction
# --------------------------------------------------
# This model was fine-tuned with labels remapped from [3..10] to [0..7].
# Make sure "Yilin0601/wav2vec2-fluency-checkpoints" points at the correct repo.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "Yilin0601/wav2vec2-fluency-checkpoints"
)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "Yilin0601/wav2vec2-fluency-checkpoints"
)
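
# For reference, the label remapping this demo assumes (a sketch of the
# training-time preprocessing, not the actual training code): raw fluency
# scores in [3..10] were shifted down by 3 to form class indices [0..7].
#
#   class_id  = raw_score - 3   # 3 -> 0, 4 -> 1, ..., 10 -> 7
#   raw_score = class_id + 3    # inverse mapping, applied in predict() below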

# --------------------------------------------------
# Prediction Function
# --------------------------------------------------
def predict(audio):
    if audio is None:
        return "No audio provided."
    
    # Gradio returns audio as (sample_rate, np.array)
    sample_rate, audio_data = audio

    # Ensure audio is floating-point (librosa requires float32 or float64).
    # Integer PCM (e.g., int16 from the Gradio recorder) is scaled to [-1, 1].
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    elif audio_data.dtype not in (np.float32, np.float64):
        audio_data = audio_data.astype(np.float32)

    # Collapse multi-channel (or (N, 1)-shaped) audio to mono
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Resample to 16 kHz if necessary
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

    # Extract features using the feature extractor
    inputs = feature_extractor(
        audio_data,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    )

    # Model inference
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # The model outputs an 8-class prediction (0..7), corresponding to original fluency scores [3..10]
    pred_class = torch.argmax(logits, dim=-1).item()
    predicted_level = pred_class + 3  # Map back to [3..10]

    return f"Predicted Fluency Level: {predicted_level}"

# --------------------------------------------------
# Gradio Interface
# --------------------------------------------------
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="numpy", label="Record or Upload Audio"),
    outputs="text",
    title="L2 English Fluency Predictor",
    description=(
        "<p style='font-size:16px;'>"
        "This demo predicts your English fluency level on a scale from 0 to 10. "
        "It uses a fine-tuned <b>facebook/wav2vec2-base-960h</b> model trained on the "
        "<b>DynamicSuperb/L2EnglishAccuracy_speechocean762-Scoring</b> dataset, which contains "
        "745 labeled audio recordings of non-native English speakers. "
        "To get your fluency score, simply record or upload an audio file. "
        "<br><br>"
        "<b>Note:</b> This prediction is for demo purposes and should be interpreted with caution. "
        "</p>"
    ),
    allow_flagging="never"
)

if __name__ == "__main__":
    iface.launch()
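    # launch(share=True) would additionally create a temporary public URL
    # (handy in notebooks); this demo keeps the default local launch.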