Update app.py
app.py (CHANGED)
@@ -11,7 +11,7 @@ from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
 # ------------------------------------------------
 model_name = "facebook/wav2vec2-base-960h"
 
-#
+# Specify num_labels=8 to create a random classification head on top.
 model = Wav2Vec2ForSequenceClassification.from_pretrained(
     model_name,
     num_labels=8
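The added comment is accurate: `facebook/wav2vec2-base-960h` ships CTC/ASR weights with no sequence-classification head, so passing `num_labels=8` makes transformers attach a freshly, randomly initialized head and log a "newly initialized weights" warning at load time. A minimal sketch of the surrounding setup, assuming the feature extractor is loaded from the same checkpoint as in the lines above this hunk:

```python
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

model_name = "facebook/wav2vec2-base-960h"

# The checkpoint has CTC weights only, so this classification head is
# randomly initialized; predictions are meaningless without fine-tuning.
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=8)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

model.eval()  # matches the model.eval() shown at the top of the next hunk
```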
@@ -26,24 +26,24 @@ model.eval()
 def classify_accuracy(audio):
     """
     Receives a tuple (sample_rate, data) from Gradio when type='numpy'.
-
+    Resamples if needed, runs a forward pass, and returns a 'level'.
     """
     if audio is None:
         return "No audio provided."
 
     sample_rate, data = audio
 
-    # Ensure
+    # Ensure data is a NumPy array.
     if not isinstance(data, np.ndarray):
         data = np.array(data)
 
-    # Resample
+    # Resample to 16kHz if needed.
     target_sr = 16000
     if sample_rate != target_sr:
         data = librosa.resample(data, orig_sr=sample_rate, target_sr=target_sr)
         sample_rate = target_sr
 
-    # Extract features
+    # Extract features from the audio data.
     inputs = feature_extractor(
         data,
         sampling_rate=sample_rate,
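One caveat the new comments leave out: with `type="numpy"`, Gradio typically delivers 16-bit integer samples, and stereo recordings arrive as a 2-D `(samples, channels)` array, while `librosa.resample` expects float mono. A more defensive version of this preprocessing block might look like the sketch below (`preprocess` is a hypothetical helper, not part of this commit):

```python
import numpy as np
import librosa

def preprocess(sample_rate, data, target_sr=16000):
    """Hypothetical helper: normalize Gradio numpy audio to 16 kHz float mono."""
    # Gradio's type="numpy" audio is usually int16; convert to float in [-1, 1].
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    # Average stereo (samples, channels) down to mono.
    if data.ndim > 1:
        data = data.mean(axis=1)
    if sample_rate != target_sr:
        data = librosa.resample(data, orig_sr=sample_rate, target_sr=target_sr)
    return data
```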
@@ -51,14 +51,14 @@ def classify_accuracy(audio):
         padding=True
     )
 
+    # Run model inference.
     with torch.no_grad():
         outputs = model(**inputs)
         logits = outputs.logits
         predicted_id = torch.argmax(logits, dim=-1).item()
 
-    # Map 0..7
+    # Map predicted id (0..7) to the final level (3..10).
     accuracy_level = predicted_id + 3
-
     return f"Predicted Accuracy Level: {accuracy_level}"
 
 # ------------------------------------------------
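Line 50 falls between the hunks, but for `model(**inputs)` to work the extractor call presumably passes `return_tensors="pt"` there. Under that assumption, the inference path the new comments describe boils down to:

```python
import torch

# Assumes feature_extractor, model, and the resampled `data` from above;
# return_tensors="pt" is assumed to be on the hidden line 50.
inputs = feature_extractor(data, sampling_rate=16000,
                           return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(**inputs).logits              # shape (1, 8)

predicted_id = torch.argmax(logits, dim=-1).item()   # class id in 0..7
accuracy_level = predicted_id + 3                    # shifted to levels 3..10
```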
@@ -66,15 +66,15 @@ def classify_accuracy(audio):
 # ------------------------------------------------
 title = "Speech Accuracy Classifier (Base Wav2Vec2)"
 description = (
-    "
-    "The
-    "This demo simply illustrates how to attach a classification head."
+    "Record audio using your microphone or upload an audio file (left). "
+    "The model (not fine-tuned) will classify the audio into an accuracy level (right)."
 )
 
+# Using source="microphone" allows for direct recording, while recent versions also enable file upload.
 demo = gr.Interface(
     fn=classify_accuracy,
-    inputs=gr.Audio(source="
-    outputs="
+    inputs=gr.Audio(source="microphone", type="numpy", label="Record/Upload Audio"),
+    outputs=gr.Textbox(label="Classification Result"),
     title=title,
     description=description,
     allow_flagging="never"
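A version note on the new `inputs=` line: `gr.Audio(source=...)` is the Gradio 3.x keyword; Gradio 4 renamed it to `sources=` and made it a list, which is also what gives the record-or-upload behavior the added comment mentions. On a recent Gradio, the interface would be spelled roughly like this (the closing `)` and the `launch()` call sit past the end of the hunk, so they are assumed here):

```python
import gradio as gr

demo = gr.Interface(
    fn=classify_accuracy,
    # Gradio 4+ takes a list of sources, enabling both recording and upload.
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy",
                    label="Record/Upload Audio"),
    outputs=gr.Textbox(label="Classification Result"),
    title=title,
    description=description,
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()
```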