Spaces:

Issamohammed
/

Transcriber

Running

App Files Community

Issamohammed committed on Apr 16

Commit

db55266

verified ·

1 Parent(s): 7705134

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -11

app.py CHANGED Viewed

@@ -1,21 +1,59 @@
-from pydub import AudioSegment
 import mimetypes
 def transcribe(audio_path):
     try:
-        # Detect file type using MIME or extension
-        mime_type, _ = mimetypes.guess_type(audio_path)
         ext = os.path.splitext(audio_path)[1].lower()
-        if mime_type == "audio/mp4" or ext == ".m4a":
-            print("Converting .m4a to .wav...")
-            sound = AudioSegment.from_file(audio_path, format="m4a")
-            converted_path = audio_path.replace(".m4a", ".converted.wav")
-            sound.export(converted_path, format="wav")
-            audio_path = converted_path
         result = pipe(audio_path, chunk_length_s=30, generate_kwargs={"task": "transcribe", "language": "sv"})
         return result["text"]
     except Exception as e:
-        return f"Error during transcription: {str(e)}"

+import os
+import torch
+import gradio as gr
 import mimetypes
+from pydub import AudioSegment
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+# Set device and precision
+device = "cpu"
+torch_dtype = torch.float32
+# Load KB-Whisper model
+model_id = "KBLab/kb-whisper-large"
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype
+).to(device)
+processor = AutoProcessor.from_pretrained(model_id)
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    device=device,
+    torch_dtype=torch_dtype,
+)
 def transcribe(audio_path):
     try:
+        # Get file extension
         ext = os.path.splitext(audio_path)[1].lower()
+        # Convert to WAV if not already
+        if ext != ".wav":
+            try:
+                sound = AudioSegment.from_file(audio_path)
+                converted_path = audio_path.replace(ext, ".converted.wav")
+                sound.export(converted_path, format="wav")
+                audio_path = converted_path
+            except Exception as e:
+                return f"Error converting audio to WAV: {str(e)}"
+        # Transcribe
         result = pipe(audio_path, chunk_length_s=30, generate_kwargs={"task": "transcribe", "language": "sv"})
         return result["text"]
     except Exception as e:
+        return f"Transcription failed: {str(e)}"
+# Gradio UI
+gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(type="filepath", label="Upload Audio (.m4a, .mp3, .wav)"),
+    outputs=gr.Textbox(label="Swedish Transcript"),
+    title="Swedish Speech Transcriber with KB-Whisper",
+    description="Supports .m4a, .mp3, .wav files. Transcribes spoken Swedish using KBLab's Whisper Large model. May take time on CPU.",
+).launch()