Spaces:

Issamohammed
/

Transcriber

Running

File size: 1,824 Bytes

import os
import torch
import gradio as gr
import mimetypes
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Inference target and numeric precision: CPU with 32-bit floats.
device = "cpu"
torch_dtype = torch.float32

# Hugging Face Hub identifier of the KB-Whisper checkpoint to load.
model_id = "KBLab/kb-whisper-large"

# Load the speech-to-text weights, then place them on the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype)
model = model.to(device)

# The processor supplies both the tokenizer and the audio feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

# Module-level ASR pipeline built from the pieces loaded above.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
)

def _converted_wav_path(audio_path):
    """Return the path used for the temporary WAV copy of *audio_path*.

    Only the final extension is swapped for ``.converted.wav``. Using
    ``str.replace`` on the whole path would also rewrite matching text in
    directory names, would explode on extension-less files (empty search
    string), and would miss upper-case extensions such as ``.MP3`` — which
    makes the "converted" path equal to the source path.
    """
    root, _ = os.path.splitext(audio_path)
    return root + ".converted.wav"


def transcribe(audio_path):
    """Transcribe an audio file to Swedish text using the module-level ``pipe``.

    Files whose extension is not ``.wav`` are first converted to WAV with
    pydub; the temporary WAV copy is deleted again before returning.

    Args:
        audio_path: Filesystem path of the audio file. ``None`` or an empty
            string (e.g. nothing was uploaded) is reported as an error.

    Returns:
        The transcript text on success; otherwise a human-readable error
        message starting with ``"Transcription failed:"`` or
        ``"Error converting audio to WAV:"``. Exceptions are never
        propagated, so the result can always be shown in the output textbox.
    """
    if not audio_path:
        return "Transcription failed: no audio file was provided."

    converted_path = None
    try:
        # Get file extension
        ext = os.path.splitext(audio_path)[1].lower()

        # Convert to WAV if not already
        if ext != ".wav":
            try:
                sound = AudioSegment.from_file(audio_path)
                converted_path = _converted_wav_path(audio_path)
                # export() hands back the open output file; close it so the
                # handle is not leaked and the file can be removed afterwards.
                sound.export(converted_path, format="wav").close()
            except Exception as e:
                return f"Error converting audio to WAV: {str(e)}"
            audio_path = converted_path

        # Transcribe
        result = pipe(
            audio_path,
            chunk_length_s=30,
            generate_kwargs={"task": "transcribe", "language": "sv"},
        )
        return result["text"]

    except Exception as e:
        return f"Transcription failed: {str(e)}"

    finally:
        # Best-effort cleanup of the temporary WAV so repeated requests do
        # not accumulate converted copies on disk.
        if converted_path is not None:
            try:
                os.remove(converted_path)
            except OSError:
                # The file may never have been created (conversion failed
                # early); nothing useful to report to the user here.
                pass

# Web front end: a single audio upload wired to `transcribe`, with the
# transcript shown in a text box. Built and launched in one expression.
gr.Interface(
    title="Swedish Speech Transcriber with KB-Whisper",
    description="Supports .m4a, .mp3, .wav files. Transcribes spoken Swedish using KBLab's Whisper Large model. May take time on CPU.",
    fn=transcribe,
    # type="filepath" makes Gradio hand `transcribe` a path string.
    inputs=gr.Audio(type="filepath", label="Upload Audio (.m4a, .mp3, .wav)"),
    outputs=gr.Textbox(label="Swedish Transcript"),
).launch()