import os
import tempfile

import torch
import torchaudio
import gradio as gr
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
# Set up model
device = "cpu"
torch_dtype = torch.float32
model_id = "KBLab/kb-whisper-large"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
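# Note: inference here runs on CPU in float32. If a GPU runtime is available,
# switching to device = "cuda" and torch_dtype = torch.float16 would speed up
# generation considerably without other code changes.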
# Helper: Split audio into chunks
def split_audio(audio_path, chunk_duration_ms=10000):
    audio = AudioSegment.from_file(audio_path)
    chunks = [audio[i:i + chunk_duration_ms] for i in range(0, len(audio), chunk_duration_ms)]
    return chunks
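# Whisper's feature extractor works on windows of up to 30 seconds, so long recordings
# are split into short pydub chunks; this also lets partial transcripts stream to the UI.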
# Helper: Transcribe a single chunk
def transcribe_chunk(chunk):
    # Whisper expects 16 kHz mono input, so resample and downmix the chunk before export
    chunk = chunk.set_frame_rate(16000).set_channels(1)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
        chunk.export(tmpfile.name, format="wav")
        input_audio, _ = torchaudio.load(tmpfile.name)
    input_features = processor(
        input_audio.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
    ).input_features
    input_features = input_features.to(device)
    predicted_ids = model.generate(input_features, task="transcribe", language="sv")
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    os.remove(tmpfile.name)
    return transcription
# Full transcription function with progress
def transcribe_with_progress(audio_path, progress=gr.Progress()):
    # Convert non-wav uploads (e.g. .m4a, .mp3) to wav so pydub can split them reliably
    ext = os.path.splitext(audio_path)[1].lower()
    if ext != ".wav":
        sound = AudioSegment.from_file(audio_path)
        audio_path = audio_path.replace(ext, ".converted.wav")
        sound.export(audio_path, format="wav")
    chunks = split_audio(audio_path, chunk_duration_ms=8000)
    full_transcript = ""
    total_chunks = len(chunks)
    for i, chunk in enumerate(chunks):
        partial_text = transcribe_chunk(chunk)
        full_transcript += partial_text + " "
        progress((i + 1, total_chunks))  # Update progress bar (steps completed, total steps)
        yield full_transcript.strip()  # Stream updated text to UI
# UI
gr.Interface(
    fn=transcribe_with_progress,
    inputs=gr.Audio(type="filepath", label="Upload Swedish Audio"),
    outputs=gr.Textbox(label="Live Transcript (Swedish)"),
    title="Live Swedish Transcriber (KB-Whisper)",
    description="Streams the transcription chunk by chunk with a visual progress bar. Supports .m4a, .mp3, and .wav. May be slow on CPU.",
    live=True,
).launch()
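# launch() blocks and serves the app; locally it defaults to http://127.0.0.1:7860,
# while on Hugging Face Spaces the Space runtime exposes it automatically.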