asr-inference

Running on Zero

App Files Files Community

Update whisper.py

by ssolito - opened Jun 19

base: refs/heads/main

←

from: refs/pr/5

Discussion Files changed

+87

-176

Files changed (1) hide show

whisper.py +87 -176

whisper.py CHANGED Viewed

@@ -1,35 +1,59 @@
-from pyannote.audio import Pipeline
 from pydub import AudioSegment
 import os
-from transformers import WhisperForConditionalGeneration, WhisperProcessor
 import torchaudio
 import torch
 import re
 from transformers import pipeline
 import spaces
 device = 0 if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float32
-MODEL_NAME = "openai/whisper-large-v3"
-CKPT = "projecte-aina/whisper-large-v3-tiny-caesar"
 BATCH_SIZE = 1
-model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch_dtype).to(device)
-processor = WhisperProcessor.from_pretrained(MODEL_NAME)
-pipeline_vad = Pipeline.from_pretrained("./pyannote/config.yaml")
-threshold = 10000
-segments_dir = "."
 pipe = pipeline(
     task="automatic-speech-recognition",
-    model=CKPT,
     chunk_length_s=30,
     device=device,
     token=os.getenv("HF_TOKEN")
     )
 def post_process_transcription(transcription, max_repeats=2):
     tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
@@ -56,151 +80,6 @@ def post_process_transcription(transcription, max_repeats=2):
     return cleaned_transcription
-def convert_forced_to_tokens(forced_decoder_ids):
-    forced_decoder_tokens = []
-    for i, (idx, token) in enumerate(forced_decoder_ids):
-        if token is not None:
-            forced_decoder_tokens.append([idx, processor.tokenizer.decode(token)])
-        else:
-            forced_decoder_tokens.append([idx, token])
-    return forced_decoder_tokens
-def generate_1st_chunk(audio):
-    input_audio, sample_rate = torchaudio.load(audio)
-    input_audio = torchaudio.transforms.Resample(sample_rate, 16000)(input_audio)
-    input_speech = input_audio[0]
-    input_features = processor(input_speech,
-                                    sampling_rate=16_000,
-                                    return_tensors="pt", torch_dtype=torch_dtype).input_features.to(device)
-    forced_decoder_ids = []
-    forced_decoder_ids.append([1,50270]) #[1, '<|ca|>']
-    forced_decoder_ids.append([2,50262]) #[2, '<|es|>']
-    forced_decoder_ids.append([3,50360]) #[3, '<|transcribe|>']
-    forced_decoder_ids_modified = forced_decoder_ids
-    idx = processor.tokenizer.all_special_tokens.index("<|startofprev|>")
-    forced_bos_token_id = processor.tokenizer.all_special_ids[idx]
-    prompt = "Antes de 'digui'm', '112'. 112, digui'm. Hola, puc parlar en castellà? Sí, digui, diga. Sí, mire: a veces al abrir la puerta de mi piso tengo una persona ahí. Vale, avisamos a la Guàrdia Urbana, ¿de acuerdo? Vale, perfecto. Gracias. Gracias. Buen día."
-    prompt_tokens = processor.tokenizer(prompt, add_special_tokens=False).input_ids
-    # we need to force these tokens
-    forced_decoder_ids = []
-    for idx, token in enumerate(prompt_tokens):
-        # indexing starts from 1 for forced tokens (token at position 0 is the SOS token)
-        forced_decoder_ids.append([idx + 1, token])
-    # now we add the SOS token at the end
-    offset = len(forced_decoder_ids)
-    forced_decoder_ids.append([offset + 1, model.generation_config.decoder_start_token_id])
-    # now we need to append the rest of the prefix tokens (lang, task, timestamps)
-    offset = len(forced_decoder_ids)
-    for idx, token in forced_decoder_ids_modified:
-        forced_decoder_ids.append([idx + offset , token])
-    model.generation_config.forced_decoder_ids = forced_decoder_ids
-    pred_ids = model.generate(input_features,
-                                    return_timestamps=True,
-                                    max_new_tokens=128,
-                                    decoder_start_token_id=forced_bos_token_id)
-    #exclude prompt from output
-    forced_decoder_tokens = convert_forced_to_tokens(forced_decoder_ids)
-    output = processor.decode(pred_ids[0][len(forced_decoder_tokens) + 1:], skip_special_tokens=True)
-    return output[1:]
-def generate_2nd_chuk(audio):
-    input_audio, sample_rate = torchaudio.load(audio)
-    input_audio = torchaudio.transforms.Resample(sample_rate, 16000)(input_audio)
-    input_speech = input_audio[0]
-    input_features = processor(input_speech,
-                                    sampling_rate=16_000,
-                                    return_tensors="pt", torch_dtype=torch_dtype).input_features.to(device)
-    forced_decoder_ids = []
-    forced_decoder_ids.append([1,50270]) #[1, '<|ca|>']
-    forced_decoder_ids.append([2,50262]) #[2, '<|es|>']
-    forced_decoder_ids.append([3,50360]) #[3, '<|transcribe|>']
-    forced_decoder_ids_modified = forced_decoder_ids
-    idx = processor.tokenizer.all_special_tokens.index("<|startofprev|>")
-    forced_bos_token_id = processor.tokenizer.all_special_ids[idx]
-    prompt = "112, digui'm. Hola, puc parlar en castellà? Sí, digui, diga. Sí, mire: a veces al abrir la puerta de mi piso tengo una persona ahí. Vale, avisamos a la Guàrdia Urbana, ¿de acuerdo? Vale, perfecto. Gracias. Gracias. Buen día."
-    prompt_tokens = processor.tokenizer(prompt, add_special_tokens=False).input_ids
-    # we need to force these tokens
-    forced_decoder_ids = []
-    for idx, token in enumerate(prompt_tokens):
-        # indexing starts from 1 for forced tokens (token at position 0 is the SOS token)
-        forced_decoder_ids.append([idx + 1, token])
-    # now we add the SOS token at the end
-    offset = len(forced_decoder_ids)
-    forced_decoder_ids.append([offset + 1, model.generation_config.decoder_start_token_id])
-    # now we need to append the rest of the prefix tokens (lang, task, timestamps)
-    offset = len(forced_decoder_ids)
-    for idx, token in forced_decoder_ids_modified:
-        forced_decoder_ids.append([idx + offset , token])
-    model.generation_config.forced_decoder_ids = forced_decoder_ids
-    pred_ids = model.generate(input_features,
-                                    return_timestamps=True,
-                                    max_new_tokens=128,
-                                    decoder_start_token_id=forced_bos_token_id)
-    #exclude prompt from output
-    forced_decoder_tokens = convert_forced_to_tokens(forced_decoder_ids)
-    output = processor.decode(pred_ids[0][len(forced_decoder_tokens) + 1:], skip_special_tokens=True)
-    return output[1:]
-def processing_vad_threshold(audio, output_vad, threshold, max_duration, concatenated_segment):
-    transcription_audio = ""
-    is_first_chunk = True
-    for speech in output_vad.get_timeline().support():
-        start, end = speech.start, speech.end
-        segment_duration = (end - start) * 1000
-        segment_audio = audio[start * 1000:end * 1000]
-        if max_duration + segment_duration < threshold:
-            concatenated_segment += audio[start * 1000:end * 1000]
-            max_duration += segment_duration
-        else:
-            if len(concatenated_segment) > 0:
-                temp_segment_path = os.path.join(segments_dir, f"temp_segment.wav")
-                concatenated_segment.export(temp_segment_path, format="wav")
-                if is_first_chunk:
-                    output = generate_1st_chunk(temp_segment_path)
-                    is_first_chunk = False
-                else:
-                    output = generate_2nd_chuk(temp_segment_path)
-                transcription_audio = transcription_audio + output
-                max_duration = segment_duration
-                concatenated_segment = segment_audio
-    # Process any remaining audio in the concatenated_segment
-    if len(concatenated_segment) > 0:
-        temp_segment_path = os.path.join(segments_dir, f"temp_segment.wav")
-        concatenated_segment.export(temp_segment_path, format="wav")
-        output = generate_2nd_chuk(temp_segment_path)
-        transcription_audio = transcription_audio + output
-    return(transcription_audio)
 def format_audio(audio_path):
     input_audio, sample_rate = torchaudio.load(audio_path)
@@ -212,34 +91,66 @@ def format_audio(audio_path):
     input_audio = input_audio.squeeze().numpy()
     return(input_audio)
 def transcribe_pipeline(audio, task):
     text = pipe(audio, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
     return text
-def generate(audio_path, use_v5):
-    audio = AudioSegment.from_wav(audio_path)
-    temp_mono_path = None
-    if audio.channels != 1: #stereo2mono
-       audio = audio.set_channels(1)
-       temp_mono_path = "temp_mono.wav"
-       audio.export(temp_mono_path, format="wav")
-       audio_path = temp_mono_path
-    output_vad = pipeline_vad(audio_path)
-    concatenated_segment = AudioSegment.empty()
-    max_duration = 0
-    if use_v5:
-        output = processing_vad_threshold(audio, output_vad, threshold, max_duration, concatenated_segment)
-    else:
-        task = "transcribe"
         output = transcribe_pipeline(format_audio(audio_path), task)
-    clean_output = post_process_transcription(output)
     if temp_mono_path and os.path.exists(temp_mono_path):
-       os.remove(temp_mono_path)
     return clean_output

 from pydub import AudioSegment
 import os
+from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer
 import torchaudio
 import torch
 import re
 from transformers import pipeline
+from peft import PeftModel, PeftConfig
 import spaces
 # Device handed to the v1 pipeline: CUDA index 0 when a GPU is visible, else CPU.
 device = 0 if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float32

 ### Configuration
 # Relative path handed to PeftConfig / PeftModel below (v2 adapter).
 MODEL_NAME_V2 = "./whisper-large-v3-catalan"
 # Checkpoint id used directly by the v1 pipeline.
 MODEL_NAME_V1 = "projecte-aina/whisper-large-v3-tiny-caesar"
 # Value passed as chunk_length_s to both pipelines.
 CHUNK_LENGTH = 30
 BATCH_SIZE = 1

 # v1: ASR pipeline built straight from the MODEL_NAME_V1 checkpoint.
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME_V1,
     chunk_length_s=CHUNK_LENGTH,
     device=device,
     token=os.getenv("HF_TOKEN"),
 )

 # v2: load the base model named in the adapter config, then wrap it with the
 # PEFT adapter stored at MODEL_NAME_V2.
 peft_config = PeftConfig.from_pretrained(MODEL_NAME_V2)
 model = WhisperForConditionalGeneration.from_pretrained(
     peft_config.base_model_name_or_path,
     device_map="auto",
 )
 task = "transcribe"
 model = PeftModel.from_pretrained(model, MODEL_NAME_V2)
 model.config.use_cache = True

 # Tokenizer / processor come from the same base checkpoint as the model.
 tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, task=task)
 processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, task=task)
 feature_extractor = processor.feature_extractor
 # NOTE(review): not referenced by any code visible in this diff; kept in case
 # it is used elsewhere in the file.
 forced_decoder_ids = processor.get_decoder_prompt_ids(task=task)

 # No explicit device here: the model was loaded with device_map="auto".
 asr_pipe = pipeline(
     task="automatic-speech-recognition",
     model=model,
     tokenizer=tokenizer,
     feature_extractor=feature_extractor,
     chunk_length_s=CHUNK_LENGTH,
 )
def asr(audio_path, task):
    """Run the v2 (PEFT-adapted) ASR pipeline on a single input.

    Args:
        audio_path: Input forwarded unchanged to ``asr_pipe``.
        task: Value sent to generation as ``generate_kwargs["task"]``.

    Returns:
        The raw result object produced by ``asr_pipe``, requested with
        ``return_timestamps=True``.
    """
    return asr_pipe(
        audio_path,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
 def post_process_transcription(transcription, max_repeats=2):
     tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
     return cleaned_transcription
 def format_audio(audio_path):
     input_audio, sample_rate = torchaudio.load(audio_path)
     input_audio = input_audio.squeeze().numpy()
     return(input_audio)
def split_stereo_channels(audio_path):
    """Split a two-channel WAV file into two mono WAV files.

    The first channel is written to ``temp_mono_speaker1.wav`` and the second
    to ``temp_mono_speaker2.wav``, both relative to the current working
    directory.

    Args:
        audio_path: Path of the WAV file to split.

    Raises:
        ValueError: If the file does not split into exactly two channels.
    """
    audio = AudioSegment.from_wav(audio_path)
    channels = audio.split_to_mono()
    if len(channels) != 2:
        raise ValueError(f"Audio {audio_path} does not have 2 channels.")
    # pydub documents split_to_mono() as returning the left channel at index 0
    # and the right channel at index 1.
    channels[0].export("temp_mono_speaker1.wav", format="wav")  # Left
    channels[1].export("temp_mono_speaker2.wav", format="wav")  # Right
 def transcribe_pipeline(audio, task):
     """Transcribe ``audio`` with the v1 pipeline and return only its text.

     ``task`` is forwarded to generation as ``generate_kwargs["task"]``; the
     pipeline is called with ``return_timestamps=True`` and the ``"text"``
     entry of its result is returned.
     """
     generation_options = {"task": task}
     result = pipe(
         audio,
         batch_size=BATCH_SIZE,
         generate_kwargs=generation_options,
         return_timestamps=True,
     )
     return result["text"]
def generate(audio_path, use_v2):
    """Transcribe a WAV file with the v2 (per-channel) or v1 (mono) path.

    With ``use_v2`` true, the file is split into two mono channel files, each
    is transcribed with ``asr``, and the timestamped chunks of both channels
    are merged in start-time order into lines of the form
    ``[<start>s - <end>s] <speaker>: <text>``. Otherwise the audio is
    downmixed to mono when needed and transcribed with
    ``transcribe_pipeline``.

    Args:
        audio_path: Path of the input WAV file.
        use_v2: Selects the per-channel v2 path when truthy.

    Returns:
        The transcript after ``post_process_transcription(..., max_repeats=1)``.

    Raises:
        ValueError: Propagated from ``split_stereo_channels`` when the v2 path
            is given audio that does not have two channels.
    """
    # Bind these before branching. Previously ``task`` was assigned only inside
    # the downmix branch, which made it function-local and left it unbound for
    # the v2 path and for already-mono audio; ``temp_mono_path`` was likewise
    # unbound on the v2 path when the cleanup code read it.
    task = "transcribe"
    temp_mono_path = None
    try:
        if use_v2:
            split_stereo_channels(audio_path)
            # NOTE(review): the file written from the second channel is
            # labelled "Speaker 1" and the first-channel file "Speaker 2";
            # mapping kept as-is, confirm it matches the recordings.
            labelled_inputs = (
                ("temp_mono_speaker2.wav", "Speaker 1"),
                ("temp_mono_speaker1.wav", "Speaker 2"),
            )
            segments = []
            for channel_path, speaker in labelled_inputs:
                result = asr(format_audio(channel_path), task)
                segments.extend(
                    (
                        chunk["timestamp"][0],
                        chunk["timestamp"][1],
                        speaker,
                        post_process_transcription(chunk["text"]),
                    )
                    for chunk in result["chunks"]
                )
            # Stable sort by start time; a missing start sorts last rather than
            # raising on a None/float comparison.
            segments.sort(
                key=lambda seg: float("inf") if seg[0] is None else seg[0]
            )
            output = "".join(
                f"[{_format_timestamp(start)} - {_format_timestamp(end)}] "
                f"{speaker}: {text}\n"
                for start, end, speaker, text in segments
            )
        else:
            audio = AudioSegment.from_wav(audio_path)
            if audio.channels != 1:  # stereo -> mono
                temp_mono_path = "temp_mono.wav"
                audio.set_channels(1).export(temp_mono_path, format="wav")
                audio_path = temp_mono_path
            output = transcribe_pipeline(format_audio(audio_path), task)
        # NOTE(review): this pass also runs over the timestamped v2 transcript
        # whose segments were already post-processed; confirm it preserves the
        # "[start - end] Speaker:" markup (the original carried a "#check").
        return post_process_transcription(output, max_repeats=1)
    finally:
        # Remove temporary files even when transcription fails.
        leftovers = [temp_mono_path, "temp_mono_speaker1.wav", "temp_mono_speaker2.wav"]
        for temp_file in leftovers:
            if temp_file and os.path.exists(temp_file):
                os.remove(temp_file)


def _format_timestamp(seconds):
    """Render a chunk timestamp as ``<seconds>s`` with two decimals, or ``?`` when missing."""
    return "?" if seconds is None else f"{seconds:.2f}s"