asr-inference

Running on Zero

Update whisper.py

#10

by ssolito - opened 8 days ago

←

Files changed (1) hide show

whisper.py CHANGED Viewed

@@ -107,7 +107,9 @@ def transcribe_pipeline(audio, task):
     return text
 def generate(audio_path, use_v2):
     if use_v2:
         split_stereo_channels(audio_path)
@@ -126,7 +128,6 @@ def generate(audio_path, use_v2):
         right_segs = [(seg["timestamp"][0], seg["timestamp"][1], "Speaker 2", post_process_transcription(seg["text"])) for seg in right_result["chunks"]]
         merged_transcript = sorted(left_segs + right_segs, key=lambda x: x[0])
-        merged_text = " ".join([seg[3] for seg in merged_transcript])
         output = ""
         for start, end, speaker, text in merged_transcript:
@@ -134,14 +135,12 @@ def generate(audio_path, use_v2):
     else:
         audio = AudioSegment.from_wav(audio_path)
-        temp_mono_path = None
         if audio.channels != 1: #stereo2mono
             audio = audio.set_channels(1)
             temp_mono_path = "temp_mono.wav"
             audio.export(temp_mono_path, format="wav")
             audio_path = temp_mono_path
-        task = "transcribe"
         output = transcribe_pipeline(format_audio(audio_path), task)
     clean_output = post_process_transcription(output, max_repeats=1) #check

     return text
 def generate(audio_path, use_v2):
+    task = "transcribe"
+    temp_mono_path = None
     if use_v2:
         split_stereo_channels(audio_path)
         right_segs = [(seg["timestamp"][0], seg["timestamp"][1], "Speaker 2", post_process_transcription(seg["text"])) for seg in right_result["chunks"]]
         merged_transcript = sorted(left_segs + right_segs, key=lambda x: x[0])
         output = ""
         for start, end, speaker, text in merged_transcript:
     else:
         audio = AudioSegment.from_wav(audio_path)
         if audio.channels != 1: #stereo2mono
             audio = audio.set_channels(1)
             temp_mono_path = "temp_mono.wav"
             audio.export(temp_mono_path, format="wav")
             audio_path = temp_mono_path
         output = transcribe_pipeline(format_audio(audio_path), task)
     clean_output = post_process_transcription(output, max_repeats=1) #check