openfree committed
Commit f1697bd · verified · 1 Parent(s): 8000eeb

Update app.py

Files changed (1)
  1. app.py +108 -24

app.py CHANGED
@@ -74,20 +74,34 @@ async def process_audio_chunk(audio_data, src_lang):
         # Gradio returns a (sample_rate, audio_array) tuple
         if isinstance(audio_data, tuple):
             sample_rate, audio_array = audio_data
-            # Convert the numpy array to a WAV file
-            import numpy as np
-            import wave
 
+            # Ignore audio that is too short (under 0.5 s)
+            if len(audio_array) < sample_rate * 0.5:
+                return ""
+
+            # Normalize the audio and filter out noise
+            audio_array = audio_array.astype(np.float32)
+
+            # Silence detection - skip the chunk if its RMS is too low
+            rms = np.sqrt(np.mean(audio_array**2))
+            if rms < 0.01:  # silence threshold
+                return ""
+
+            # Normalization
+            max_val = np.max(np.abs(audio_array))
+            if max_val > 0:
+                audio_array = audio_array / max_val * 0.95
+
+            # Convert the numpy array to a WAV file
             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                 with wave.open(tmp.name, 'wb') as wav_file:
                     wav_file.setnchannels(1)  # mono
                     wav_file.setsampwidth(2)  # 16-bit
                     wav_file.setframerate(sample_rate)
 
-                    # Convert the numpy array to 16-bit PCM
-                    if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
-                        audio_array = (audio_array * 32767).astype(np.int16)
-                    wav_file.writeframes(audio_array.tobytes())
+                    # Convert float32 to 16-bit PCM
+                    audio_int16 = (audio_array * 32767).astype(np.int16)
+                    wav_file.writeframes(audio_int16.tobytes())
                 tmp_path = tmp.name
         else:
             # The data arrived as raw bytes
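A caveat on the silence gate above: the 0.01 RMS threshold assumes float-scale samples in [-1, 1], but Gradio often delivers microphone audio as int16, and `astype(np.float32)` preserves the ±32768 scale, so int16 input should be divided by 32768 before the check. Below is a minimal, self-contained sketch of the gate with that scaling added; the helper name `is_speech` and its default arguments are illustrative, not part of the commit.

```python
import numpy as np

def is_speech(audio_array: np.ndarray, sample_rate: int,
              min_duration: float = 0.5, rms_threshold: float = 0.01) -> bool:
    """Reject chunks that are too short or too quiet before
    paying for a Whisper round-trip (mirrors the gate above)."""
    if len(audio_array) < sample_rate * min_duration:
        return False  # under min_duration seconds of audio
    samples = audio_array.astype(np.float32)
    if np.issubdtype(audio_array.dtype, np.integer):
        samples /= 32768.0  # bring int16 PCM onto the [-1, 1] scale first
    rms = np.sqrt(np.mean(samples ** 2))
    return rms >= rms_threshold

# 1 s of near-silence vs. 1 s of a 440 Hz tone, at an assumed 16 kHz rate
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
print(is_speech(0.001 * np.random.randn(sr), sr))        # False: below threshold
print(is_speech(0.5 * np.sin(2 * np.pi * 440 * t), sr))  # True: clear signal
```

Normalizing the surviving chunk to a 0.95 peak, as the commit does, keeps the later 16-bit conversion just below full scale and avoids clipping at ±32767.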
@@ -95,16 +109,55 @@ async def process_audio_chunk(audio_data, src_lang):
                 tmp.write(audio_data)
                 tmp_path = tmp.name
 
-        # Transcribe with the Whisper API
+        # Transcribe with the Whisper API - pass a language hint and a prompt
         with open(tmp_path, 'rb') as audio_file:
+            # Per-language prompts (kept in their own languages) to curb hallucination
+            language_prompts = {
+                "Korean": "이것은 한국어 대화입니다.",
+                "English": "This is an English conversation.",
+                "Japanese": "これは日本語の会話です。",
+                "Chinese": "这是中文对话。",
+            }
+
+            prompt = language_prompts.get(src_lang, "")
+
             transcript = await openai.AsyncClient().audio.transcriptions.create(
                 model="whisper-1",
                 file=audio_file,
-                language=src_lang[:2].lower()
+                language=src_lang[:2].lower(),
+                prompt=prompt,
+                temperature=0.0  # more conservative decoding
             )
 
         os.unlink(tmp_path)  # remove the temporary file
-        return transcript.text
+
+        # Post-process the result - strip repeated patterns
+        text = transcript.text.strip()
+
+        # Collapse consecutively repeated sentences
+        sentences = text.split('.')
+        if len(sentences) > 1:
+            unique_sentences = []
+            for sent in sentences:
+                sent = sent.strip()
+                if sent and (not unique_sentences or sent != unique_sentences[-1]):
+                    unique_sentences.append(sent)
+            text = '. '.join(unique_sentences)
+            if text and not text.endswith('.'):
+                text += '.'
+
+        # Detect and drop news-broadcast hallucination patterns
+        # (Korean phrases: station names, "newsroom", "news desk", "anchor", "...reporting")
+        hallucination_patterns = [
+            "MBC 뉴스", "KBS 뉴스", "SBS 뉴스", "JTBC 뉴스",
+            "뉴스룸", "뉴스데스크", "앵커", "기자입니다"
+        ]
+
+        # Ignore short transcripts in which a news pattern shows up
+        if len(text) < 50 and any(pattern in text for pattern in hallucination_patterns):
+            return ""
+
+        return text
+
     except Exception as e:
         print(f"STT error: {e}")
         return ""
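Two things are worth noting about the new transcription path. First, `prompt` biases Whisper toward the expected language and register, and `temperature=0.0` makes decoding deterministic; both reduce the hallucinated broadcast boilerplate this commit is fighting. Second, the repeat-collapsing logic only merges *consecutive* duplicate sentences and treats every '.' as a sentence boundary (so abbreviations split too), which is usually acceptable for short streamed chunks. A self-contained sketch of that logic, under a hypothetical name `collapse_repeats`:

```python
def collapse_repeats(text: str) -> str:
    """Collapse consecutively repeated sentences, a common Whisper
    failure mode on low-information audio (mirrors the commit's logic)."""
    sentences = text.split('.')
    if len(sentences) <= 1:
        return text
    unique = []
    for sent in sentences:
        sent = sent.strip()
        if sent and (not unique or sent != unique[-1]):
            unique.append(sent)
    out = '. '.join(unique)
    if out and not out.endswith('.'):
        out += '.'
    return out

print(collapse_repeats("Hello there. Hello there. Hello there."))
# -> "Hello there."
```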
@@ -142,9 +195,9 @@ def realtime_single_sync(audio, src, tgt, state):
         state["sample_rate"] = sample_rate
         state["audio_buffer"].append(audio_array)
 
-        # Process only once the buffer has filled up (about 1-2 s of audio)
+        # Process only once the buffer has filled up (about 2-3 s of audio)
         buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
-        if buffer_duration >= 1.5:  # process every 1.5 seconds
+        if buffer_duration >= 2.0:  # process every 2 seconds
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
 
@@ -211,7 +264,7 @@ def realtime_four_sync(audio, src, state):
 
         # Process only once the buffer has filled up
         buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
-        if buffer_duration >= 1.5:  # process every 1.5 seconds
+        if buffer_duration >= 2.0:  # process every 2 seconds
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
 
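Both realtime handlers now wait for 2.0 s of buffered audio, up from 1.5 s, before dispatching a Whisper request; `buffer_duration` is simply the total number of buffered samples divided by the sample rate. A worked example, assuming 16 kHz capture and the 0.5 s chunks that `stream_every=0.5` produces:

```python
import numpy as np

sample_rate = 16000  # assumed capture rate
# Four 0.5 s chunks, as delivered at stream_every=0.5
audio_buffer = [np.zeros(8000, dtype=np.float32) for _ in range(4)]
buffer_duration = len(np.concatenate(audio_buffer)) / sample_rate
print(buffer_duration)         # 2.0
print(buffer_duration >= 2.0)  # True -> this call flushes the buffer to STT
```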
@@ -271,34 +324,65 @@ with gr.Blocks(title="SMARTok Demo") as demo:
         with gr.TabItem("⏱️ Real-time 1"):
             src3 = gr.Dropdown(LANG, value="Korean", label="Input language")
             tgt3 = gr.Dropdown(LANG, value="English", label="Output language")
-            mic3 = gr.Audio(sources=["microphone"], streaming=True)
-            o3 = gr.Textbox(label="Source (real-time)", lines=8)
-            t3 = gr.Textbox(label="Translation (real-time)", lines=8)
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("🎤 **Microphone input**")
+                    mic3 = gr.Audio(
+                        sources=["microphone"],
+                        streaming=True,
+                        type="numpy",  # request the numpy format explicitly
+                        label="Microphone"
+                    )
+                    gr.Markdown("💡 **How to use**\n- Speak in sentences of about 2-3 seconds\n- Sentences that are too short or too long may be hard to recognize")
+
+                with gr.Column():
+                    o3 = gr.Textbox(label="Source (real-time)", lines=8, interactive=False)
+                    t3 = gr.Textbox(label="Translation (real-time)", lines=8, interactive=False)
+
             st3 = gr.State()
 
             # Revised stream() call
             mic3.stream(
                 realtime_single_sync,
                 inputs=[mic3, src3, tgt3, st3],
-                outputs=[o3, t3, st3]
+                outputs=[o3, t3, st3],
+                time_limit=30,  # 30-second cap per stream
+                stream_every=0.5  # send a chunk every 0.5 s
             )
 
         # Tab 4 - real-time, four languages
         with gr.TabItem("🌍 Real-time 4"):
             src4 = gr.Dropdown(LANG, value="Korean", label="Input language")
-            mic4 = gr.Audio(sources=["microphone"], streaming=True)
-            o4 = gr.Textbox(label="Source", lines=8)
-            e4 = gr.Textbox(label="English", lines=8)
-            c4 = gr.Textbox(label="Chinese (Simplified)", lines=8)
-            th4 = gr.Textbox(label="Thai", lines=8)
-            r4 = gr.Textbox(label="Russian", lines=8)
+
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("🎤 **Microphone input**")
+                    mic4 = gr.Audio(
+                        sources=["microphone"],
+                        streaming=True,
+                        type="numpy",
+                        label="Microphone"
+                    )
+                    o4 = gr.Textbox(label="Source", lines=8, interactive=False)
+
+                with gr.Column(scale=2):
+                    with gr.Row():
+                        e4 = gr.Textbox(label="English", lines=8, interactive=False)
+                        c4 = gr.Textbox(label="Chinese (Simplified)", lines=8, interactive=False)
+                    with gr.Row():
+                        th4 = gr.Textbox(label="Thai", lines=8, interactive=False)
+                        r4 = gr.Textbox(label="Russian", lines=8, interactive=False)
+
             st4 = gr.State()
 
             # Revised stream() call
             mic4.stream(
                 realtime_four_sync,
                 inputs=[mic4, src4, st4],
-                outputs=[o4, e4, c4, th4, r4, st4]
+                outputs=[o4, e4, c4, th4, r4, st4],
+                time_limit=30,
+                stream_every=0.5
             )
 
 demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
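On the new `stream()` keywords: `stream_every` controls how often the browser posts a fresh audio chunk, and `time_limit` caps the length of a single streaming session; both are parameters of Gradio's streaming event listeners in recent releases, though exact availability depends on the installed version. Because the handlers buffer 2.0 s of audio before calling Whisper, the 0.5 s chunk rate does not translate into an API call every 0.5 s. A minimal wiring sketch with a hypothetical `transcribe_stub` handler:

```python
import gradio as gr

def transcribe_stub(audio, state):
    # Hypothetical handler: the real ones buffer `audio` and call Whisper
    return "…", state

with gr.Blocks() as sketch:
    mic = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
    out = gr.Textbox()
    state = gr.State()
    mic.stream(transcribe_stub, inputs=[mic, state], outputs=[out, state],
               time_limit=30, stream_every=0.5)
```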
 