openfree committed
Commit f1697bd · verified · 1 Parent(s): 8000eeb

Update app.py

Files changed (1)
  1. app.py +108 -24

app.py CHANGED
@@ -74,20 +74,34 @@ async def process_audio_chunk(audio_data, src_lang):
         # Gradio returns a (sample_rate, audio_array) tuple
         if isinstance(audio_data, tuple):
             sample_rate, audio_array = audio_data
-            # Convert the numpy array to a WAV file
-            import numpy as np
-            import wave
 
+            # Ignore audio that is too short (under 0.5 s)
+            if len(audio_array) < sample_rate * 0.5:
+                return ""
+
+            # Normalize the audio and filter out noise
+            audio_array = audio_array.astype(np.float32)
+
+            # Silence detection - skip the chunk if its RMS is too low
+            rms = np.sqrt(np.mean(audio_array**2))
+            if rms < 0.01:  # silence threshold
+                return ""
+
+            # Normalization
+            max_val = np.max(np.abs(audio_array))
+            if max_val > 0:
+                audio_array = audio_array / max_val * 0.95
+
+            # Convert the numpy array to a WAV file
             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                 with wave.open(tmp.name, 'wb') as wav_file:
                     wav_file.setnchannels(1)  # mono
                     wav_file.setsampwidth(2)  # 16-bit
                     wav_file.setframerate(sample_rate)
 
-                    # Convert the numpy array to 16-bit PCM
-                    if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
-                        audio_array = (audio_array * 32767).astype(np.int16)
-                    wav_file.writeframes(audio_array.tobytes())
+                    # Convert float32 to 16-bit PCM
+                    audio_int16 = (audio_array * 32767).astype(np.int16)
+                    wav_file.writeframes(audio_int16.tobytes())
                 tmp_path = tmp.name
         else:
             # The data arrived as raw bytes
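A caveat on the silence gate above: the 0.01 RMS threshold assumes float-scale samples in [-1, 1], but Gradio often delivers microphone audio as int16, and `astype(np.float32)` preserves the ±32768 scale, so int16 input should be divided by 32768 before the check. Below is a minimal, self-contained sketch of the gate with that scaling added; the helper name `is_speech` and its default arguments are illustrative, not part of the commit.

```python
import numpy as np

def is_speech(audio_array: np.ndarray, sample_rate: int,
              min_duration: float = 0.5, rms_threshold: float = 0.01) -> bool:
    """Reject chunks that are too short or too quiet before
    paying for a Whisper round-trip (mirrors the gate above)."""
    if len(audio_array) < sample_rate * min_duration:
        return False  # under min_duration seconds of audio
    samples = audio_array.astype(np.float32)
    if np.issubdtype(audio_array.dtype, np.integer):
        samples /= 32768.0  # bring int16 PCM onto the [-1, 1] scale first
    rms = np.sqrt(np.mean(samples ** 2))
    return rms >= rms_threshold

# 1 s of near-silence vs. 1 s of a 440 Hz tone, at an assumed 16 kHz rate
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
print(is_speech(0.001 * np.random.randn(sr), sr))        # False: below threshold
print(is_speech(0.5 * np.sin(2 * np.pi * 440 * t), sr))  # True: clear signal
```

Normalizing the surviving chunk to a 0.95 peak, as the commit does, keeps the later 16-bit conversion just below full scale and avoids clipping at ±32767.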
@@ -95,16 +109,55 @@ async def process_audio_chunk(audio_data, src_lang):
                 tmp.write(audio_data)
                 tmp_path = tmp.name
 
-        # Transcribe with the Whisper API
+        # Transcribe with the Whisper API - pass a language hint and a prompt
         with open(tmp_path, 'rb') as audio_file:
+            # Per-language prompts (kept in their own languages) to curb hallucination
+            language_prompts = {
+                "Korean": "이것은 한국어 대화입니다.",
+                "English": "This is an English conversation.",
+                "Japanese": "これは日本語の会話です。",
+                "Chinese": "这是中文对话。",
+            }
+
+            prompt = language_prompts.get(src_lang, "")
+
             transcript = await openai.AsyncClient().audio.transcriptions.create(
                 model="whisper-1",
                 file=audio_file,
-                language=src_lang[:2].lower()
+                language=src_lang[:2].lower(),
+                prompt=prompt,
+                temperature=0.0  # more conservative decoding
             )
 
         os.unlink(tmp_path)  # remove the temporary file
-        return transcript.text
+
+        # Post-process the result - strip repeated patterns
+        text = transcript.text.strip()
+
+        # Collapse consecutively repeated sentences
+        sentences = text.split('.')
+        if len(sentences) > 1:
+            unique_sentences = []
+            for sent in sentences:
+                sent = sent.strip()
+                if sent and (not unique_sentences or sent != unique_sentences[-1]):
+                    unique_sentences.append(sent)
+            text = '. '.join(unique_sentences)
+            if text and not text.endswith('.'):
+                text += '.'
+
+        # Detect and drop news-broadcast hallucination patterns
+        # (Korean phrases: station names, "newsroom", "news desk", "anchor", "...reporting")
+        hallucination_patterns = [
+            "MBC 뉴스", "KBS 뉴스", "SBS 뉴스", "JTBC 뉴스",
+            "뉴스룸", "뉴스데스크", "앵커", "기자입니다"
+        ]
+
+        # Ignore short transcripts in which a news pattern shows up
+        if len(text) < 50 and any(pattern in text for pattern in hallucination_patterns):
+            return ""
+
+        return text
+
     except Exception as e:
         print(f"STT error: {e}")
         return ""
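Two things are worth noting about the new transcription path. First, `prompt` biases Whisper toward the expected language and register, and `temperature=0.0` makes decoding deterministic; both reduce the hallucinated broadcast boilerplate this commit is fighting. Second, the repeat-collapsing logic only merges *consecutive* duplicate sentences and treats every '.' as a sentence boundary (so abbreviations split too), which is usually acceptable for short streamed chunks. A self-contained sketch of that logic, under a hypothetical name `collapse_repeats`:

```python
def collapse_repeats(text: str) -> str:
    """Collapse consecutively repeated sentences, a common Whisper
    failure mode on low-information audio (mirrors the commit's logic)."""
    sentences = text.split('.')
    if len(sentences) <= 1:
        return text
    unique = []
    for sent in sentences:
        sent = sent.strip()
        if sent and (not unique or sent != unique[-1]):
            unique.append(sent)
    out = '. '.join(unique)
    if out and not out.endswith('.'):
        out += '.'
    return out

print(collapse_repeats("Hello there. Hello there. Hello there."))
# -> "Hello there."
```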
@@ -142,9 +195,9 @@ def realtime_single_sync(audio, src, tgt, state):
         state["sample_rate"] = sample_rate
         state["audio_buffer"].append(audio_array)
 
-        # Process only once the buffer has filled up (about 1-2 s of audio)
+        # Process only once the buffer has filled up (about 2-3 s of audio)
         buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
-        if buffer_duration >= 1.5:  # process every 1.5 seconds
+        if buffer_duration >= 2.0:  # process every 2 seconds
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
 
@@ -211,7 +264,7 @@ def realtime_four_sync(audio, src, state):
 
         # Process only once the buffer has filled up
         buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
-        if buffer_duration >= 1.5:  # process every 1.5 seconds
+        if buffer_duration >= 2.0:  # process every 2 seconds
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
 
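Both realtime handlers now wait for 2.0 s of buffered audio, up from 1.5 s, before dispatching a Whisper request; `buffer_duration` is simply the total number of buffered samples divided by the sample rate. A worked example, assuming 16 kHz capture and the 0.5 s chunks that `stream_every=0.5` produces:

```python
import numpy as np

sample_rate = 16000  # assumed capture rate
# Four 0.5 s chunks, as delivered at stream_every=0.5
audio_buffer = [np.zeros(8000, dtype=np.float32) for _ in range(4)]
buffer_duration = len(np.concatenate(audio_buffer)) / sample_rate
print(buffer_duration)         # 2.0
print(buffer_duration >= 2.0)  # True -> this call flushes the buffer to STT
```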
@@ -271,34 +324,65 @@ with gr.Blocks(title="SMARTok Demo") as demo:
         with gr.TabItem("⏱️ Real-time 1"):
             src3 = gr.Dropdown(LANG, value="Korean", label="Input language")
             tgt3 = gr.Dropdown(LANG, value="English", label="Output language")
-            mic3 = gr.Audio(sources=["microphone"], streaming=True)
-            o3 = gr.Textbox(label="Source (real-time)", lines=8)
-            t3 = gr.Textbox(label="Translation (real-time)", lines=8)
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("🎤 **Microphone input**")
+                    mic3 = gr.Audio(
+                        sources=["microphone"],
+                        streaming=True,
+                        type="numpy",  # request the numpy format explicitly
+                        label="Microphone"
+                    )
+                    gr.Markdown("💡 **How to use**\n- Speak in sentences of about 2-3 seconds\n- Sentences that are too short or too long may be hard to recognize")
+
+                with gr.Column():
+                    o3 = gr.Textbox(label="Source (real-time)", lines=8, interactive=False)
+                    t3 = gr.Textbox(label="Translation (real-time)", lines=8, interactive=False)
+
             st3 = gr.State()
 
             # Revised stream() call
             mic3.stream(
                 realtime_single_sync,
                 inputs=[mic3, src3, tgt3, st3],
-                outputs=[o3, t3, st3]
+                outputs=[o3, t3, st3],
+                time_limit=30,  # 30-second cap per stream
+                stream_every=0.5  # send a chunk every 0.5 s
             )
 
         # Tab 4 - real-time, four languages
         with gr.TabItem("🌍 Real-time 4"):
             src4 = gr.Dropdown(LANG, value="Korean", label="Input language")
-            mic4 = gr.Audio(sources=["microphone"], streaming=True)
-            o4 = gr.Textbox(label="Source", lines=8)
-            e4 = gr.Textbox(label="English", lines=8)
-            c4 = gr.Textbox(label="Chinese (Simplified)", lines=8)
-            th4 = gr.Textbox(label="Thai", lines=8)
-            r4 = gr.Textbox(label="Russian", lines=8)
+
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("🎤 **Microphone input**")
+                    mic4 = gr.Audio(
+                        sources=["microphone"],
+                        streaming=True,
+                        type="numpy",
+                        label="Microphone"
+                    )
+                    o4 = gr.Textbox(label="Source", lines=8, interactive=False)
+
+                with gr.Column(scale=2):
+                    with gr.Row():
+                        e4 = gr.Textbox(label="English", lines=8, interactive=False)
+                        c4 = gr.Textbox(label="Chinese (Simplified)", lines=8, interactive=False)
+                    with gr.Row():
+                        th4 = gr.Textbox(label="Thai", lines=8, interactive=False)
+                        r4 = gr.Textbox(label="Russian", lines=8, interactive=False)
+
             st4 = gr.State()
 
             # Revised stream() call
             mic4.stream(
                 realtime_four_sync,
                 inputs=[mic4, src4, st4],
-                outputs=[o4, e4, c4, th4, r4, st4]
+                outputs=[o4, e4, c4, th4, r4, st4],
+                time_limit=30,
+                stream_every=0.5
             )
 
 demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
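On the new `stream()` keywords: `stream_every` controls how often the browser posts a fresh audio chunk, and `time_limit` caps the length of a single streaming session; both are parameters of Gradio's streaming event listeners in recent releases, though exact availability depends on the installed version. Because the handlers buffer 2.0 s of audio before calling Whisper, the 0.5 s chunk rate does not translate into an API call every 0.5 s. A minimal wiring sketch with a hypothetical `transcribe_stub` handler:

```python
import gradio as gr

def transcribe_stub(audio, state):
    # Hypothetical handler: the real ones buffer `audio` and call Whisper
    return "…", state

with gr.Blocks() as sketch:
    mic = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
    out = gr.Textbox()
    state = gr.State()
    mic.stream(transcribe_stub, inputs=[mic, state], outputs=[out, state],
               time_limit=30, stream_every=0.5)
```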
 