Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -74,20 +74,34 @@ async def process_audio_chunk(audio_data, src_lang):
|
|
74 |
# Gradio๋ (sample_rate, audio_array) ํํ์ ๋ฐํ
|
75 |
if isinstance(audio_data, tuple):
|
76 |
sample_rate, audio_array = audio_data
|
77 |
-
# numpy array๋ฅผ WAV ํ์ผ๋ก ๋ณํ
|
78 |
-
import numpy as np
|
79 |
-
import wave
|
80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
82 |
with wave.open(tmp.name, 'wb') as wav_file:
|
83 |
wav_file.setnchannels(1) # mono
|
84 |
wav_file.setsampwidth(2) # 16-bit
|
85 |
wav_file.setframerate(sample_rate)
|
86 |
|
87 |
-
#
|
88 |
-
|
89 |
-
|
90 |
-
wav_file.writeframes(audio_array.tobytes())
|
91 |
tmp_path = tmp.name
|
92 |
else:
|
93 |
# bytes 데이터인 경우
|
@@ -95,16 +109,55 @@ async def process_audio_chunk(audio_data, src_lang):
|
|
95 |
tmp.write(audio_data)
|
96 |
tmp_path = tmp.name
|
97 |
|
98 |
-
# Whisper API๋ก ๋ณํ
|
99 |
with open(tmp_path, 'rb') as audio_file:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
transcript = await openai.AsyncClient().audio.transcriptions.create(
|
101 |
model="whisper-1",
|
102 |
file=audio_file,
|
103 |
-
language=src_lang[:2].lower()
|
|
|
|
|
104 |
)
|
105 |
|
106 |
os.unlink(tmp_path) # ์์ ํ์ผ ์ญ์
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
except Exception as e:
|
109 |
print(f"STT ์ค๋ฅ: {e}")
|
110 |
return ""
|
@@ -142,9 +195,9 @@ def realtime_single_sync(audio, src, tgt, state):
|
|
142 |
state["sample_rate"] = sample_rate
|
143 |
state["audio_buffer"].append(audio_array)
|
144 |
|
145 |
-
# 버퍼가 충분히 쌓였을 때만 처리 (약
|
146 |
buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
|
147 |
-
if buffer_duration >=
|
148 |
loop = asyncio.new_event_loop()
|
149 |
asyncio.set_event_loop(loop)
|
150 |
|
@@ -211,7 +264,7 @@ def realtime_four_sync(audio, src, state):
|
|
211 |
|
212 |
# 버퍼가 충분히 쌓였을 때만 처리
|
213 |
buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
|
214 |
-
if buffer_duration >=
|
215 |
loop = asyncio.new_event_loop()
|
216 |
asyncio.set_event_loop(loop)
|
217 |
|
@@ -271,34 +324,65 @@ with gr.Blocks(title="SMARTok Demo") as demo:
|
|
271 |
with gr.TabItem("โฑ๏ธ ์ค์๊ฐ 1"):
|
272 |
src3 = gr.Dropdown(LANG, value="Korean", label="์
๋ ฅ ์ธ์ด")
|
273 |
tgt3 = gr.Dropdown(LANG, value="English", label="์ถ๋ ฅ ์ธ์ด")
|
274 |
-
|
275 |
-
|
276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
277 |
st3 = gr.State()
|
278 |
|
279 |
# stream 메서드 수정
|
280 |
mic3.stream(
|
281 |
realtime_single_sync,
|
282 |
inputs=[mic3, src3, tgt3, st3],
|
283 |
-
outputs=[o3, t3, st3]
|
|
|
|
|
284 |
)
|
285 |
|
286 |
# 탭 4 – 실시간 4언어
|
287 |
with gr.TabItem("๐ ์ค์๊ฐ 4"):
|
288 |
src4 = gr.Dropdown(LANG, value="Korean", label="์
๋ ฅ ์ธ์ด")
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
st4 = gr.State()
|
296 |
|
297 |
# stream ๋ฉ์๋ ์์
|
298 |
mic4.stream(
|
299 |
realtime_four_sync,
|
300 |
inputs=[mic4, src4, st4],
|
301 |
-
outputs=[o4, e4, c4, th4, r4, st4]
|
|
|
|
|
302 |
)
|
303 |
|
304 |
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|
|
|
74 |
# Gradio๋ (sample_rate, audio_array) ํํ์ ๋ฐํ
|
75 |
if isinstance(audio_data, tuple):
|
76 |
sample_rate, audio_array = audio_data
|
|
|
|
|
|
|
77 |
|
78 |
+
# 오디오가 너무 짧으면 무시 (0.5초 미만)
|
79 |
+
if len(audio_array) < sample_rate * 0.5:
|
80 |
+
return ""
|
81 |
+
|
82 |
+
# 오디오 정규화 및 노이즈 필터링
|
83 |
+
audio_array = audio_array.astype(np.float32)
|
84 |
+
|
85 |
+
# ๋ฌด์ ๊ฐ์ง - RMS๊ฐ ๋๋ฌด ๋ฎ์ผ๋ฉด ๋ฌด์
|
86 |
+
rms = np.sqrt(np.mean(audio_array**2))
|
87 |
+
if rms < 0.01: # ๋ฌด์ ์๊ณ๊ฐ
|
88 |
+
return ""
|
89 |
+
|
90 |
+
# 정규화
|
91 |
+
max_val = np.max(np.abs(audio_array))
|
92 |
+
if max_val > 0:
|
93 |
+
audio_array = audio_array / max_val * 0.95
|
94 |
+
|
95 |
+
# numpy array๋ฅผ WAV ํ์ผ๋ก ๋ณํ
|
96 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
97 |
with wave.open(tmp.name, 'wb') as wav_file:
|
98 |
wav_file.setnchannels(1) # mono
|
99 |
wav_file.setsampwidth(2) # 16-bit
|
100 |
wav_file.setframerate(sample_rate)
|
101 |
|
102 |
+
# float32๋ฅผ 16-bit PCM์ผ๋ก ๋ณํ
|
103 |
+
audio_int16 = (audio_array * 32767).astype(np.int16)
|
104 |
+
wav_file.writeframes(audio_int16.tobytes())
|
|
|
105 |
tmp_path = tmp.name
|
106 |
else:
|
107 |
# bytes 데이터인 경우
|
|
|
109 |
tmp.write(audio_data)
|
110 |
tmp_path = tmp.name
|
111 |
|
112 |
+
# Whisper API๋ก ๋ณํ - ์ธ์ด ํํธ์ ํ๋กฌํํธ ์ถ๊ฐ
|
113 |
with open(tmp_path, 'rb') as audio_file:
|
114 |
+
# 언어별 프롬프트 설정으로 hallucination 방지
|
115 |
+
language_prompts = {
|
116 |
+
"Korean": "์ด๊ฒ์ ํ๊ตญ์ด ๋ํ์
๋๋ค.",
|
117 |
+
"English": "This is an English conversation.",
|
118 |
+
"Japanese": "ใใใฏๆฅๆฌ่ชใฎไผ่ฉฑใงใใ",
|
119 |
+
"Chinese": "่ฟๆฏไธญๆๅฏน่ฏใ",
|
120 |
+
}
|
121 |
+
|
122 |
+
prompt = language_prompts.get(src_lang, "")
|
123 |
+
|
124 |
transcript = await openai.AsyncClient().audio.transcriptions.create(
|
125 |
model="whisper-1",
|
126 |
file=audio_file,
|
127 |
+
language=src_lang[:2].lower(),
|
128 |
+
prompt=prompt,
|
129 |
+
temperature=0.0 # ๋ ๋ณด์์ ์ธ ์ถ๋ก
|
130 |
)
|
131 |
|
132 |
os.unlink(tmp_path) # ์์ ํ์ผ ์ญ์
|
133 |
+
|
134 |
+
# 결과 후처리 - 반복되는 패턴 제거
|
135 |
+
text = transcript.text.strip()
|
136 |
+
|
137 |
+
# 같은 문장이 반복되는 경우 처리
|
138 |
+
sentences = text.split('.')
|
139 |
+
if len(sentences) > 1:
|
140 |
+
unique_sentences = []
|
141 |
+
for sent in sentences:
|
142 |
+
sent = sent.strip()
|
143 |
+
if sent and (not unique_sentences or sent != unique_sentences[-1]):
|
144 |
+
unique_sentences.append(sent)
|
145 |
+
text = '. '.join(unique_sentences)
|
146 |
+
if text and not text.endswith('.'):
|
147 |
+
text += '.'
|
148 |
+
|
149 |
+
# 뉴스 관련 hallucination 패턴 감지 및 제거
|
150 |
+
hallucination_patterns = [
|
151 |
+
"MBC ๋ด์ค", "KBS ๋ด์ค", "SBS ๋ด์ค", "JTBC ๋ด์ค",
|
152 |
+
"๋ด์ค๋ฃธ", "๋ด์ค๋ฐ์คํฌ", "์ต์ปค", "๊ธฐ์์
๋๋ค"
|
153 |
+
]
|
154 |
+
|
155 |
+
# 짧은 텍스트에서 뉴스 패턴이 감지되면 무시
|
156 |
+
if len(text) < 50 and any(pattern in text for pattern in hallucination_patterns):
|
157 |
+
return ""
|
158 |
+
|
159 |
+
return text
|
160 |
+
|
161 |
except Exception as e:
|
162 |
print(f"STT ์ค๋ฅ: {e}")
|
163 |
return ""
|
|
|
195 |
state["sample_rate"] = sample_rate
|
196 |
state["audio_buffer"].append(audio_array)
|
197 |
|
198 |
+
# 버퍼가 충분히 쌓였을 때만 처리 (약 2-3초 분량)
|
199 |
buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
|
200 |
+
if buffer_duration >= 2.0: # 2์ด๋ง๋ค ์ฒ๋ฆฌ
|
201 |
loop = asyncio.new_event_loop()
|
202 |
asyncio.set_event_loop(loop)
|
203 |
|
|
|
264 |
|
265 |
# 버퍼가 충분히 쌓였을 때만 처리
|
266 |
buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
|
267 |
+
if buffer_duration >= 2.0: # 2์ด๋ง๋ค ์ฒ๋ฆฌ
|
268 |
loop = asyncio.new_event_loop()
|
269 |
asyncio.set_event_loop(loop)
|
270 |
|
|
|
324 |
with gr.TabItem("โฑ๏ธ ์ค์๊ฐ 1"):
|
325 |
src3 = gr.Dropdown(LANG, value="Korean", label="์
๋ ฅ ์ธ์ด")
|
326 |
tgt3 = gr.Dropdown(LANG, value="English", label="์ถ๋ ฅ ์ธ์ด")
|
327 |
+
|
328 |
+
with gr.Row():
|
329 |
+
with gr.Column():
|
330 |
+
gr.Markdown("๐ค **๋ง์ดํฌ ์
๋ ฅ**")
|
331 |
+
mic3 = gr.Audio(
|
332 |
+
sources=["microphone"],
|
333 |
+
streaming=True,
|
334 |
+
type="numpy", # numpy ํ์ ๋ช
์
|
335 |
+
label="๋ง์ดํฌ"
|
336 |
+
)
|
337 |
+
gr.Markdown("๐ก **์ฌ์ฉ ๋ฐฉ๋ฒ**\n- 2-3์ด ์ ๋ ๋ฌธ์ฅ์ ๋ง์ํด์ฃผ์ธ์\n- ๋๋ฌด ์งง๊ฑฐ๋ ๊ธด ๋ฌธ์ฅ์ ์ธ์์ด ์ด๋ ค์ธ ์ ์์ต๋๋ค")
|
338 |
+
|
339 |
+
with gr.Column():
|
340 |
+
o3 = gr.Textbox(label="์๋ฌธ(์ค์๊ฐ)", lines=8, interactive=False)
|
341 |
+
t3 = gr.Textbox(label="๋ฒ์ญ(์ค์๊ฐ)", lines=8, interactive=False)
|
342 |
+
|
343 |
st3 = gr.State()
|
344 |
|
345 |
# stream 메서드 수정
|
346 |
mic3.stream(
|
347 |
realtime_single_sync,
|
348 |
inputs=[mic3, src3, tgt3, st3],
|
349 |
+
outputs=[o3, t3, st3],
|
350 |
+
time_limit=30, # 30์ด ์ ํ
|
351 |
+
stream_every=0.5 # 0.5์ด๋ง๋ค ์คํธ๋ฆผ
|
352 |
)
|
353 |
|
354 |
# 탭 4 – 실시간 4언어
|
355 |
with gr.TabItem("๐ ์ค์๊ฐ 4"):
|
356 |
src4 = gr.Dropdown(LANG, value="Korean", label="์
๋ ฅ ์ธ์ด")
|
357 |
+
|
358 |
+
with gr.Row():
|
359 |
+
with gr.Column(scale=1):
|
360 |
+
gr.Markdown("๐ค **๋ง์ดํฌ ์
๋ ฅ**")
|
361 |
+
mic4 = gr.Audio(
|
362 |
+
sources=["microphone"],
|
363 |
+
streaming=True,
|
364 |
+
type="numpy",
|
365 |
+
label="๋ง์ดํฌ"
|
366 |
+
)
|
367 |
+
o4 = gr.Textbox(label="์๋ฌธ", lines=8, interactive=False)
|
368 |
+
|
369 |
+
with gr.Column(scale=2):
|
370 |
+
with gr.Row():
|
371 |
+
e4 = gr.Textbox(label="English", lines=8, interactive=False)
|
372 |
+
c4 = gr.Textbox(label="Chinese(็ฎไฝ)", lines=8, interactive=False)
|
373 |
+
with gr.Row():
|
374 |
+
th4 = gr.Textbox(label="Thai", lines=8, interactive=False)
|
375 |
+
r4 = gr.Textbox(label="Russian", lines=8, interactive=False)
|
376 |
+
|
377 |
st4 = gr.State()
|
378 |
|
379 |
# stream ๋ฉ์๋ ์์
|
380 |
mic4.stream(
|
381 |
realtime_four_sync,
|
382 |
inputs=[mic4, src4, st4],
|
383 |
+
outputs=[o4, e4, c4, th4, r4, st4],
|
384 |
+
time_limit=30,
|
385 |
+
stream_every=0.5
|
386 |
)
|
387 |
|
388 |
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|