openfree committed on
Commit
4e89e7e
·
verified ·
1 Parent(s): 92dd616

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +149 -102
app.py CHANGED
@@ -14,7 +14,7 @@ LANG = ["Korean","English","Japanese","Chinese",
14
  VOICE = {l: ("nova" if l in ["Korean","Japanese","Chinese"] else "alloy")
15
  for l in LANG}
16
  FOUR = ["English","Chinese","Thai","Russian"]
17
- WS_URL = "wss://api.openai.com/v1/audio/transcriptions/stream"
18
 
19
  # โ”€โ”€โ”€ 1. ๊ณตํ†ต GPT ๋ฒˆ์—ญ / TTS โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
20
  async def gpt_translate(text, src, tgt):
@@ -41,124 +41,171 @@ def translate_pdf(file, src, tgt):
41
  return "โš ๏ธ ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ", ""
42
  return text, asyncio.run(gpt_translate(text, src, tgt))
43
 
44
- # โ”€โ”€โ”€ 3. WebSocket STT ํ—ฌํผ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
45
- async def ws_stt_generator(audio_queue: asyncio.Queue):
46
- """
47
- ๋ฐฑ๊ทธ๋ผ์šด๋“œ ํƒœ์Šคํฌ:
48
- - audio_queue ๋กœ๋ถ€ํ„ฐ chunk(bytes) ์ˆ˜์‹ 
49
- - WS ๋กœ ์ „์†ก, ์„œ๋ฒ„ event ์ˆ˜์‹  โ†’ yield (partial text, final?)
50
- """
51
- async with websockets.connect(
52
- WS_URL,
53
- extra_headers={"Authorization": f"Bearer {openai.api_key}"},
54
- max_size=None
55
- ) as ws:
56
- async def sender():
57
- while True:
58
- chunk = await audio_queue.get()
59
- if chunk is None: # ์ข…๋ฃŒ ํ”Œ๋ž˜๊ทธ
60
- await ws.send(json.dumps({"terminate": True}))
61
- break
62
- await ws.send(chunk)
63
- asyncio.create_task(sender())
64
- async for msg in ws:
65
- data = json.loads(msg)
66
- yield data["text"], data.get("final", False)
67
-
68
- # โ”€โ”€โ”€ 4. Gradio ์ŠคํŠธ๋ฆผ ํ•ธ๋“ค๋Ÿฌ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
69
- async def realtime_single(mic, src, tgt, state):
70
- """
71
- mic: bytes chunk (Gradio ์ž๋™)
72
- state: {"queue": Queue, "task": Task, "orig": str, "trans": str}
73
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  if state is None:
75
- state = {"queue": asyncio.Queue(), "task": None, "orig":"", "trans":""}
76
-
77
- if mic is None: # ์ŠคํŠธ๋ฆผ ์ข…๋ฃŒ
78
- await state["queue"].put(None)
79
  return state["orig"], state["trans"], state
80
-
81
- # ์ฒซ ํ˜ธ์ถœ์ด๋ฉด WS ํƒœ์Šคํฌ ์‹œ์ž‘
82
- if state["task"] is None:
83
- async def run_ws():
84
- async for text, final in ws_stt_generator(state["queue"]):
85
- state["orig"] += (" " if state["orig"] else "") + text
86
- add = await gpt_translate(text, src, tgt)
87
- state["trans"] += (" " if state["trans"] else "") + add
88
- state["task"] = asyncio.create_task(run_ws())
89
-
90
- # ๋งˆ์ดํฌ chunk enqueue
91
- await state["queue"].put(mic)
 
 
 
 
 
92
  return state["orig"], state["trans"], state
93
 
94
- async def realtime_four(mic, src, state):
 
95
  if state is None:
96
- state = {"queue": asyncio.Queue(), "task": None,
97
- "orig":"", "English":"", "Chinese":"", "Thai":"", "Russian":""}
98
-
99
- if mic is None:
100
- await state["queue"].put(None)
101
- return tuple(state[k] for k in
102
- ["orig","English","Chinese","Thai","Russian"]) + (state,)
103
-
104
- if state["task"] is None:
105
- async def run_ws():
106
- async for text, _ in ws_stt_generator(state["queue"]):
107
- state["orig"] += (" "+text)
108
- for lang in FOUR:
109
- state[lang] += (" "+ await gpt_translate(text, src, lang))
110
- state["task"] = asyncio.create_task(run_ws())
111
-
112
- await state["queue"].put(mic)
113
- return tuple(state[k] for k in
114
- ["orig","English","Chinese","Thai","Russian"]) + (state,)
 
 
 
 
 
 
 
 
 
 
115
 
116
  # โ”€โ”€โ”€ 5. UI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
117
  with gr.Blocks(title="SMARTok Demo") as demo:
118
  with gr.Tabs():
119
  # ํƒญ 1 โ€“ ์˜ค๋””์˜ค ๋ฒˆ์—ญ
120
  with gr.TabItem("๐ŸŽ™๏ธ ์˜ค๋””์˜ค"):
121
- src1=gr.Dropdown(LANG,value="Korean",label="์ž…๋ ฅ")
122
- tgt1=gr.Dropdown(LANG,value="English",label="์ถœ๋ ฅ")
123
- aud1=gr.Audio(sources=["microphone","upload"],type="filepath")
124
- btn1=gr.Button("๋ฒˆ์—ญ")
125
- o1=gr.Textbox(label="์›๋ฌธ"); t1=gr.Textbox(label="๋ฒˆ์—ญ")
126
- a1=gr.Audio(label="TTS",type="filepath",autoplay=True)
127
- btn1.click(lambda a,s,t: translate_pdf.__wrapped__ if False else translate_pdf,
128
- [aud1,src1,tgt1],[o1,t1,a1]) # dummy, ์œ ์ง€์šฉ
 
129
 
130
  # ํƒญ 2 โ€“ PDF ๋ฒˆ์—ญ
131
  with gr.TabItem("๐Ÿ“„ PDF"):
132
- src2=gr.Dropdown(LANG,value="Korean",label="์ž…๋ ฅ")
133
- tgt2=gr.Dropdown(LANG,value="English",label="์ถœ๋ ฅ")
134
- pdf=gr.File(file_types=[".pdf"])
135
- btn2=gr.Button("๋ฒˆ์—ญ")
136
- o2=gr.Textbox(label="์ถ”์ถœ ์›๋ฌธ",lines=15)
137
- t2=gr.Textbox(label="๋ฒˆ์—ญ ๊ฒฐ๊ณผ",lines=15)
138
- btn2.click(translate_pdf:=translate_pdf,[pdf,src2,tgt2],[o2,t2])
 
139
 
140
  # ํƒญ 3 โ€“ ์‹ค์‹œ๊ฐ„ 1์–ธ์–ด
141
  with gr.TabItem("โฑ๏ธ ์‹ค์‹œ๊ฐ„ 1"):
142
- src3=gr.Dropdown(LANG,value="Korean",label="์ž…๋ ฅ")
143
- tgt3=gr.Dropdown(LANG,value="English",label="์ถœ๋ ฅ")
144
- mic3=gr.Audio(sources=["microphone"],streaming=True)
145
- o3=gr.Textbox(label="์›๋ฌธ(์‹ค์‹œ๊ฐ„)",lines=8)
146
- t3=gr.Textbox(label="๋ฒˆ์—ญ(์‹ค์‹œ๊ฐ„)",lines=8)
147
- st3=gr.State()
148
- mic3.stream(realtime_single,inputs=[src3,tgt3,st3],
149
- outputs=[o3,t3,st3])
 
 
 
 
 
150
 
151
  # ํƒญ 4 โ€“ ์‹ค์‹œ๊ฐ„ 4์–ธ์–ด
152
  with gr.TabItem("๐ŸŒ ์‹ค์‹œ๊ฐ„ 4"):
153
- src4=gr.Dropdown(LANG,value="Korean",label="์ž…๋ ฅ")
154
- mic4=gr.Audio(sources=["microphone"],streaming=True)
155
- o4=gr.Textbox(label="์›๋ฌธ",lines=8)
156
- e4=gr.Textbox(label="English",lines=8)
157
- c4=gr.Textbox(label="Chinese(็ฎ€ไฝ“)",lines=8)
158
- th4=gr.Textbox(label="Thai",lines=8)
159
- r4=gr.Textbox(label="Russian",lines=8)
160
- st4=gr.State()
161
- mic4.stream(realtime_four,inputs=[src4,st4],
162
- outputs=[o4,e4,c4,th4,r4,st4])
163
-
164
- demo.launch(server_name="0.0.0.0",server_port=7860,debug=True)
 
 
 
 
 
 
14
  VOICE = {l: ("nova" if l in ["Korean","Japanese","Chinese"] else "alloy")
15
  for l in LANG}
16
  FOUR = ["English","Chinese","Thai","Russian"]
17
+ WS_URL = "wss://api.openai.com/v1/realtime" # ์˜ฌ๋ฐ”๋ฅธ ์—”๋“œํฌ์ธํŠธ๋กœ ์ˆ˜์ •
18
 
19
  # โ”€โ”€โ”€ 1. ๊ณตํ†ต GPT ๋ฒˆ์—ญ / TTS โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
20
  async def gpt_translate(text, src, tgt):
 
41
  return "โš ๏ธ ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ", ""
42
  return text, asyncio.run(gpt_translate(text, src, tgt))
43
 
44
+ # โ”€โ”€โ”€ 2-1. ์˜ค๋””์˜ค ๋ฒˆ์—ญ (ํƒญ1์šฉ) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
45
# The transcription endpoint expects ISO-639-1 codes. Taking the first two
# letters of the English language name is not reliable: "Chinese" would become
# "ch" (should be "zh"), "Spanish" would become "sp" (should be "es"), etc.
_WHISPER_LANG_CODES = {
    "arabic": "ar",
    "chinese": "zh",
    "dutch": "nl",
    "english": "en",
    "french": "fr",
    "german": "de",
    "hindi": "hi",
    "indonesian": "id",
    "italian": "it",
    "japanese": "ja",
    "korean": "ko",
    "malay": "ms",
    "polish": "pl",
    "portuguese": "pt",
    "russian": "ru",
    "spanish": "es",
    "swedish": "sv",
    "thai": "th",
    "turkish": "tr",
    "ukrainian": "uk",
    "vietnamese": "vi",
}


def _whisper_language_kwargs(lang_name):
    """Return the ``language`` keyword argument for a transcription request.

    Args:
        lang_name: Language name as shown in the UI dropdown (e.g. "Korean").

    Returns:
        ``{"language": <iso-639-1 code>}`` for a known name, otherwise an
        empty dict so the request omits ``language`` instead of sending an
        invalid code.
    """
    code = _WHISPER_LANG_CODES.get(str(lang_name).strip().lower())
    return {"language": code} if code else {}


def _write_chunk_as_wav(out_file, audio_data):
    """Write one audio chunk to the binary file object *out_file*.

    Bytes-like input is written unchanged. Anything else is unpacked as a
    ``(sample_rate, samples)`` pair and encoded as PCM WAV.
    """
    if isinstance(audio_data, (bytes, bytearray, memoryview)):
        out_file.write(audio_data)
        return

    import wave  # local import: only needed for the (rate, samples) form

    # NOTE(review): assumes the array form Gradio passes for a streaming
    # microphone with the default ``type`` -- confirm against the component.
    sample_rate, samples = audio_data
    if samples.dtype.kind == "f":
        # Float PCM is conventionally in [-1, 1]; convert to 16-bit integers.
        samples = (samples.clip(-1.0, 1.0) * 32767).astype("<i2")
    channels = samples.shape[1] if samples.ndim > 1 else 1
    with wave.open(out_file, "wb") as wav:
        wav.setnchannels(channels)
        wav.setsampwidth(samples.dtype.itemsize)
        wav.setframerate(int(sample_rate))
        wav.writeframes(samples.tobytes())


async def translate_audio_async(file, src, tgt):
    """Transcribe an audio file, translate the text and synthesize speech.

    Args:
        file: Path of the recorded or uploaded audio file; falsy if missing.
        src: Source language name.
        tgt: Target language name.

    Returns:
        ``(original_text, translated_text, tts_audio_path)``. When no file is
        given, a warning message, an empty string and ``None``.
    """
    if not file:
        return "โš ๏ธ ์˜ค๋””์˜ค ์—…๋กœ๋“œ ํ•„์š”", "", None

    # Speech-to-text through the Whisper transcription API.
    with open(file, "rb") as audio_file:
        transcript = await openai.AsyncClient().audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            **_whisper_language_kwargs(src),
        )

    orig_text = transcript.text
    trans_text = await gpt_translate(orig_text, src, tgt)
    audio_path = await gpt_tts(trans_text, tgt)

    return orig_text, trans_text, audio_path


def translate_audio(file, src, tgt):
    """Blocking wrapper around :func:`translate_audio_async` for Gradio."""
    return asyncio.run(translate_audio_async(file, src, tgt))


# ─── 3. Real-time STT (Whisper API) ──────────────────────────────
async def process_audio_chunk(audio_data, src_lang):
    """Transcribe one streamed audio chunk.

    Args:
        audio_data: ``None``, a bytes-like encoded audio payload, or a
            ``(sample_rate, samples)`` pair.
        src_lang: Source language name.

    Returns:
        The recognized text, or ``""`` when there is no audio or when any
        step fails (the error is printed; streaming is best-effort).
    """
    if audio_data is None:
        return ""

    tmp_path = None
    try:
        # The API needs a named file, so stage the chunk in a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp_path = tmp.name
            _write_chunk_as_wav(tmp, audio_data)

        with open(tmp_path, "rb") as audio_file:
            transcript = await openai.AsyncClient().audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                **_whisper_language_kwargs(src_lang),
            )
        return transcript.text
    except Exception as e:
        print(f"STT ์˜ค๋ฅ˜: {e}")
        return ""
    finally:
        # Always remove the staging file, including on failure.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
90
+
91
+ # ─── 4. Gradio 스트림 핸들러 (동기 버전) ─────────────────────
92
def realtime_single_sync(audio, src, tgt, state):
    """Handle one streamed microphone chunk for single-language translation.

    Args:
        audio: The audio chunk from the streaming component, or ``None``.
        src: Source language name.
        tgt: Target language name.
        state: Session dict with keys ``"orig"`` and ``"trans"``, or ``None``
            on the first call.

    Returns:
        ``(original_text, translated_text, state)``; ``state`` is updated in
        place with any newly recognized and translated text.
    """
    if state is None:
        state = {"orig": "", "trans": ""}

    if audio is None:
        return state["orig"], state["trans"], state

    async def _transcribe_and_translate():
        text = await process_audio_chunk(audio, src)
        if not text:
            return
        state["orig"] = state["orig"] + " " + text if state["orig"] else text

        trans = await gpt_translate(text, src, tgt)
        state["trans"] = state["trans"] + " " + trans if state["trans"] else trans

    # asyncio.run creates and tears down its own loop, so no closed loop is
    # left installed as this thread's current event loop.
    asyncio.run(_transcribe_and_translate())

    return state["orig"], state["trans"], state
117
 
118
def realtime_four_sync(audio, src, state):
    """Handle one streamed microphone chunk, translating into four languages.

    Args:
        audio: The audio chunk from the streaming component, or ``None``.
        src: Source language name.
        state: Session dict with key ``"orig"`` plus one key per language in
            ``FOUR``, or ``None`` on the first call.

    Returns:
        ``(orig, English, Chinese, Thai, Russian, state)``; ``state`` is
        updated in place.
    """
    if state is None:
        state = {"orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": ""}

    def _snapshot():
        return (state["orig"], state["English"], state["Chinese"],
                state["Thai"], state["Russian"], state)

    if audio is None:
        return _snapshot()

    async def _transcribe_and_translate():
        text = await process_audio_chunk(audio, src)
        if not text:
            return
        state["orig"] = state["orig"] + " " + text if state["orig"] else text

        # Run the four translations concurrently inside the running loop.
        translations = await asyncio.gather(
            *(gpt_translate(text, src, lang) for lang in FOUR)
        )
        for lang, trans in zip(FOUR, translations):
            state[lang] = state[lang] + " " + trans if state[lang] else trans

    # asyncio.run creates and tears down its own loop, so no closed loop is
    # left installed as this thread's current event loop.
    asyncio.run(_transcribe_and_translate())

    return _snapshot()
150
 
151
  # โ”€โ”€โ”€ 5. UI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
152
with gr.Blocks(title="SMARTok Demo") as demo:
    with gr.Tabs():
        # Tab 1 - audio file translation
        with gr.TabItem("๐ŸŽ™๏ธ ์˜ค๋””์˜ค"):
            src1 = gr.Dropdown(choices=LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            tgt1 = gr.Dropdown(choices=LANG, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            btn1 = gr.Button("๋ฒˆ์—ญ")
            o1 = gr.Textbox(label="์›๋ฌธ")
            t1 = gr.Textbox(label="๋ฒˆ์—ญ")
            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)

            btn1.click(
                fn=translate_audio,
                inputs=[aud1, src1, tgt1],
                outputs=[o1, t1, a1],
            )

        # Tab 2 - PDF translation
        with gr.TabItem("๐Ÿ“„ PDF"):
            src2 = gr.Dropdown(choices=LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            tgt2 = gr.Dropdown(choices=LANG, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            pdf = gr.File(file_types=[".pdf"])
            btn2 = gr.Button("๋ฒˆ์—ญ")
            o2 = gr.Textbox(label="์ถ”์ถœ ์›๋ฌธ", lines=15)
            t2 = gr.Textbox(label="๋ฒˆ์—ญ ๊ฒฐ๊ณผ", lines=15)

            btn2.click(
                fn=translate_pdf,
                inputs=[pdf, src2, tgt2],
                outputs=[o2, t2],
            )

        # Tab 3 - real-time, single target language
        with gr.TabItem("โฑ๏ธ ์‹ค์‹œ๊ฐ„ 1"):
            src3 = gr.Dropdown(choices=LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            tgt3 = gr.Dropdown(choices=LANG, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            mic3 = gr.Audio(sources=["microphone"], streaming=True)
            o3 = gr.Textbox(label="์›๋ฌธ(์‹ค์‹œ๊ฐ„)", lines=8)
            t3 = gr.Textbox(label="๋ฒˆ์—ญ(์‹ค์‹œ๊ฐ„)", lines=8)
            st3 = gr.State()

            # The microphone component itself is the first handler input.
            mic3.stream(
                fn=realtime_single_sync,
                inputs=[mic3, src3, tgt3, st3],
                outputs=[o3, t3, st3],
            )

        # Tab 4 - real-time, four target languages
        with gr.TabItem("๐ŸŒ ์‹ค์‹œ๊ฐ„ 4"):
            src4 = gr.Dropdown(choices=LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            mic4 = gr.Audio(sources=["microphone"], streaming=True)
            o4 = gr.Textbox(label="์›๋ฌธ", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(็ฎ€ไฝ“)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()

            # The microphone component itself is the first handler input.
            mic4.stream(
                fn=realtime_four_sync,
                inputs=[mic4, src4, st4],
                outputs=[o4, e4, c4, th4, r4, st4],
            )

demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)