Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -14,7 +14,7 @@ LANG = ["Korean","English","Japanese","Chinese",
|
|
14 |
VOICE = {l: ("nova" if l in ["Korean","Japanese","Chinese"] else "alloy")
|
15 |
for l in LANG}
|
16 |
FOUR = ["English","Chinese","Thai","Russian"]
|
17 |
-
WS_URL = "wss://api.openai.com/v1/
|
18 |
|
19 |
# โโโ 1. ๊ณตํต GPT ๋ฒ์ญ / TTS โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
20 |
async def gpt_translate(text, src, tgt):
|
@@ -41,124 +41,171 @@ def translate_pdf(file, src, tgt):
|
|
41 |
return "โ ๏ธ ํ
์คํธ ์ถ์ถ ์คํจ", ""
|
42 |
return text, asyncio.run(gpt_translate(text, src, tgt))
|
43 |
|
44 |
-
# โโโ
|
45 |
-
async def
|
46 |
-
"""
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
if state is None:
|
75 |
-
state = {"
|
76 |
-
|
77 |
-
if
|
78 |
-
await state["queue"].put(None)
|
79 |
return state["orig"], state["trans"], state
|
80 |
-
|
81 |
-
#
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
92 |
return state["orig"], state["trans"], state
|
93 |
|
94 |
-
|
|
|
95 |
if state is None:
|
96 |
-
state = {"
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
# โโโ 5. UI โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
117 |
with gr.Blocks(title="SMARTok Demo") as demo:
|
118 |
with gr.Tabs():
|
119 |
# ํญ 1 โ ์ค๋์ค ๋ฒ์ญ
|
120 |
with gr.TabItem("๐๏ธ ์ค๋์ค"):
|
121 |
-
src1=gr.Dropdown(LANG,value="Korean",label="์
๋ ฅ")
|
122 |
-
tgt1=gr.Dropdown(LANG,value="English",label="์ถ๋ ฅ")
|
123 |
-
aud1=gr.Audio(sources=["microphone","upload"],type="filepath")
|
124 |
-
btn1=gr.Button("๋ฒ์ญ")
|
125 |
-
o1=gr.Textbox(label="์๋ฌธ")
|
126 |
-
|
127 |
-
|
128 |
-
|
|
|
129 |
|
130 |
# ํญ 2 โ PDF ๋ฒ์ญ
|
131 |
with gr.TabItem("๐ PDF"):
|
132 |
-
src2=gr.Dropdown(LANG,value="Korean",label="์
๋ ฅ")
|
133 |
-
tgt2=gr.Dropdown(LANG,value="English",label="์ถ๋ ฅ")
|
134 |
-
pdf=gr.File(file_types=[".pdf"])
|
135 |
-
btn2=gr.Button("๋ฒ์ญ")
|
136 |
-
o2=gr.Textbox(label="์ถ์ถ ์๋ฌธ",lines=15)
|
137 |
-
t2=gr.Textbox(label="๋ฒ์ญ ๊ฒฐ๊ณผ",lines=15)
|
138 |
-
|
|
|
139 |
|
140 |
# ํญ 3 โ ์ค์๊ฐ 1์ธ์ด
|
141 |
with gr.TabItem("โฑ๏ธ ์ค์๊ฐ 1"):
|
142 |
-
src3=gr.Dropdown(LANG,value="Korean",label="์
๋ ฅ")
|
143 |
-
tgt3=gr.Dropdown(LANG,value="English",label="์ถ๋ ฅ")
|
144 |
-
mic3=gr.Audio(sources=["microphone"],streaming=True)
|
145 |
-
o3=gr.Textbox(label="์๋ฌธ(์ค์๊ฐ)",lines=8)
|
146 |
-
t3=gr.Textbox(label="๋ฒ์ญ(์ค์๊ฐ)",lines=8)
|
147 |
-
st3=gr.State()
|
148 |
-
|
149 |
-
|
|
|
|
|
|
|
|
|
|
|
150 |
|
151 |
# ํญ 4 โ ์ค์๊ฐ 4์ธ์ด
|
152 |
with gr.TabItem("๐ ์ค์๊ฐ 4"):
|
153 |
-
src4=gr.Dropdown(LANG,value="Korean",label="์
๋ ฅ")
|
154 |
-
mic4=gr.Audio(sources=["microphone"],streaming=True)
|
155 |
-
o4=gr.Textbox(label="์๋ฌธ",lines=8)
|
156 |
-
e4=gr.Textbox(label="English",lines=8)
|
157 |
-
c4=gr.Textbox(label="Chinese(็ฎไฝ)",lines=8)
|
158 |
-
th4=gr.Textbox(label="Thai",lines=8)
|
159 |
-
r4=gr.Textbox(label="Russian",lines=8)
|
160 |
-
st4=gr.State()
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
14 |
# Voice name per language: "nova" for Korean/Japanese/Chinese, "alloy" for
# every other entry of LANG.
# NOTE(review): presumably consumed by the TTS helper — confirm against caller.
VOICE = {
    name: "nova" if name in ["Korean", "Japanese", "Chinese"] else "alloy"
    for name in LANG
}
# Target languages used by the four-language real-time handler.
FOUR = ["English", "Chinese", "Thai", "Russian"]
WS_URL = "wss://api.openai.com/v1/realtime"  # corrected to the proper endpoint
|
18 |
|
19 |
# โโโ 1. ๊ณตํต GPT ๋ฒ์ญ / TTS โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
20 |
async def gpt_translate(text, src, tgt):
|
|
|
41 |
return "โ ๏ธ ํ
์คํธ ์ถ์ถ ์คํจ", ""
|
42 |
return text, asyncio.run(gpt_translate(text, src, tgt))
|
43 |
|
44 |
+
# โโโ 2-1. ์ค๋์ค ๋ฒ์ญ (ํญ1์ฉ) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
45 |
+
# ISO 639-1 codes expected by the Whisper `language` parameter, keyed by the
# display names used in the language dropdowns.  Slicing the first two letters
# of the name is wrong for several of them (e.g. "Chinese" -> "ch", not "zh").
_WHISPER_LANG_CODES = {
    "Korean": "ko",
    "English": "en",
    "Japanese": "ja",
    "Chinese": "zh",
    "Thai": "th",
    "Russian": "ru",
    "Vietnamese": "vi",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Arabic": "ar",
    "Hindi": "hi",
    "Indonesian": "id",
    "Turkish": "tr",
    "Dutch": "nl",
    "Polish": "pl",
}


def _whisper_language(name):
    """Return the ISO 639-1 code for a language display name.

    Names missing from the table fall back to the first two letters,
    lower-cased, which is what the callers previously did for every language.
    """
    return _WHISPER_LANG_CODES.get(name, name[:2].lower())


def _write_chunk_to_wav(audio_data):
    """Persist one audio chunk to a temporary ``.wav`` file and return its path.

    ``audio_data`` is either raw bytes (written unchanged) or a
    ``(sample_rate, samples)`` pair whose ``samples`` is a NumPy-style array of
    shape ``(n,)`` or ``(n, channels)``; the pair is encoded as PCM WAV.
    The caller owns the returned file and must delete it.
    """
    import wave  # local import: the rest of the file does not need it

    fd, path = tempfile.mkstemp(suffix=".wav")
    try:
        with os.fdopen(fd, "wb") as tmp:
            if isinstance(audio_data, (bytes, bytearray, memoryview)):
                tmp.write(audio_data)
            else:
                sample_rate, samples = audio_data
                if samples.dtype.kind == "f":
                    # Float samples are taken to be in [-1, 1]; convert to 16-bit PCM.
                    samples = (samples.clip(-1.0, 1.0) * 32767).astype("<i2")
                channels = 1 if samples.ndim == 1 else samples.shape[1]
                with wave.open(tmp, "wb") as wav:
                    wav.setnchannels(channels)
                    wav.setsampwidth(samples.dtype.itemsize)
                    wav.setframerate(int(sample_rate))
                    wav.writeframes(samples.tobytes())
    except Exception:
        # Do not leave a half-written file behind when encoding fails.
        os.unlink(path)
        raise
    return path


async def translate_audio_async(file, src, tgt):
    """Transcribe an audio file, translate the transcript and synthesize speech.

    Args:
        file: Path of the audio file to transcribe; falsy when none was given.
        src: Display name of the spoken language (e.g. "Korean").
        tgt: Display name of the language to translate into.

    Returns:
        A ``(original_text, translated_text, tts_result)`` tuple, where
        ``tts_result`` is whatever ``gpt_tts`` returns.  When ``file`` is
        falsy, a warning message, an empty string and ``None`` are returned.
    """
    if not file:
        return "⚠️ 오디오 업로드 필요", "", None

    # STT through the Whisper API.
    with open(file, "rb") as audio_file:
        transcript = await openai.AsyncClient().audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=_whisper_language(src),
        )

    orig_text = transcript.text
    trans_text = await gpt_translate(orig_text, src, tgt)
    audio_path = await gpt_tts(trans_text, tgt)

    return orig_text, trans_text, audio_path


def translate_audio(file, src, tgt):
    """Blocking wrapper around :func:`translate_audio_async`."""
    return asyncio.run(translate_audio_async(file, src, tgt))


# ─── 3. Real-time STT (Whisper API) ──────────────────────────
async def process_audio_chunk(audio_data, src_lang):
    """Convert one audio chunk to text.

    Args:
        audio_data: ``None``, raw audio bytes, or a ``(sample_rate, samples)``
            pair as produced by a streaming audio input.
        src_lang: Display name of the spoken language.

    Returns:
        The transcribed text, or ``""`` when there is no audio or the
        transcription fails (the error is printed, not raised).
    """
    if audio_data is None:
        return ""

    tmp_path = None
    try:
        # The API needs a file, so stage the chunk in a temporary WAV.
        tmp_path = _write_chunk_to_wav(audio_data)

        with open(tmp_path, "rb") as audio_file:
            transcript = await openai.AsyncClient().audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                language=_whisper_language(src_lang),
            )
        return transcript.text
    except Exception as e:
        # Best effort: a failed chunk must not break the stream.
        print(f"STT 오류: {e}")
        return ""
    finally:
        # Remove the temporary file on success and on failure alike.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
90 |
+
|
91 |
+
# ─── 4. Gradio 스트림 핸들러 (동기 버전) ─────────────────────
|
92 |
+
def realtime_single_sync(audio, src, tgt, state):
    """Synchronous stream handler: transcribe one chunk and translate it.

    Args:
        audio: The audio chunk to process, or ``None`` when there is none.
        src: Display name of the spoken language.
        tgt: Display name of the language to translate into.
        state: Dict with the accumulated ``"orig"`` and ``"trans"`` text, or
            ``None`` on the first call.

    Returns:
        ``(accumulated original text, accumulated translation, state)``.
    """
    if state is None:
        state = {"orig": "", "trans": ""}

    if audio is None:
        return state["orig"], state["trans"], state

    async def _transcribe_and_translate():
        text = await process_audio_chunk(audio, src)
        if not text:
            return "", ""
        return text, await gpt_translate(text, src, tgt)

    # asyncio.run creates, finalizes and unregisters the loop itself, so no
    # closed loop is left installed as this thread's current event loop.
    text, trans = asyncio.run(_transcribe_and_translate())

    # The state is touched only after both calls succeeded, so the original
    # and translated transcripts cannot get out of step.
    if text:
        state["orig"] = state["orig"] + " " + text if state["orig"] else text
        state["trans"] = state["trans"] + " " + trans if state["trans"] else trans

    return state["orig"], state["trans"], state
|
117 |
|
118 |
+
def realtime_four_sync(audio, src, state):
    """Synchronous stream handler: transcribe one chunk, translate it four ways.

    Args:
        audio: The audio chunk to process, or ``None`` when there is none.
        src: Display name of the spoken language.
        state: Dict with the accumulated ``"orig"`` text and one entry per
            target language, or ``None`` on the first call.

    Returns:
        ``(original, English, Chinese, Thai, Russian, state)`` with the
        accumulated text for each column.
    """
    if state is None:
        state = {"orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": ""}

    def _snapshot():
        return (state["orig"], state["English"], state["Chinese"],
                state["Thai"], state["Russian"], state)

    if audio is None:
        return _snapshot()

    async def _transcribe_and_translate():
        text = await process_audio_chunk(audio, src)
        if not text:
            return "", []
        # The four translations are independent, so run them concurrently.
        translations = await asyncio.gather(
            *(gpt_translate(text, src, lang) for lang in FOUR)
        )
        return text, translations

    # asyncio.run creates, finalizes and unregisters the loop itself, so no
    # closed loop is left installed as this thread's current event loop.
    text, translations = asyncio.run(_transcribe_and_translate())

    # The state is touched only after every call succeeded, so the columns
    # cannot get out of step with each other.
    if text:
        state["orig"] = state["orig"] + " " + text if state["orig"] else text
        for lang, trans in zip(FOUR, translations):
            state[lang] = state[lang] + " " + trans if state[lang] else trans

    return _snapshot()
|
150 |
|
151 |
# ─── 5. UI ──────────────────────────────────────────────────
|
152 |
# Builds the four-tab demo UI and starts the server.
# NOTE(review): the Korean labels below were reconstructed from a lossy
# mis-decoding of the original UTF-8 text — confirm them against the real file.
with gr.Blocks(title="SMARTok Demo") as demo:
    with gr.Tabs():
        # Tab 1 — audio translation
        with gr.TabItem("🎙️ 오디오"):
            src1 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
            tgt1 = gr.Dropdown(LANG, value="English", label="출력 언어")
            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            btn1 = gr.Button("번역")
            o1 = gr.Textbox(label="원문")
            t1 = gr.Textbox(label="번역")
            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)

            btn1.click(translate_audio, [aud1, src1, tgt1], [o1, t1, a1])

        # Tab 2 — PDF translation
        # NOTE(review): the tab emoji was unrecoverable; "📄" is a guess.
        with gr.TabItem("📄 PDF"):
            src2 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
            tgt2 = gr.Dropdown(LANG, value="English", label="출력 언어")
            pdf = gr.File(file_types=[".pdf"])
            btn2 = gr.Button("번역")
            o2 = gr.Textbox(label="추출 원문", lines=15)
            t2 = gr.Textbox(label="번역 결과", lines=15)

            btn2.click(translate_pdf, [pdf, src2, tgt2], [o2, t2])

        # Tab 3 — real-time, one target language
        with gr.TabItem("⏱️ 실시간 1"):
            src3 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
            tgt3 = gr.Dropdown(LANG, value="English", label="출력 언어")
            mic3 = gr.Audio(sources=["microphone"], streaming=True)
            o3 = gr.Textbox(label="원문(실시간)", lines=8)
            t3 = gr.Textbox(label="번역(실시간)", lines=8)
            st3 = gr.State()

            # Each streamed chunk goes through the synchronous handler; the
            # State component carries the accumulated text between calls.
            mic3.stream(
                realtime_single_sync,
                inputs=[mic3, src3, tgt3, st3],
                outputs=[o3, t3, st3],
            )

        # Tab 4 — real-time, four target languages
        # NOTE(review): the tab emoji was unrecoverable; "🌏" is a guess.
        with gr.TabItem("🌏 실시간 4"):
            src4 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
            mic4 = gr.Audio(sources=["microphone"], streaming=True)
            o4 = gr.Textbox(label="원문", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(简体)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()

            mic4.stream(
                realtime_four_sync,
                inputs=[mic4, src4, st4],
                outputs=[o4, e4, c4, th4, r4, st4],
            )

demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|