openfree committed on
Commit
e1fe24a
·
verified ·
1 Parent(s): 2b6f990

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -13
app.py CHANGED
@@ -1,9 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os, asyncio, json, tempfile, websockets, pdfplumber
2
  import gradio as gr
3
  import openai
4
  from dotenv import load_dotenv
5
  import numpy as np
6
  import wave
 
 
7
 
8
  # โ”€โ”€โ”€ 0. ์ดˆ๊ธฐํ™” โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
9
  load_dotenv()
@@ -11,6 +25,19 @@ openai.api_key = os.getenv("OPENAI_API_KEY")
11
  if not openai.api_key:
12
  raise RuntimeError("OPENAI_API_KEY ๊ฐ€ .env ์— ์—†์Šต๋‹ˆ๋‹ค!")
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  LANG = ["Korean","English","Japanese","Chinese",
15
  "Thai","Russian","Vietnamese","Spanish","French"]
16
  VOICE = {l: ("nova" if l in ["Korean","Japanese","Chinese"] else "alloy")
@@ -63,27 +90,121 @@ def translate_pdf(file, src, tgt):
63
  return text, asyncio.run(gpt_translate(text, src, tgt))
64
 
65
  # โ”€โ”€โ”€ 2-1. ์˜ค๋””์˜ค ๋ฒˆ์—ญ (ํƒญ1์šฉ) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  async def translate_audio_async(file, src, tgt):
67
- if not file: return "โš ๏ธ ์˜ค๋””์˜ค ์—…๋กœ๋“œ ํ•„์š”", "", None
68
 
69
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # STT: Whisper API ์‚ฌ์šฉ
 
71
  client = get_client()
72
- with open(file, 'rb') as audio_file:
73
  transcript = await client.audio.transcriptions.create(
74
  model="whisper-1",
75
  file=audio_file,
76
  language=src[:2].lower() # ์–ธ์–ด ์ฝ”๋“œ ๊ฐ„์†Œํ™”
77
  )
78
 
 
 
 
 
79
  orig_text = transcript.text
 
 
 
 
 
 
 
80
  trans_text = await gpt_translate(orig_text, src, tgt)
 
 
 
81
  audio_path = await gpt_tts(trans_text, tgt)
82
 
83
  return orig_text, trans_text, audio_path
84
  except Exception as e:
85
  print(f"์˜ค๋””์˜ค ๋ฒˆ์—ญ ์˜ค๋ฅ˜: {e}")
86
- return "โš ๏ธ ๋ฒˆ์—ญ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ", str(e), None
 
 
 
 
 
 
 
 
87
 
88
  def translate_audio(file, src, tgt):
89
  return asyncio.run(translate_audio_async(file, src, tgt))
@@ -306,19 +427,83 @@ def realtime_four_sync(audio, src, state):
306
  state["Thai"], state["Russian"], state)
307
 
308
  # โ”€โ”€โ”€ 5. UI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
309
- with gr.Blocks(title="SMARTok Demo") as demo:
 
 
 
 
 
 
 
 
310
  with gr.Tabs():
311
  # ํƒญ 1 โ€“ ์˜ค๋””์˜ค ๋ฒˆ์—ญ
312
- with gr.TabItem("๐ŸŽ™๏ธ ์˜ค๋””์˜ค"):
313
- src1 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
314
- tgt1 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
315
- aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
316
- btn1 = gr.Button("๋ฒˆ์—ญ")
317
- o1 = gr.Textbox(label="์›๋ฌธ")
318
- t1 = gr.Textbox(label="๋ฒˆ์—ญ")
319
- a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
- btn1.click(translate_audio, [aud1, src1, tgt1], [o1, t1, a1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
  # ํƒญ 2 โ€“ PDF ๋ฒˆ์—ญ
324
  with gr.TabItem("๐Ÿ“„ PDF"):
 
1
+ # SMARTok Demo - real-time multilingual translation system
2
+ #
3
+ # Required packages:
4
+ # pip install gradio openai python-dotenv pdfplumber numpy websockets
5
+ #
6
+ # Optional packages (video processing):
7
+ # - Install ffmpeg: sudo apt-get install ffmpeg (Linux) / brew install ffmpeg (Mac)
8
+ # - or pip install moviepy
9
+ #
10
+ # Environment variables:
11
+ # OPENAI_API_KEY must be set in the .env file
12
+
13
  import os, asyncio, json, tempfile, websockets, pdfplumber
14
  import gradio as gr
15
  import openai
16
  from dotenv import load_dotenv
17
  import numpy as np
18
  import wave
19
+ import subprocess
20
+ import mimetypes
21
 
22
  # โ”€โ”€โ”€ 0. ์ดˆ๊ธฐํ™” โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
23
  load_dotenv()
 
25
  if not openai.api_key:
26
  raise RuntimeError("OPENAI_API_KEY ๊ฐ€ .env ์— ์—†์Šต๋‹ˆ๋‹ค!")
27
 
28
+ # ffmpeg ์„ค์น˜ ํ™•์ธ
29
def check_ffmpeg():
    """Report whether an ``ffmpeg`` executable can be run.

    Returns:
        True when ``ffmpeg -version`` runs and exits with status 0, False when
        the executable cannot be started or exits with a non-zero status.
    """
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
    except (OSError, subprocess.SubprocessError):
        # OSError: executable missing or not runnable.
        # SubprocessError: non-zero exit status (check=True).
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return False
    return True
35
+
36
# Probe for ffmpeg once at import time and warn, with install hints, when it
# is missing.
HAS_FFMPEG = check_ffmpeg()
if not HAS_FFMPEG:
    print(
        "โš ๏ธ ffmpeg๊ฐ€ ์„ค์น˜๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค. ๋น„๋””์˜ค ์ฒ˜๋ฆฌ๊ฐ€ ์ œํ•œ๋  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.",
        "์„ค์น˜ ๋ฐฉ๋ฒ•: sudo apt-get install ffmpeg (Linux) / brew install ffmpeg (Mac)",
        sep="\n",
    )
40
+
41
  LANG = ["Korean","English","Japanese","Chinese",
42
  "Thai","Russian","Vietnamese","Spanish","French"]
43
  VOICE = {l: ("nova" if l in ["Korean","Japanese","Chinese"] else "alloy")
 
90
  return text, asyncio.run(gpt_translate(text, src, tgt))
91
 
92
  # โ”€โ”€โ”€ 2-1. ์˜ค๋””์˜ค ๋ฒˆ์—ญ (ํƒญ1์šฉ) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
93
def extract_audio_from_video(video_path):
    """Extract the audio track of a video file (MP4 etc.) into a temporary WAV.

    ffmpeg is tried first when ``HAS_FFMPEG`` is set (16 kHz, mono, 16-bit
    PCM). If ffmpeg is unavailable or exits with a non-zero status, moviepy
    is used as a fallback.

    Args:
        video_path: Path of the input video file.

    Returns:
        Path of the generated ``.wav`` file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        Exception: If neither ffmpeg nor moviepy could produce the audio. The
            temporary file is removed before the error propagates.
    """
    # Only the unique file name is needed; close the handle right away so the
    # external tool can overwrite the file.
    placeholder = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    placeholder.close()
    wav_path = placeholder.name

    try:
        # Method 1: ffmpeg.
        if HAS_FFMPEG:
            cmd = [
                'ffmpeg',
                '-i', video_path,
                '-vn',                   # drop the video stream
                '-acodec', 'pcm_s16le',  # 16-bit PCM (WAV)
                '-ar', '16000',          # 16 kHz sample rate
                '-ac', '1',              # mono
                '-y',                    # overwrite the placeholder file
                wav_path,
            ]
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                return wav_path
            # Fall through to moviepy after logging why ffmpeg failed.
            print(f"ffmpeg ์˜ค๋ฅ˜: {result.stderr}")

        # Method 2: moviepy (optional dependency, imported lazily).
        try:
            from moviepy.editor import VideoFileClip
        except ImportError as exc:
            raise Exception(
                "๋น„๋””์˜ค ์ฒ˜๋ฆฌ๋ฅผ ์œ„ํ•ด ffmpeg ๋˜๋Š” moviepy๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค.\n"
                "์„ค์น˜: pip install moviepy ๋˜๋Š” ffmpeg ์„ค์น˜"
            ) from exc

        print("moviepy๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์˜ค๋””์˜ค ์ถ”์ถœ ์ค‘...")
        try:
            video = VideoFileClip(video_path)
            try:
                video.audio.write_audiofile(
                    wav_path,
                    fps=16000,
                    nbytes=2,
                    codec='pcm_s16le',
                    verbose=False,
                    logger=None
                )
            finally:
                # Release the clip even when writing the audio fails.
                video.close()
        except Exception as exc:
            raise Exception(f"moviepy ์˜ค๋ฅ˜: {str(exc)}") from exc
        return wav_path
    except Exception:
        # Do not leave the placeholder file behind on failure.
        if os.path.exists(wav_path):
            os.unlink(wav_path)
        raise
149
+
150
  async def translate_audio_async(file, src, tgt):
151
+ if not file: return "โš ๏ธ ์˜ค๋””์˜ค/๋น„๋””์˜ค ์—…๋กœ๋“œ ํ•„์š”", "", None
152
 
153
  try:
154
+ # ํŒŒ์ผ ํƒ€์ž… ํ™•์ธ
155
+ mime_type, _ = mimetypes.guess_type(file)
156
+ audio_file_path = file
157
+ temp_audio_path = None
158
+
159
+ # ๋น„๋””์˜ค ํŒŒ์ผ์ธ ๊ฒฝ์šฐ ์˜ค๋””์˜ค ์ถ”์ถœ
160
+ if mime_type and mime_type.startswith('video/'):
161
+ print(f"๋น„๋””์˜ค ํŒŒ์ผ ๊ฐ์ง€: {mime_type}")
162
+ print(f"ํŒŒ์ผ ํฌ๊ธฐ: {os.path.getsize(file) / 1024 / 1024:.1f} MB")
163
+ print("๋น„๋””์˜ค์—์„œ ์˜ค๋””์˜ค ์ถ”์ถœ ์ค‘... (์‹œ๊ฐ„์ด ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค)")
164
+ temp_audio_path = extract_audio_from_video(file)
165
+ audio_file_path = temp_audio_path
166
+ print("์˜ค๋””์˜ค ์ถ”์ถœ ์™„๋ฃŒ!")
167
+
168
  # STT: Whisper API ์‚ฌ์šฉ
169
+ print("์Œ์„ฑ ์ธ์‹ ์ค‘...")
170
  client = get_client()
171
+ with open(audio_file_path, 'rb') as audio_file:
172
  transcript = await client.audio.transcriptions.create(
173
  model="whisper-1",
174
  file=audio_file,
175
  language=src[:2].lower() # ์–ธ์–ด ์ฝ”๋“œ ๊ฐ„์†Œํ™”
176
  )
177
 
178
+ # ์ž„์‹œ ํŒŒ์ผ ์ •๋ฆฌ
179
+ if temp_audio_path and os.path.exists(temp_audio_path):
180
+ os.unlink(temp_audio_path)
181
+
182
  orig_text = transcript.text
183
+ if not orig_text.strip():
184
+ return "โš ๏ธ ์Œ์„ฑ์ด ๊ฐ์ง€๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค", "", None
185
+
186
+ print(f"์ธ์‹๋œ ํ…์ŠคํŠธ: {orig_text[:50]}...")
187
+
188
+ # ๋ฒˆ์—ญ
189
+ print(f"{src} โ†’ {tgt} ๋ฒˆ์—ญ ์ค‘...")
190
  trans_text = await gpt_translate(orig_text, src, tgt)
191
+
192
+ # TTS
193
+ print("์Œ์„ฑ ํ•ฉ์„ฑ ์ค‘...")
194
  audio_path = await gpt_tts(trans_text, tgt)
195
 
196
  return orig_text, trans_text, audio_path
197
  except Exception as e:
198
  print(f"์˜ค๋””์˜ค ๋ฒˆ์—ญ ์˜ค๋ฅ˜: {e}")
199
+ # ์ž„์‹œ ํŒŒ์ผ ์ •๋ฆฌ
200
+ if 'temp_audio_path' in locals() and temp_audio_path and os.path.exists(temp_audio_path):
201
+ os.unlink(temp_audio_path)
202
+
203
+ error_msg = str(e)
204
+ if "ffmpeg" in error_msg.lower():
205
+ error_msg += "\n\n๐Ÿ’ก ํ•ด๊ฒฐ ๋ฐฉ๋ฒ•:\n1. ffmpeg ์„ค์น˜: sudo apt-get install ffmpeg\n2. ๋˜๋Š” pip install moviepy"
206
+
207
+ return "โš ๏ธ ๋ฒˆ์—ญ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ", error_msg, None
208
 
209
  def translate_audio(file, src, tgt):
210
  return asyncio.run(translate_audio_async(file, src, tgt))
 
427
  state["Thai"], state["Russian"], state)
428
 
429
  # โ”€โ”€โ”€ 5. UI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
430
+ with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as demo:
431
+ gr.Markdown(
432
+ """
433
+ # ๐ŸŒ SMARTok ์‹ค์‹œ๊ฐ„ ๋ฒˆ์—ญ ์‹œ์Šคํ…œ
434
+
435
+ ๋‹ค๊ตญ์–ด ์‹ค์‹œ๊ฐ„ ๋ฒˆ์—ญ์„ ์ง€์›ํ•˜๋Š” ํ†ตํ•ฉ ๋ฒˆ์—ญ ํ”Œ๋žซํผ
436
+ """
437
+ )
438
+
439
  with gr.Tabs():
440
  # ํƒญ 1 โ€“ ์˜ค๋””์˜ค ๋ฒˆ์—ญ
441
+ with gr.TabItem("๐ŸŽ™๏ธ ์˜ค๋””์˜ค/๋น„๋””์˜ค"):
442
+ gr.Markdown("### ๐ŸŒ ์˜ค๋””์˜ค/๋น„๋””์˜ค ํŒŒ์ผ ๋ฒˆ์—ญ")
443
+
444
+ with gr.Row():
445
+ src1 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
446
+ tgt1 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
447
+
448
+ with gr.Tabs():
449
+ with gr.TabItem("๐Ÿ“ ํŒŒ์ผ ์—…๋กœ๋“œ"):
450
+ # ํŒŒ์ผ ์—…๋กœ๋“œ - ์˜ค๋””์˜ค์™€ ๋น„๋””์˜ค ๋ชจ๋‘ ์ง€์›
451
+ aud1_file = gr.File(
452
+ label="์˜ค๋””์˜ค/๋น„๋””์˜ค ํŒŒ์ผ ์—…๋กœ๋“œ",
453
+ file_types=[".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus",
454
+ ".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv"],
455
+ type="filepath"
456
+ )
457
+ gr.Markdown(
458
+ "๐Ÿ“Œ **์ง€์› ํ˜•์‹**\n"
459
+ "- ์˜ค๋””์˜ค: MP3, WAV, M4A, FLAC, OGG, OPUS\n"
460
+ "- ๋น„๋””์˜ค: MP4, AVI, MOV, MKV, WebM, FLV\n\n"
461
+ "โš ๏ธ **์ฃผ์˜์‚ฌํ•ญ**\n"
462
+ "- ๋น„๋””์˜ค ํŒŒ์ผ์€ ์˜ค๋””์˜ค ์ถ”์ถœ ์‹œ๊ฐ„์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค\n"
463
+ "- ๋Œ€์šฉ๋Ÿ‰ ํŒŒ์ผ์€ ์ฒ˜๋ฆฌ ์‹œ๊ฐ„์ด ์˜ค๋ž˜ ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค"
464
+ )
465
+
466
+ with gr.TabItem("๐ŸŽค ๋งˆ์ดํฌ ๋…น์Œ"):
467
+ aud1_mic = gr.Audio(
468
+ sources=["microphone"],
469
+ type="filepath",
470
+ label="๋งˆ์ดํฌ ๋…น์Œ"
471
+ )
472
+ gr.Markdown("๐Ÿ’ก **ํŒ**: ๋…น์Œ ํ›„ '์ •์ง€' ๋ฒ„ํŠผ์„ ๋ˆŒ๋Ÿฌ์ฃผ์„ธ์š”")
473
 
474
+ btn1 = gr.Button("๐Ÿ”„ ๋ฒˆ์—ญ ์‹œ์ž‘", variant="primary", size="lg")
475
+
476
+ # ์ง„ํ–‰ ์ƒํƒœ ํ‘œ์‹œ
477
+ status1 = gr.Textbox(label="์ง„ํ–‰ ์ƒํƒœ", value="๋Œ€๊ธฐ ์ค‘...", interactive=False)
478
+
479
+ with gr.Row():
480
+ with gr.Column():
481
+ o1 = gr.Textbox(label="๐Ÿ“ ์›๋ฌธ", lines=6)
482
+ with gr.Column():
483
+ t1 = gr.Textbox(label="๐Ÿ“ ๋ฒˆ์—ญ", lines=6)
484
+
485
+ a1 = gr.Audio(label="๐Ÿ”Š ๋ฒˆ์—ญ๋œ ์Œ์„ฑ (TTS)", type="filepath", autoplay=True)
486
+
487
+ # ํŒŒ์ผ๏ฟฝ๏ฟฝ๏ฟฝ๋‚˜ ๋งˆ์ดํฌ ์ค‘ ํ™œ์„ฑํ™”๋œ ์ž…๋ ฅ ์‚ฌ์šฉ
488
def translate_with_status(file_input, mic_input, src, tgt):
    """Translate whichever input is present, preferring the uploaded file.

    Args:
        file_input: Value of the file-upload component (falsy when empty).
        mic_input: Value of the microphone component (falsy when empty).
        src: Source language name.
        tgt: Target language name.

    Returns:
        The 3-tuple produced by ``translate_audio``, or a warning message with
        an empty translation and no audio when neither input is set.
    """
    chosen = file_input or mic_input
    if chosen:
        # Status updates are handled by the synchronous function.
        return translate_audio(chosen, src, tgt)
    return "โš ๏ธ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜๊ฑฐ๋‚˜ ๋…น์Œ์„ ํ•ด์ฃผ์„ธ์š”", "", None
495
+
496
+ btn1.click(
497
+ lambda: "์ฒ˜๋ฆฌ ์ค‘... ์ž ์‹œ๋งŒ ๊ธฐ๋‹ค๋ ค์ฃผ์„ธ์š” โณ",
498
+ outputs=status1
499
+ ).then(
500
+ translate_with_status,
501
+ [aud1_file, aud1_mic, src1, tgt1],
502
+ [o1, t1, a1]
503
+ ).then(
504
+ lambda: "โœ… ์™„๋ฃŒ!",
505
+ outputs=status1
506
+ )
507
 
508
  # ํƒญ 2 โ€“ PDF ๋ฒˆ์—ญ
509
  with gr.TabItem("๐Ÿ“„ PDF"):