Nelson committed on
Commit db1ac4f · unverified · 1 Parent(s): 24455af

Added app.py, packages.txt, and requirements.txt. First commit

Files changed (3)
  1. app.py +227 -0
  2. packages.txt +1 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,227 @@
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from typing import List, Tuple
+
+import gradio as gr
+from faster_whisper import WhisperModel
+import yt_dlp
+
+
+# --------- Config ---------
+# English-only, higher-accuracy than multilingual at similar size.
+# Quantized INT8 for CPU-friendly inference on free Spaces.
+MODEL_NAME = os.environ.get("ASR_MODEL", "Systran/faster-whisper-medium.en")
+MODEL_CACHE = os.environ.get("ASR_CACHE", "./models")
+COMPUTE_TYPE = os.environ.get("ASR_COMPUTE_TYPE", "int8")  # int8 is great for CPU
+DEFAULT_GROUP_CHARS = 280  # target characters per timestamped paragraph
+
+
+# --------- Utilities ---------
+def _format_ts(seconds: float) -> str:
+    if seconds is None:
+        return "00:00:00.000"
+    ms = int(round(seconds * 1000))
+    h = ms // 3600000
+    ms %= 3600000
+    m = ms // 60000
+    ms %= 60000
+    s = ms // 1000
+    ms %= 1000
+    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
+
+
+def _group_segments(segments, target_chars: int = DEFAULT_GROUP_CHARS) -> List[Tuple[float, float, str]]:
+    """
+    Combine short segments into paragraph-style groups with a single [start - end] timestamp.
+    Returns a list of tuples: (start_sec, end_sec, text).
+    """
+    groups = []
+    current_text = []
+    current_len = 0
+    group_start = None
+    last_end = None
+
+    for seg in segments:
+        txt = (seg.text or "").strip()
+        if not txt:
+            continue
+        if group_start is None:
+            group_start = seg.start
+        current_text.append(txt)
+        current_len += len(txt) + 1
+        last_end = seg.end
+
+        if current_len >= target_chars:
+            groups.append((group_start, last_end, " ".join(current_text).strip()))
+            current_text, current_len, group_start = [], 0, None
+
+    if current_text:
+        groups.append((group_start or 0.0, last_end or 0.0, " ".join(current_text).strip()))
+    return groups
+
+
+def _download_youtube_audio(url: str, tmpdir: str) -> Tuple[str, str]:
+    """
+    Downloads bestaudio and extracts to mp3 via ffmpeg.
+    Returns (audio_path, title).
+    """
+    ydl_opts = {
+        "format": "bestaudio/best",
+        "outtmpl": os.path.join(tmpdir, "%(id)s.%(ext)s"),
+        "noplaylist": True,
+        "quiet": True,
+        "no_warnings": True,
+        "restrictfilenames": True,
+        "postprocessors": [
+            {
+                "key": "FFmpegExtractAudio",
+                "preferredcodec": "mp3",
+                "preferredquality": "192",
+            }
+        ],
+        "prefer_ffmpeg": True,
+        "cachedir": False,
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=True)
+        vid = info.get("id")
+        title = info.get("title") or "YouTube Audio"
+        candidate = os.path.join(tmpdir, f"{vid}.mp3")
+        if os.path.exists(candidate):
+            return candidate, title
+
+    # Fallback: first mp3 in tmpdir
+    for p in Path(tmpdir).glob("*.mp3"):
+        return str(p), title
+    raise RuntimeError("Failed to download and extract audio.")
+
+
+# --------- Model (lazy init) ---------
+_model = None
+
+
+def _get_model():
+    global _model
+    if _model is None:
+        _model = WhisperModel(
+            MODEL_NAME,
+            device="cpu",
+            compute_type=COMPUTE_TYPE,
+            download_root=MODEL_CACHE,
+            cpu_threads=max(1, os.cpu_count() or 2),
+        )
+    return _model
+
+
+# --------- Core Inference ---------
+def transcribe_from_youtube(
+    url: str,
+    output_mode: str,
+    group_target_chars: int = DEFAULT_GROUP_CHARS,
+    beam_size: int = 5,
+    vad_filter: bool = True,
+    progress: gr.Progress = gr.Progress()
+):
+    if not url or not url.strip():
+        raise gr.Error("Please paste a valid YouTube video URL.")
+
+    progress(0.02, desc="Preparing…")
+    tmpdir = tempfile.mkdtemp(prefix="asr_")
+    audio_path = None
+
+    try:
+        progress(0.10, desc="Downloading audio from YouTube…")
+        audio_path, title = _download_youtube_audio(url.strip(), tmpdir)
+
+        progress(0.30, desc="Loading ASR model… (first time may take a bit)")
+        model = _get_model()
+
+        progress(0.45, desc="Transcribing audio…")
+        segments_iter, info = model.transcribe(
+            audio_path,
+            language="en",  # Force English-only
+            task="transcribe",
+            beam_size=beam_size,
+            vad_filter=vad_filter,
+            vad_parameters={"min_silence_duration_ms": 500},
+        )
+
+        segments = list(segments_iter)
+
+        # Build outputs
+        plain_text = " ".join((seg.text or "").strip() for seg in segments).strip()
+
+        if output_mode == "Timestamped (grouped)":
+            groups = _group_segments(segments, max(40, int(group_target_chars)))
+            lines = []
+            for start, end, text in groups:
+                lines.append(f"[{_format_ts(start)} - {_format_ts(end)}] {text}")
+            ts_text = "\n".join(lines).strip()
+            return title, ts_text, plain_text
+        else:
+            # Plain transcript only
+            return title, "", plain_text
+
+    except Exception as e:
+        raise gr.Error(f"Transcription failed: {e}")
+    finally:
+        try:
+            if audio_path and os.path.exists(audio_path):
+                os.remove(audio_path)
+            shutil.rmtree(tmpdir, ignore_errors=True)
+        except Exception:
+            pass
+
+
+# --------- UI ---------
+with gr.Blocks(title="YouTube → English Transcript") as demo:
+    gr.Markdown(
+        """
+        # 🎧 YouTube → English Transcript
+        Paste a YouTube link and get either a **timestamped transcript (grouped paragraphs)** or a **plain transcript**.
+
+        - Model: `Systran/faster-whisper-medium.en` (English-only, INT8 for CPU)
+        - Works on free Hugging Face Spaces (CPU).
+        """
+    )
+
+    with gr.Row():
+        url = gr.Textbox(
+            label="YouTube URL",
+            placeholder="https://www.youtube.com/watch?v=...",
+            lines=1,
+        )
+
+    with gr.Row():
+        output_mode = gr.Radio(
+            ["Timestamped (grouped)", "Plain transcript"],
+            value="Timestamped (grouped)",
+            label="Output Format",
+        )
+
+    with gr.Accordion("Advanced (optional)", open=False):
+        group_chars = gr.Slider(120, 1200, value=DEFAULT_GROUP_CHARS, step=20,
+                                label="Target characters per group (for timestamped output)")
+        beam_size = gr.Slider(1, 8, value=5, step=1, label="Beam size")
+        vad_filter = gr.Checkbox(value=True, label="Voice Activity Detection (trim long silences)")
+
+    submit = gr.Button("Transcribe", variant="primary")
+    clear = gr.Button("Clear")
+
+    title_out = gr.Textbox(label="Video Title", interactive=False)
+    ts_out = gr.Textbox(label="Timestamped Transcript", lines=16)
+    plain_out = gr.Textbox(label="Plain Transcript", lines=16)
+
+    submit.click(
+        transcribe_from_youtube,
+        inputs=[url, output_mode, group_chars, beam_size, vad_filter],
+        outputs=[title_out, ts_out, plain_out]
+    )
+
+    clear.click(lambda: ("", "", "", "", "Timestamped (grouped)"), None, [url, title_out, ts_out, plain_out, output_mode])  # reset the Radio to its default choice rather than an empty string
+
+if __name__ == "__main__":
+    demo.launch()
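A minimal sketch (not part of this commit) for sanity-checking the timestamp and grouping helpers locally, without touching YouTube or loading the ASR model. It assumes the packages from requirements.txt are installed (importing app pulls in gradio, faster_whisper, and yt_dlp); FakeSeg is a hypothetical stand-in for the segment objects faster-whisper yields, of which only .text, .start, and .end are read:

    # smoke_test.py (hypothetical helper, run next to app.py)
    from dataclasses import dataclass

    from app import _format_ts, _group_segments

    @dataclass
    class FakeSeg:
        start: float
        end: float
        text: str

    segs = [
        FakeSeg(0.0, 2.5, "Hello and welcome."),
        FakeSeg(2.5, 6.0, "Today we talk about speech recognition."),
        FakeSeg(6.0, 9.0, "Specifically, running Whisper on CPU."),
    ]

    print(_format_ts(61.5))  # -> 00:01:01.500
    # Group into roughly 60-character paragraphs and print them in the app's timestamped format.
    for start, end, text in _group_segments(segs, target_chars=60):
        print(f"[{_format_ts(start)} - {_format_ts(end)}] {text}")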
packages.txt ADDED
@@ -0,0 +1 @@
+ffmpeg
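On Hugging Face Spaces, packages.txt lists Debian packages to install into the container; ffmpeg is needed here because yt-dlp's FFmpegExtractAudio postprocessor shells out to the ffmpeg binary to produce the mp3 that app.py then feeds to faster-whisper. A small optional check (a sketch, not in the commit) that fails fast when the binary is missing:

    import shutil

    # Fail fast if ffmpeg is not on PATH; yt-dlp's FFmpegExtractAudio postprocessor needs it.
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found: add it to packages.txt (Spaces) or install it locally.")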
requirements.txt ADDED
@@ -0,0 +1,5 @@
+gradio
+yt-dlp
+faster-whisper
+ctranslate2
+numpy