Nelson committed on
Commit db1ac4f · unverified · 1 Parent(s): 24455af

Added app.py, packages.txt, and requirements.txt. First commit

Files changed (3)
  1. app.py +227 -0
  2. packages.txt +1 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,227 @@
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from typing import List, Tuple
+
+import gradio as gr
+from faster_whisper import WhisperModel
+import yt_dlp
+
+
+# --------- Config ---------
+# English-only, higher-accuracy than multilingual at similar size.
+# Quantized INT8 for CPU-friendly inference on free Spaces.
+MODEL_NAME = os.environ.get("ASR_MODEL", "Systran/faster-whisper-medium.en")
+MODEL_CACHE = os.environ.get("ASR_CACHE", "./models")
+COMPUTE_TYPE = os.environ.get("ASR_COMPUTE_TYPE", "int8")  # int8 is great for CPU
+DEFAULT_GROUP_CHARS = 280  # target characters per timestamped paragraph
+
+
+# --------- Utilities ---------
+def _format_ts(seconds: float) -> str:
+    if seconds is None:
+        return "00:00:00.000"
+    ms = int(round(seconds * 1000))
+    h = ms // 3600000
+    ms %= 3600000
+    m = ms // 60000
+    ms %= 60000
+    s = ms // 1000
+    ms %= 1000
+    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
+
+
+def _group_segments(segments, target_chars: int = DEFAULT_GROUP_CHARS) -> List[Tuple[float, float, str]]:
+    """
+    Combine short segments into paragraph-style groups with a single [start - end] timestamp.
+    Returns a list of tuples: (start_sec, end_sec, text).
+    """
+    groups = []
+    current_text = []
+    current_len = 0
+    group_start = None
+    last_end = None
+
+    for seg in segments:
+        txt = (seg.text or "").strip()
+        if not txt:
+            continue
+        if group_start is None:
+            group_start = seg.start
+        current_text.append(txt)
+        current_len += len(txt) + 1
+        last_end = seg.end
+
+        if current_len >= target_chars:
+            groups.append((group_start, last_end, " ".join(current_text).strip()))
+            current_text, current_len, group_start = [], 0, None
+
+    if current_text:
+        groups.append((group_start or 0.0, last_end or 0.0, " ".join(current_text).strip()))
+    return groups
+
+
+def _download_youtube_audio(url: str, tmpdir: str) -> Tuple[str, str]:
+    """
+    Downloads bestaudio and extracts to mp3 via ffmpeg.
+    Returns (audio_path, title).
+    """
+    ydl_opts = {
+        "format": "bestaudio/best",
+        "outtmpl": os.path.join(tmpdir, "%(id)s.%(ext)s"),
+        "noplaylist": True,
+        "quiet": True,
+        "no_warnings": True,
+        "restrictfilenames": True,
+        "postprocessors": [
+            {
+                "key": "FFmpegExtractAudio",
+                "preferredcodec": "mp3",
+                "preferredquality": "192",
+            }
+        ],
+        "prefer_ffmpeg": True,
+        "cachedir": False,
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=True)
+        vid = info.get("id")
+        title = info.get("title") or "YouTube Audio"
+        candidate = os.path.join(tmpdir, f"{vid}.mp3")
+        if os.path.exists(candidate):
+            return candidate, title
+
+    # Fallback: first mp3 in tmpdir
+    for p in Path(tmpdir).glob("*.mp3"):
+        return str(p), title
+    raise RuntimeError("Failed to download and extract audio.")
+
+
+# --------- Model (lazy init) ---------
+_model = None
+
+
+def _get_model():
+    global _model
+    if _model is None:
+        _model = WhisperModel(
+            MODEL_NAME,
+            device="cpu",
+            compute_type=COMPUTE_TYPE,
+            download_root=MODEL_CACHE,
+            cpu_threads=max(1, os.cpu_count() or 2),
+        )
+    return _model
+
+
+# --------- Core Inference ---------
+def transcribe_from_youtube(
+    url: str,
+    output_mode: str,
+    group_target_chars: int = DEFAULT_GROUP_CHARS,
+    beam_size: int = 5,
+    vad_filter: bool = True,
+    progress: gr.Progress = gr.Progress()
+):
+    if not url or not url.strip():
+        raise gr.Error("Please paste a valid YouTube video URL.")
+
+    progress(0.02, desc="Preparing…")
+    tmpdir = tempfile.mkdtemp(prefix="asr_")
+    audio_path = None
+
+    try:
+        progress(0.10, desc="Downloading audio from YouTube…")
+        audio_path, title = _download_youtube_audio(url.strip(), tmpdir)
+
+        progress(0.30, desc="Loading ASR model… (first time may take a bit)")
+        model = _get_model()
+
+        progress(0.45, desc="Transcribing audio…")
+        segments_iter, info = model.transcribe(
+            audio_path,
+            language="en",  # Force English-only
+            task="transcribe",
+            beam_size=beam_size,
+            vad_filter=vad_filter,
+            vad_parameters={"min_silence_duration_ms": 500},
+        )
+
+        segments = list(segments_iter)
+
+        # Build outputs
+        plain_text = " ".join((seg.text or "").strip() for seg in segments).strip()
+
+        if output_mode == "Timestamped (grouped)":
+            groups = _group_segments(segments, max(40, int(group_target_chars)))
+            lines = []
+            for start, end, text in groups:
+                lines.append(f"[{_format_ts(start)} - {_format_ts(end)}] {text}")
+            ts_text = "\n".join(lines).strip()
+            return title, ts_text, plain_text
+        else:
+            # Plain transcript only
+            return title, "", plain_text
+
+    except Exception as e:
+        raise gr.Error(f"Transcription failed: {e}")
+    finally:
+        try:
+            if audio_path and os.path.exists(audio_path):
+                os.remove(audio_path)
+            shutil.rmtree(tmpdir, ignore_errors=True)
+        except Exception:
+            pass
+
+
+# --------- UI ---------
+with gr.Blocks(title="YouTube → English Transcript") as demo:
+    gr.Markdown(
+        """
+        # 🎧 YouTube → English Transcript
+        Paste a YouTube link and get either a **timestamped transcript (grouped paragraphs)** or a **plain transcript**.
+
+        - Model: `Systran/faster-whisper-medium.en` (English-only, INT8 for CPU)
+        - Works on free Hugging Face Spaces (CPU).
+        """
+    )
+
+    with gr.Row():
+        url = gr.Textbox(
+            label="YouTube URL",
+            placeholder="https://www.youtube.com/watch?v=...",
+            lines=1,
+        )
+
+    with gr.Row():
+        output_mode = gr.Radio(
+            ["Timestamped (grouped)", "Plain transcript"],
+            value="Timestamped (grouped)",
+            label="Output Format",
+        )
+
+    with gr.Accordion("Advanced (optional)", open=False):
+        group_chars = gr.Slider(120, 1200, value=DEFAULT_GROUP_CHARS, step=20,
+                                label="Target characters per group (for timestamped output)")
+        beam_size = gr.Slider(1, 8, value=5, step=1, label="Beam size")
+        vad_filter = gr.Checkbox(value=True, label="Voice Activity Detection (trim long silences)")
+
+    submit = gr.Button("Transcribe", variant="primary")
+    clear = gr.Button("Clear")
+
+    title_out = gr.Textbox(label="Video Title", interactive=False)
+    ts_out = gr.Textbox(label="Timestamped Transcript", lines=16)
+    plain_out = gr.Textbox(label="Plain Transcript", lines=16)
+
+    submit.click(
+        transcribe_from_youtube,
+        inputs=[url, output_mode, group_chars, beam_size, vad_filter],
+        outputs=[title_out, ts_out, plain_out]
+    )
+
+    clear.click(lambda: ("", "", "", "", "Timestamped (grouped)"), None, [url, title_out, ts_out, plain_out, output_mode])  # reset the Radio to its default choice rather than an empty string
+
+if __name__ == "__main__":
+    demo.launch()
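A minimal sketch (not part of this commit) for sanity-checking the timestamp and grouping helpers locally, without touching YouTube or loading the ASR model. It assumes the packages from requirements.txt are installed (importing app pulls in gradio, faster_whisper, and yt_dlp); FakeSeg is a hypothetical stand-in for the segment objects faster-whisper yields, of which only .text, .start, and .end are read:

    # smoke_test.py (hypothetical helper, run next to app.py)
    from dataclasses import dataclass

    from app import _format_ts, _group_segments

    @dataclass
    class FakeSeg:
        start: float
        end: float
        text: str

    segs = [
        FakeSeg(0.0, 2.5, "Hello and welcome."),
        FakeSeg(2.5, 6.0, "Today we talk about speech recognition."),
        FakeSeg(6.0, 9.0, "Specifically, running Whisper on CPU."),
    ]

    print(_format_ts(61.5))  # -> 00:01:01.500
    # Group into roughly 60-character paragraphs and print them in the app's timestamped format.
    for start, end, text in _group_segments(segs, target_chars=60):
        print(f"[{_format_ts(start)} - {_format_ts(end)}] {text}")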
packages.txt ADDED
@@ -0,0 +1 @@
+ffmpeg
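On Hugging Face Spaces, packages.txt lists Debian packages to install into the container; ffmpeg is needed here because yt-dlp's FFmpegExtractAudio postprocessor shells out to the ffmpeg binary to produce the mp3 that app.py then feeds to faster-whisper. A small optional check (a sketch, not in the commit) that fails fast when the binary is missing:

    import shutil

    # Fail fast if ffmpeg is not on PATH; yt-dlp's FFmpegExtractAudio postprocessor needs it.
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found: add it to packages.txt (Spaces) or install it locally.")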
requirements.txt ADDED
@@ -0,0 +1,5 @@
+gradio
+yt-dlp
+faster-whisper
+ctranslate2
+numpy