# Page-scrape residue (not part of the module): "Spaces: Build error"
import asyncio
import subprocess
from pathlib import Path
from typing import List

import torchaudio
import webrtcvad
from yt_dlp import YoutubeDL

from .config import AUDIO_CACHE
# ---------------------------------------------------------------------------
# ffmpeg helpers
# ---------------------------------------------------------------------------
def _run(cmd: List[str]):
    """Run an external command and raise if it exits with a non-zero status.

    Args:
        cmd: Argument vector handed to ``subprocess.run`` (no shell involved).

    Raises:
        RuntimeError: The command exited non-zero; the message is the text
            the command wrote to stderr.
    """
    # Both streams are captured so the child's output stays off the parent's
    # console; only stderr is used, for the error message.
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        # Child output is not guaranteed to be valid UTF-8. Decode leniently
        # so the real failure is not masked by a UnicodeDecodeError.
        raise RuntimeError(proc.stderr.decode("utf-8", errors="replace"))
# ---------------------------------------------------------------------------
# Video → Audio
# ---------------------------------------------------------------------------
async def download_video(url: str, out_dir: Path) -> Path:
    """Download the media at *url* into *out_dir* using yt-dlp.

    The blocking yt-dlp call runs in the event loop's default executor so the
    loop is not blocked while the download is in progress.

    Args:
        url: Address handed to ``YoutubeDL.download``.
        out_dir: Directory that receives the file, saved as
            ``download.<ext>``. It is created if it does not exist yet.

    Returns:
        Path of the most recently modified ``download.*`` file in *out_dir*.

    Raises:
        RuntimeError: No ``download.*`` file exists after yt-dlp returns.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "outtmpl": str(out_dir / "download.%(ext)s"),
        "format": "bestvideo+bestaudio/best / best",
    }

    def _job():
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, _job)

    # A bare next() on an empty glob would raise StopIteration, which a
    # coroutine surfaces as an opaque RuntimeError; fail with a clear message.
    candidates = [p for p in out_dir.glob("download.*") if p.is_file()]
    if not candidates:
        raise RuntimeError(f"yt-dlp produced no file in {out_dir} for {url!r}")
    # Glob order is arbitrary and out_dir may still hold files from an earlier
    # call, so pick the file that was written last.
    return max(candidates, key=lambda p: p.stat().st_mtime)
async def extract_audio(video_path: Path, wav_path: Path, sr: int = 16000):
    """Extract the audio track of *video_path* into *wav_path* with ffmpeg.

    ffmpeg is invoked with ``-y`` (overwrite the output), ``-vn`` (drop
    video), ``-ac 1`` (one channel) and ``-ar <sr>`` (sample rate). The
    blocking call is pushed to the event loop's default executor; any failure
    raised by ``_run`` propagates to the caller.
    """
    source, target = str(video_path), str(wav_path)
    ffmpeg_args = ["ffmpeg", "-y", "-i", source]
    ffmpeg_args += ["-vn", "-ac", "1", "-ar", str(sr), target]
    await asyncio.get_running_loop().run_in_executor(None, _run, ffmpeg_args)
# ---------------------------------------------------------------------------
# VAD trimming (WebRTC)
# ---------------------------------------------------------------------------
def _frame_gen(frame_ms, pcm16, sr):
    """Yield consecutive fixed-size frames from a 16-bit PCM byte buffer.

    Args:
        frame_ms: Frame duration in milliseconds.
        pcm16: Raw PCM bytes, two bytes per sample (single channel).
        sr: Sample rate in Hz.

    Yields:
        Slices of *pcm16*, each exactly one frame long. Trailing bytes that
        do not fill a whole frame are dropped, because the VAD that consumes
        these frames only accepts complete ones.

    Raises:
        ValueError: *frame_ms* and *sr* describe less than one sample per
            frame (raised when iteration starts).
    """
    # Size the frame in whole samples first so a frame boundary can never
    # split a 2-byte sample.
    samples_per_frame = int(sr * frame_ms / 1000)
    if samples_per_frame <= 0:
        raise ValueError("frame_ms and sr must yield at least one sample per frame")
    frame_bytes = samples_per_frame * 2
    for start in range(0, len(pcm16) - frame_bytes + 1, frame_bytes):
        yield pcm16[start : start + frame_bytes]
def trim_silence(wav_path: Path, aggressiveness: int = 3) -> Path:
    """Strip leading and trailing non-speech from an audio file via WebRTC VAD.

    The audio is cut into 30 ms frames, each frame is classified with
    ``webrtcvad``, and everything before the first and after the last voiced
    frame is discarded. Non-speech between voiced frames is kept.

    Args:
        wav_path: Audio file readable by ``torchaudio.load``.
        aggressiveness: Mode passed to ``webrtcvad.Vad``.

    Returns:
        Path of the new ``<stem>_trim.wav`` written next to *wav_path*, or
        *wav_path* itself when no frame is classified as speech.
    """
    frame_ms = 30
    waveform, sr = torchaudio.load(str(wav_path))  # (channels, samples)

    # Down-mix for classification only; the saved output keeps all channels.
    mono = waveform.mean(dim=0)
    # Clamp before narrowing: full-scale +1.0 * 32768 does not fit in int16
    # and would otherwise wrap to the most negative value.
    pcm16 = (mono * 32768).clamp(-32768, 32767).short().numpy().tobytes()

    # Derive the frame size from the actual sample rate instead of assuming
    # 480 samples, which is only correct at 16 kHz.
    samples_per_frame = sr * frame_ms // 1000
    frame_bytes = samples_per_frame * 2

    vad = webrtcvad.Vad(aggressiveness)
    # The VAD rejects frames of the wrong length, so skip a short final frame.
    voiced = [
        vad.is_speech(frame, sr)
        for frame in _frame_gen(frame_ms, pcm16, sr)
        if len(frame) == frame_bytes
    ]
    if not any(voiced):
        return wav_path

    first = voiced.index(True)
    last = len(voiced) - 1 - voiced[::-1].index(True)
    kept = waveform[:, first * samples_per_frame : (last + 1) * samples_per_frame]

    out = wav_path.with_name(wav_path.stem + "_trim.wav")
    # Slice the loaded tensor directly; `torchaudio.tensor` does not exist.
    torchaudio.save(str(out), kept.contiguous(), sr)
    return out