"""Gradio app that transcribes an uploaded video with WhisperX on Replicate.

The uploaded video's audio track is extracted to a temporary MP3 file, sent
to the hosted WhisperX model, and the resulting text is shown in a textbox.
"""

import os
import tempfile

import gradio as gr
import replicate
from moviepy import VideoFileClip

REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")

# Pinned model version so results do not change when the model is updated.
WHISPERX_MODEL = (
    "victor-upmeet/whisperx:"
    "84d2ad2d6194fe98a17d2b60bef1c7f910c46b2f6fd38996ca457afd9c8abfcb"
)

# Model options. These must be sent inside the ``input`` payload; passing them
# as keyword arguments to ``run`` does not forward them to the model.
# NOTE(review): batch_size=512 is the value the original code intended to
# send; now that it is actually applied, lower it if the model runs out of
# memory on long recordings.
WHISPERX_OPTIONS = {
    "language": "en",
    "batch_size": 512,
    "align_output": False,
    "diarization": False,
}

NO_TRANSCRIPT_MESSAGE = "No transcription found."

CUSTOM_CSS = """
.centered-container {
    width: 80vw;
    min-width: 400px;
    max-width: 1400px;
    margin-left: auto !important;
    margin-right: auto !important;
    margin-top: 2.5em;
    margin-bottom: 2.5em;
    background: var(--block-background-fill);
    border-radius: 1.2em;
    box-shadow: 0 0 16px 0 #0001;
    padding: 2em 2em 2em 2em;
}
@media (max-width: 900px) {
    .centered-container {
        width: 98vw;
        padding: 1em 0.5em 1em 0.5em;
    }
}
.transcribe-btn-center {
    display: flex;
    justify-content: center;
    margin-top: 1em;
}
"""


def _extract_audio(video_path, audio_path):
    """Write the audio track of ``video_path`` to ``audio_path``.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError("the video has no audio track")
        video.audio.write_audiofile(audio_path, logger=None)
    finally:
        # Release the reader resources held by the clip.
        video.close()


def _transcript_text(output):
    """Turn the model output into a single transcript string.

    Accepts a dict carrying ``"segments"`` (and optionally ``"text"``), a bare
    list of segment dicts, a plain string, or an empty/``None`` result.
    """
    if isinstance(output, str):
        return output or NO_TRANSCRIPT_MESSAGE

    if isinstance(output, dict):
        segments = output.get("segments")
        fallback = output.get("text", NO_TRANSCRIPT_MESSAGE)
    else:
        segments = output
        fallback = NO_TRANSCRIPT_MESSAGE

    if not segments:
        return fallback
    # Strip each segment so joining does not produce doubled spaces when a
    # segment's text carries its own leading/trailing whitespace.
    return " ".join(seg["text"].strip() for seg in segments)


def process_video(video_file):
    """Transcribe the speech in an uploaded video.

    Args:
        video_file: Path to the uploaded video, or a falsy value when nothing
            was uploaded.

    Returns:
        The transcript text, or a human-readable message when no file was
        given, the audio could not be extracted, or nothing was transcribed.
    """
    if not video_file:
        return "No video file uploaded."

    temp_audio_file = None
    try:
        # Only reserve a unique path here; the handle is closed before the
        # audio is written so the file can be reopened on every platform.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
            temp_audio_file = temp_audio.name

        try:
            _extract_audio(video_file, temp_audio_file)
        except Exception as e:
            return f"Failed to extract audio from video: {e}"

        client = replicate.Client(api_token=REPLICATE_API_TOKEN)
        with open(temp_audio_file, "rb") as audio_f:
            output = client.run(
                WHISPERX_MODEL,
                input={"audio_file": audio_f, **WHISPERX_OPTIONS},
            )
    finally:
        # Always remove the temporary audio file, including on early return.
        if temp_audio_file and os.path.exists(temp_audio_file):
            os.remove(temp_audio_file)

    return _transcript_text(output)


with gr.Blocks(theme="monochrome", css=CUSTOM_CSS) as demo:
    with gr.Column(elem_classes="centered-container"):
        gr.Markdown("# Automatic Video Transcriber", elem_id="title")
        gr.Markdown(
            "## Upload a video file and click 'Transcribe' to begin.",
            elem_id="subtitle",
        )
        with gr.Row():
            # Left column: upload widget, action button and usage note.
            with gr.Column(scale=1, min_width=320):
                video_input = gr.Video(
                    label="Input Video File (.mp4)",
                    interactive=True,
                    sources=["upload"],
                )
                with gr.Row(elem_classes="transcribe-btn-center"):
                    transcribe_btn = gr.Button("Transcribe", scale=0)
                gr.Markdown(
                    "### Please note that file uploads may take a few minutes to process due to network rate limits. A local version of this app is available [here](https://github.com/sam-mata/video-transcriber).",
                    elem_id="note",
                )
            # Right column: read-only transcript output.
            with gr.Column(scale=1, min_width=320):
                text_output = gr.Textbox(
                    label="Raw Text Output",
                    show_copy_button=True,
                    lines=14,
                    interactive=False,
                )
        transcribe_btn.click(
            fn=process_video,
            inputs=video_input,
            outputs=text_output,
        )

demo.launch(max_file_size="200MB")