File size: 3,324 Bytes
ee1c1bd
 
 
174bd6d
 
ee1c1bd
 
 
174bd6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be6da01
493e130
 
 
174bd6d
 
 
 
ee1c1bd
174bd6d
ee1c1bd
 
 
174bd6d
 
 
 
be6da01
174bd6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f9d625
174bd6d
 
 
 
 
 
493e130
174bd6d
 
 
1f9d625
174bd6d
 
 
 
be6da01
174bd6d
 
 
 
 
 
 
ee1c1bd
493e130
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import gradio as gr
import replicate
import os
import tempfile
from moviepy import VideoFileClip

# Replicate credential, read once at import time; None when the variable is unset.
REPLICATE_API_TOKEN = os.environ.get("REPLICATE_API_TOKEN")

def _transcript_from_output(output):
    """Flatten a transcription result into a single string.

    ``output`` is either a dict (segments read from its "segments" key, with
    its "text" key as fallback) or is itself treated as the segment sequence.
    Each segment must be indexable by "text".
    """
    if isinstance(output, dict):
        segments = output.get("segments")
        fallback = output.get("text", "No transcription found.")
    else:
        # A non-dict result has no "text" key to fall back on; an empty or
        # missing result must not be dereferenced with .get().
        segments = output
        fallback = "No transcription found."
    if not segments:
        return fallback
    return " ".join(seg["text"] for seg in segments)


def process_video(video_file):
    """Transcribe the audio track of an uploaded video.

    The audio is written to a temporary .mp3 file, sent to the WhisperX model
    through Replicate, and the returned segment texts are joined with spaces.
    The temporary file is always removed before returning.

    Args:
        video_file: Path of the uploaded video; any falsy value means that
            nothing was uploaded.

    Returns:
        The transcript, or a human-readable message when no file was given,
        audio extraction failed, or the model returned nothing.

    Raises:
        Whatever the Replicate client raises when the prediction fails;
        only audio-extraction errors are converted into a message.
    """
    if not video_file:
        return "No video file uploaded."
    temp_audio_file = None
    try:
        # Reserve a path only: the handle is closed immediately so the audio
        # writer can open the file itself.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
            temp_audio_file = temp_audio.name
        try:
            # Context manager closes the clip and releases its reader.
            with VideoFileClip(video_file) as video:
                if video.audio is None:
                    return "Failed to extract audio from video: the video has no audio track."
                video.audio.write_audiofile(temp_audio_file, logger=None)
        except Exception as e:
            return f"Failed to extract audio from video: {e}"

        # The token is supplied through an explicit client; `run` itself has
        # no api_token option.
        client = replicate.Client(api_token=REPLICATE_API_TOKEN)
        with open(temp_audio_file, "rb") as audio_f:
            output = client.run(
                "victor-upmeet/whisperx:84d2ad2d6194fe98a17d2b60bef1c7f910c46b2f6fd38996ca457afd9c8abfcb",
                # Model options must live inside `input`; keyword arguments to
                # `run` are prediction options and never reach the model.
                # NOTE(review): these values were previously not forwarded, so
                # batch_size=512 has not been exercised against the model; confirm.
                input={
                    "audio_file": audio_f,
                    "language": "en",
                    "batch_size": 512,
                    "align_output": False,
                    "diarization": False,
                },
            )
    finally:
        if temp_audio_file and os.path.exists(temp_audio_file):
            os.remove(temp_audio_file)

    return _transcript_from_output(output)

# Stylesheet for the page: a centred, width-limited card that goes nearly
# full-width on narrow screens, plus a flex row that centres the button.
_PAGE_CSS = """
.centered-container {
    width: 80vw;
    min-width: 400px;
    max-width: 1400px;
    margin-left: auto !important;
    margin-right: auto !important;
    margin-top: 2.5em;
    margin-bottom: 2.5em;
    background: var(--block-background-fill);
    border-radius: 1.2em;
    box-shadow: 0 0 16px 0 #0001;
    padding: 2em 2em 2em 2em;
}
@media (max-width: 900px) {
    .centered-container {
        width: 98vw;
        padding: 1em 0.5em 1em 0.5em;
    }
}
.transcribe-btn-center {
    display: flex;
    justify-content: center;
    margin-top: 1em;
}
"""

# Notice rendered underneath the Transcribe button.
_UPLOAD_NOTE = "### Please note that file uploads may take a few minutes to process due to network rate limits. A local version of this app is available [here](https://github.com/sam-mata/video-transcriber)."

with gr.Blocks(theme="monochrome", css=_PAGE_CSS) as demo:
    with gr.Column(elem_classes="centered-container"):
        # Heading and sub-heading.
        gr.Markdown("# Automatic Video Transcriber", elem_id="title")
        gr.Markdown(
            "## Upload a video file and click 'Transcribe' to begin.",
            elem_id="subtitle",
        )

        with gr.Row():
            # Left half: upload widget, trigger button and the notice.
            with gr.Column(scale=1, min_width=320):
                video_input = gr.Video(
                    sources=["upload"],
                    interactive=True,
                    label="Input Video File (.mp4)",
                )
                with gr.Row(elem_classes="transcribe-btn-center"):
                    transcribe_btn = gr.Button("Transcribe", scale=0)
                gr.Markdown(_UPLOAD_NOTE, elem_id="note")

            # Right half: read-only transcript box.
            with gr.Column(scale=1, min_width=320):
                text_output = gr.Textbox(
                    interactive=False,
                    lines=14,
                    show_copy_button=True,
                    label="Raw Text Output",
                )

        # Clicking the button runs process_video on the uploaded video and
        # writes its return value into the transcript box.
        transcribe_btn.click(
            fn=process_video,
            inputs=[video_input],
            outputs=[text_output],
        )

# Start the app; launch is called with a 200MB max_file_size.
demo.launch(max_file_size="200MB")