Spaces:
Running
Running
Update app.py
Browse filesfourth changes
app.py
CHANGED
@@ -1,36 +1,125 @@
|
|
1 |
import gradio as gr
|
2 |
import pytube
|
3 |
from transformers import pipeline
|
|
|
|
|
4 |
|
5 |
# Initialize pipelines
|
6 |
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base", chunk_length_s=30)
|
7 |
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
8 |
|
9 |
-
def
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
# Build Gradio app
|
29 |
-
with gr.Blocks() as demo:
|
30 |
-
gr.Markdown("## π Multi
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
btn.click(summarize_youtube, inputs=url_input, outputs=[vid, txt, summ])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
|
|
|
|
1 |
import gradio as gr
|
2 |
import pytube
|
3 |
from transformers import pipeline
|
4 |
+
import os
|
5 |
+
import re
|
6 |
|
7 |
# Initialize pipelines
|
8 |
# Speech-to-text pipeline using the openai/whisper-base checkpoint.
# chunk_length_s=30 asks the pipeline to process the audio in 30-second chunks.
# Created at module level, so the model is loaded once when the module is imported.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base", chunk_length_s=30)

# Summarization pipeline using the facebook/bart-large-cnn checkpoint.
# Also created once at module level and shared by every request.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
10 |
|
11 |
+
# An 11-character YouTube video ID that directly follows "v=" or "/", and is
# NOT followed by another ID character. The negative lookahead stops the first
# 11 characters of a longer token (e.g. a channel ID) being mistaken for a
# video ID. This single pattern covers watch?v=, youtu.be/, /embed/, /v/ and
# /shorts/ style URLs.
_VIDEO_ID_RE = re.compile(r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])')


def extract_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Args:
        url: The URL to inspect. Non-string or empty values are tolerated.

    Returns:
        The video ID as a string, or ``None`` when no ID can be found
        (including when ``url`` is not a string).
    """
    if not isinstance(url, str):
        return None

    match = _VIDEO_ID_RE.search(url)
    return match.group(1) if match else None
|
23 |
|
24 |
+
def _summarize_transcript(transcript):
    """Return a summary of *transcript*, splitting it into chunks when long."""
    # 1024 mirrors BART's maximum input length, but note it is compared with
    # the transcript's CHARACTER count here, so it is a conservative heuristic.
    max_chunk = 1024
    if len(transcript) <= max_chunk:
        return summarizer(transcript, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]

    # Long transcript: summarize 200-word chunks independently and join them.
    words = transcript.split()
    chunks = [' '.join(words[i:i + 200]) for i in range(0, len(words), 200)]
    summaries = [
        summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
        for chunk in chunks[:3]  # Limit to first 3 chunks to avoid timeout
        if len(chunk.strip()) > 50  # skip fragments too small to summarize
    ]
    return " ".join(summaries)


def summarize_youtube(url):
    """Download a YouTube video's audio, transcribe it and summarize it.

    Args:
        url: The YouTube video URL entered by the user.

    Returns:
        A tuple of three strings ``(embed_html, transcript, summary)``.
        On failure the function does not raise; the three slots instead carry
        an error message, a hint, and a placeholder summary.

    NOTE(review): the leading "β" in the error strings looks like a
    mis-decoded emoji; confirm the intended glyph against the real file.
    """
    try:
        # Clean up any existing audio file
        if os.path.exists("audio.mp4"):
            os.remove("audio.mp4")

        # Create YouTube object with error handling
        yt = pytube.YouTube(url, use_oauth=False, allow_oauth_cache=False)

        # Prefer mp4 audio; fall back to any audio-only stream
        audio_streams = yt.streams.filter(only_audio=True, file_extension='mp4')
        if not audio_streams:
            audio_streams = yt.streams.filter(only_audio=True)

        if not audio_streams:
            return "β Error: No audio streams available", "Could not extract audio from video", "No summary available"

        stream = audio_streams.first()

        # Download; the returned path is what gets transcribed and removed
        audio_file = stream.download(filename="audio")

        # Transcribe, always removing the downloaded file afterwards so a
        # failed transcription does not leave audio behind on disk.
        try:
            transcript = asr(audio_file)["text"]
        finally:
            if os.path.exists(audio_file):
                os.remove(audio_file)

        # Check transcript length for summarization
        if len(transcript.split()) < 10:
            return "β Error: Transcript too short", transcript, "Cannot summarize - transcript too brief"

        summary = _summarize_transcript(transcript)

        # Create embed HTML
        v_id = extract_video_id(url)
        if v_id:
            embed_html = f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{v_id}" frameborder="0" allowfullscreen></iframe>'
        else:
            embed_html = "β Could not extract video ID"

        return embed_html, transcript, summary

    except pytube.exceptions.RegexMatchError:
        return "β Error: Invalid YouTube URL", "Please check the URL format", "No summary available"
    except pytube.exceptions.VideoUnavailable:
        return "β Error: Video unavailable", "Video may be private or deleted", "No summary available"
    except Exception as e:
        # UI boundary: surface any other failure as text instead of crashing.
        return f"β Error: {str(e)}", "An error occurred during processing", "No summary available"
|
91 |
|
92 |
# Build Gradio app
|
93 |
+
# Declarative Gradio UI. Components are created in the order they should
# appear, so statement order here is significant.
# NOTE(review): the nesting below was reconstructed from a flattened diff view
# in which indentation was lost; confirm it against the real app.py.
# NOTE(review): the "π" glyphs in the labels look like mis-decoded emoji;
# confirm the intended characters.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## π Multi-lingual YouTube Summarizer (Hindi / Hinglish / English)")
    gr.Markdown("Enter a YouTube URL to get an AI-generated summary of the video content.")

    # Input row: URL textbox and the button that triggers processing
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                lines=1
            )
            btn = gr.Button("π Summarize Video", variant="primary")

    # Output row: video embed in one column, transcript and summary in the other
    with gr.Row():
        with gr.Column():
            vid = gr.HTML(label="Video Player")
        with gr.Column():
            # Transcript accordion is created with open=False
            with gr.Accordion("π Transcript", open=False):
                txt = gr.Textbox(label="Full Transcript", lines=10, max_lines=15)
            summ = gr.Textbox(label="π Summary", lines=5)

    # summarize_youtube returns three values, mapped in order to vid, txt, summ
    btn.click(summarize_youtube, inputs=url_input, outputs=[vid, txt, summ])

    # Add examples
    gr.Examples(
        examples=[
            ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],  # Replace with actual examples
        ],
        inputs=url_input
    )

# Launch only when run as a script, not when imported.
# NOTE(review): share=True requests a public share link; this is presumably
# unnecessary when hosted on Hugging Face Spaces. Confirm before keeping it.
if __name__ == "__main__":
    demo.launch(share=True)
|