divython committed on
Commit 943d5b9 · verified · 1 Parent(s): c64a626

Update app.py

Files changed (1)
  1. app.py +303 -78

app.py CHANGED
@@ -1,19 +1,32 @@
  import gradio as gr
- import pytube
- from transformers import pipeline
- import os
  import re

- # Initialize pipelines
- asr = pipeline("automatic-speech-recognition", model="openai/whisper-base", chunk_length_s=30)
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

  def extract_video_id(url):
      """Extract video ID from various YouTube URL formats"""
      patterns = [
          r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
          r'(?:embed\/)([0-9A-Za-z_-]{11})',
-         r'(?:v\/)([0-9A-Za-z_-]{11})'
      ]
      for pattern in patterns:
          match = re.search(pattern, url)
@@ -21,105 +34,317 @@ def extract_video_id(url):
              return match.group(1)
      return None

- def summarize_youtube(url):
      try:
-         # Clean up any existing audio file
-         if os.path.exists("audio.mp4"):
-             os.remove("audio.mp4")
-
-         # Create YouTube object with error handling
-         yt = pytube.YouTube(url, use_oauth=False, allow_oauth_cache=False)

-         # Get audio stream with better filtering
-         audio_streams = yt.streams.filter(only_audio=True, file_extension='mp4')
-         if not audio_streams:
-             # Fallback to any audio stream
-             audio_streams = yt.streams.filter(only_audio=True)

-         if not audio_streams:
-             return "❌ Error: No audio streams available", "Could not extract audio from video", "No summary available"

-         stream = audio_streams.first()

-         # Download with proper filename
-         audio_file = stream.download(filename="audio")
-
-         # Transcribe
-         result = asr(audio_file)
-         transcript = result["text"]

-         # Clean up audio file
-         if os.path.exists(audio_file):
-             os.remove(audio_file)

-         # Check transcript length for summarization
-         if len(transcript.split()) < 10:
-             return "❌ Error: Transcript too short", transcript, "Cannot summarize - transcript too brief"

-         # Summarize with better parameters
-         max_chunk = 1024  # BART's max input length
-         if len(transcript) > max_chunk:
-             # Split transcript into chunks if too long
-             words = transcript.split()
-             chunks = [' '.join(words[i:i+200]) for i in range(0, len(words), 200)]
              summaries = []

-             for chunk in chunks[:3]:  # Limit to first 3 chunks to avoid timeout
-                 if len(chunk.strip()) > 50:
-                     chunk_summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
-                     summaries.append(chunk_summary)

-             summary = " ".join(summaries)
          else:
-             summary = summarizer(transcript, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
-
-         # Create embed HTML
-         v_id = extract_video_id(url)
-         if v_id:
-             embed_html = f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{v_id}" frameborder="0" allowfullscreen></iframe>'
-         else:
-             embed_html = "❌ Could not extract video ID"
-
-         return embed_html, transcript, summary
-
-     except pytube.exceptions.RegexMatchError:
-         return "❌ Error: Invalid YouTube URL", "Please check the URL format", "No summary available"
-     except pytube.exceptions.VideoUnavailable:
-         return "❌ Error: Video unavailable", "Video may be private or deleted", "No summary available"
      except Exception as e:
-         return f"❌ Error: {str(e)}", "An error occurred during processing", "No summary available"

- # Build Gradio app
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("## 🎓 Multi-lingual YouTube Summarizer (Hindi / Hinglish / English)")
-     gr.Markdown("Enter a YouTube URL to get an AI-generated summary of the video content.")

      with gr.Row():
-         with gr.Column():
              url_input = gr.Textbox(
-                 label="YouTube URL",
                  placeholder="https://www.youtube.com/watch?v=...",
-                 lines=1
              )
-             btn = gr.Button("🚀 Summarize Video", variant="primary")

      with gr.Row():
          with gr.Column():
-             vid = gr.HTML(label="Video Player")
          with gr.Column():
-             with gr.Accordion("📝 Transcript", open=False):
-                 txt = gr.Textbox(label="Full Transcript", lines=10, max_lines=15)
-             summ = gr.Textbox(label="📋 Summary", lines=5)

-     btn.click(summarize_youtube, inputs=url_input, outputs=[vid, txt, summ])

-     # Add examples
      gr.Examples(
          examples=[
-             ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],  # Replace with actual examples
          ],
-         inputs=url_input
      )

  if __name__ == "__main__":
-     demo.launch(share=True)

  import gradio as gr
  import re
+ import requests
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+ from youtube_transcript_api import YouTubeTranscriptApi
+ import torch
+ import gc

+ # Optimize for HuggingFace Spaces - Use smaller models and efficient loading
+ print("🚀 Loading models for HuggingFace Spaces...")
+
+ # Use smaller, efficient models
+ @torch.no_grad()
+ def load_summarizer():
+     model_name = "facebook/bart-large-cnn"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
+     return pipeline("summarization", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
+
+ # Initialize summarizer
+ summarizer = load_summarizer()
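+ # Note: @torch.no_grad() above only wraps model *loading*; generation itself
+ # already runs gradient-free inside the pipeline. float16 halves GPU memory,
+ # while the CPU path keeps float32 (half precision is slow on most CPUs).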

  def extract_video_id(url):
      """Extract video ID from various YouTube URL formats"""
      patterns = [
          r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
          r'(?:embed\/)([0-9A-Za-z_-]{11})',
+         r'(?:v\/)([0-9A-Za-z_-]{11})',
+         r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})'
      ]
      for pattern in patterns:
          match = re.search(pattern, url)
          if match:
              return match.group(1)
      return None
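+ # Example: both URL styles resolve to the same ID:
+ #   extract_video_id("https://youtu.be/dQw4w9WgXcQ")                -> "dQw4w9WgXcQ"
+ #   extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") -> "dQw4w9WgXcQ"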
 
+ def get_youtube_transcript(video_id):
+     """Get transcript using YouTube Transcript API - Most reliable for HF Spaces"""
      try:
+         # Priority order for languages (Hindi, English variants)
+         language_codes = ['hi', 'en', 'en-IN', 'en-US', 'en-GB']

+         transcript_data = None
+         used_language = None

+         # Try each language
+         for lang_code in language_codes:
+             try:
+                 transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang_code])
+                 transcript_data = transcript_list
+                 used_language = lang_code
+                 break
+             except:
+                 continue

+         # If specific languages fail, try auto-generated
+         if not transcript_data:
+             try:
+                 transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
+                 transcript_data = transcript_list
+                 used_language = "auto-detected"
+             except Exception as e:
+                 return None, f"No transcript available: {str(e)}"

+         # Process transcript
+         if transcript_data:
+             transcript_text = ' '.join([item['text'].replace('\n', ' ') for item in transcript_data])
+             # Clean up common transcript artifacts
+             transcript_text = re.sub(r'\[.*?\]', '', transcript_text)  # Remove [Music], [Applause] etc.
+             transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()  # Clean whitespace
+
+             return transcript_text, f"Transcript found in: {used_language}"

+         return None, "No transcript data found"

+     except Exception as e:
+         return None, f"Transcript API Error: {str(e)}"
+
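+ # Example (the video must have captions or auto-generated subtitles):
+ #   text, status = get_youtube_transcript("dQw4w9WgXcQ")
+ #   status -> "Transcript found in: en" (or "auto-detected" via the fallback)
+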
+ def chunk_text_for_summarization(text, max_chunk_size=800):
+     """Split text into chunks for summarization"""
+     sentences = text.replace('।', '.').split('.')  # Handle Hindi sentences
+     chunks = []
+     current_chunk = ""
+
+     for sentence in sentences:
+         sentence = sentence.strip()
+         if not sentence:
+             continue
+
+         # Check if adding this sentence would exceed limit
+         if len(current_chunk) + len(sentence) + 1 < max_chunk_size:
+             current_chunk += sentence + ". "
+         else:
+             if current_chunk:
+                 chunks.append(current_chunk.strip())
+             current_chunk = sentence + ". "
+
+     # Add the last chunk
+     if current_chunk:
+         chunks.append(current_chunk.strip())
+
+     return chunks
+
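+ # Example: sentences are re-joined with ". " until a chunk would hit the limit,
+ # and the Devanagari danda is normalized to a period first:
+ #   chunk_text_for_summarization("Hello there. कैसे हैं।") -> ["Hello there. कैसे हैं."]
+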
+ def summarize_text_optimized(text):
+     """Optimized summarization for HuggingFace Spaces"""
+     if not text or len(text.strip()) < 100:
+         return "Text too short to summarize (minimum 100 characters required)"
+
+     try:
+         # Clean memory before processing
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         gc.collect()

+         # For very long texts, chunk them
+         if len(text) > 1500:
+             chunks = chunk_text_for_summarization(text, max_chunk_size=900)
              summaries = []

+             # Process first 3 chunks to avoid timeout
+             for i, chunk in enumerate(chunks[:3]):
+                 if len(chunk.strip()) < 50:
+                     continue
+
+                 try:
+                     summary = summarizer(
+                         chunk,
+                         max_length=120,
+                         min_length=30,
+                         do_sample=False,
+                         num_beams=2,  # Reduced for speed
+                         length_penalty=1.0
+                     )[0]["summary_text"]
+                     summaries.append(summary)
+                 except Exception as chunk_error:
+                     print(f"Error processing chunk {i}: {chunk_error}")
+                     continue

+             if summaries:
+                 combined_summary = " ".join(summaries)
+                 # If combined summary is still too long, summarize it again
+                 if len(combined_summary) > 600:
+                     try:
+                         final_summary = summarizer(
+                             combined_summary,
+                             max_length=200,
+                             min_length=80,
+                             do_sample=False,
+                             num_beams=2
+                         )[0]["summary_text"]
+                         return final_summary
+                     except:
+                         return combined_summary
+                 return combined_summary
+             else:
+                 return "Could not generate summary from chunks"
          else:
+             # For shorter texts, direct summarization
+             summary = summarizer(
+                 text,
+                 max_length=150,
+                 min_length=50,
+                 do_sample=False,
+                 num_beams=2,
+                 length_penalty=1.0
+             )[0]["summary_text"]
+             return summary
+
      except Exception as e:
+         return f"Summarization error: {str(e)}"
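+ # Sizing note: ~900-character chunks stay well under BART's 1024-token input
+ # limit (English text averages roughly 4 characters per token).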
 
+ def process_youtube_video(url):
+     """Main processing function optimized for HuggingFace Spaces"""
+
+     # Input validation
+     if not url or not url.strip():
+         return "❌ Please enter a YouTube URL", "", "No summary available"
+
+     # Extract video ID
+     video_id = extract_video_id(url.strip())
+     if not video_id:
+         return "❌ Invalid YouTube URL format", "Please check the URL format", "No summary available"
+
+     # Update progress
+     progress_msg = "🔍 Extracting video transcript..."
+
+     # Get transcript
+     transcript, status = get_youtube_transcript(video_id)
+
+     if not transcript:
+         return (
+             "❌ Could not extract transcript",
+             f"Status: {status}\n\nThis video might not have captions/subtitles available.",
+             "Cannot generate summary without transcript"
+         )
+
+     # Generate summary
+     progress_msg = "🤖 Generating AI summary..."
+     summary = summarize_text_optimized(transcript)
+
+     # Create video embed
+     embed_html = f'''
+     <div style="text-align: center;">
+         <iframe width="560" height="315"
+                 src="https://www.youtube.com/embed/{video_id}"
+                 frameborder="0"
+                 allowfullscreen
+                 style="max-width: 100%; border-radius: 10px;">
+         </iframe>
+     </div>
+     '''
+
+     # Format transcript info
+     transcript_info = f"""📊 Processing Status: ✅ Success
+ 🎯 Method: YouTube Transcript API
+ 🌐 Language: {status}
+ 📝 Transcript Length: {len(transcript)} characters
+ 📄 Word Count: ~{len(transcript.split())} words
+
+ 📋 Full Transcript:
+ {transcript}"""
+
+     return embed_html, transcript_info, summary
+
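+ # Local smoke test outside Gradio (assumes the video has captions):
+ #   html, info, summary = process_youtube_video("https://youtu.be/dQw4w9WgXcQ")
+ #   print(summary)
+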
+ # Custom CSS for better UI
+ custom_css = """
+ #component-0 {
+     max-width: 900px;
+     margin: auto;
+ }
+ .gradio-container {
+     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+ }
+ """
+
+ # Create Gradio Interface optimized for HuggingFace Spaces
+ with gr.Blocks(css=custom_css, title="YouTube Video Summarizer", theme=gr.themes.Soft()) as demo:
+     gr.HTML("""
+     <div style="text-align: center; padding: 20px;">
+         <h1>🎓 YouTube Video Summarizer</h1>
+         <p style="font-size: 18px; color: #666;">
+             AI-powered summarization for Hindi, Hinglish & English videos
+         </p>
+         <p style="color: #888;">
+             Optimized for HuggingFace Spaces • Uses YouTube Transcript API
+         </p>
+     </div>
+     """)

      with gr.Row():
+         with gr.Column(scale=2):
              url_input = gr.Textbox(
+                 label="📺 YouTube URL",
                  placeholder="https://www.youtube.com/watch?v=...",
+                 lines=1,
+                 info="Paste any YouTube video URL here"
+             )
+
+         with gr.Column(scale=1):
+             submit_btn = gr.Button(
+                 "🚀 Summarize Video",
+                 variant="primary",
+                 size="lg"
              )

+     # Results section
      with gr.Row():
          with gr.Column():
+             video_embed = gr.HTML(label="📺 Video Player")
+
          with gr.Column():
+             summary_output = gr.Textbox(
+                 label="📋 AI Summary",
+                 lines=8,
+                 max_lines=12,
+                 info="AI-generated summary of the video content"
+             )
+
+     # Expandable transcript section
+     with gr.Accordion("📝 Full Transcript & Details", open=False):
+         transcript_output = gr.Textbox(
+             label="Complete Transcript",
+             lines=15,
+             max_lines=25,
+             info="Full video transcript with processing details"
+         )

+     # Examples section
+     gr.HTML("<h3 style='margin-top: 30px;'>🎯 Try these examples:</h3>")

      gr.Examples(
          examples=[
+             ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
+             ["https://youtu.be/dQw4w9WgXcQ"],
          ],
+         inputs=url_input,
+         label="Sample URLs"
+     )
+
+     # Info section
+     with gr.Accordion("ℹ️ How it works", open=False):
+         gr.Markdown("""
+         ### 🔧 How this tool works:
+
+         1. **Extract Video ID**: Parses the YouTube URL to get the video identifier
+         2. **Fetch Transcript**: Uses YouTube Transcript API to get captions/subtitles
+         3. **AI Summarization**: Processes text through the BART model for intelligent summarization
+         4. **Multi-language Support**: Handles Hindi, Hinglish, and English content
+
+         ### 📋 Supported Languages:
+         - 🇮🇳 **Hindi**: Full support for Hindi captions
+         - 🌐 **Hinglish**: Mixed Hindi-English content
+         - 🇺🇸 **English**: All English variants
+
+         ### ⚡ Optimizations for HuggingFace Spaces:
+         - Efficient model loading with memory management
+         - Chunked processing for long videos
+         - GPU acceleration when available
+         - Automatic text cleanup and formatting
+
+         ### ⚠️ Limitations:
+         - Requires videos to have captions/subtitles
+         - Processing time depends on transcript length
+         - Very long videos are chunked to prevent timeouts
+         """)
+
+     # Event handlers
+     submit_btn.click(
+         fn=process_youtube_video,
+         inputs=[url_input],
+         outputs=[video_embed, transcript_output, summary_output]
+     )
+
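+     # Pressing Enter in the textbox triggers the same handler as the button.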
+     url_input.submit(
+         fn=process_youtube_video,
+         inputs=[url_input],
+         outputs=[video_embed, transcript_output, summary_output]
      )

+ # Launch configuration for HuggingFace Spaces
  if __name__ == "__main__":
+     demo.queue(concurrency_count=2)  # Limit concurrent users for stability
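+     # Note: concurrency_count and enable_queue are Gradio 3.x arguments; on
+     # Gradio 4+ this would be demo.queue(default_concurrency_limit=2), with
+     # no enable_queue flag passed to launch().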
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,  # No share link needed in HF Spaces
+         debug=False,  # Disable debug in production
+         enable_queue=True,
+         show_error=True
+     )