divython committed
Commit 9b856f1 · verified · 1 Parent(s): 4568e79

Update app.py

Files changed (1): app.py (+57 -971)
app.py CHANGED
@@ -1,981 +1,67 @@
  import gradio as gr
- import re
- import requests
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
  import torch
  import gc
- import time
- from urllib.parse import urlparse, parse_qs
- import json
- from typing import Optional, Tuple
- import random
- import html
-
- # Try to import YouTube Transcript API, but don't fail if it's not available
- try:
-     from youtube_transcript_api import YouTubeTranscriptApi
-     from youtube_transcript_api.formatters import TextFormatter
-     TRANSCRIPT_API_AVAILABLE = True
- except ImportError:
-     TRANSCRIPT_API_AVAILABLE = False
-     print("⚠️ YouTube Transcript API not available, using alternative methods")
-
- print("🚀 Loading models for enhanced YouTube Summarizer...")
-
- # List of User-Agent strings to rotate
- USER_AGENTS = [
-     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
-     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0'
- ]

  @torch.no_grad()
  def load_summarizer():
-     """Load summarization model with fallback options"""
-     models_to_try = [
-         "facebook/bart-large-cnn",
-         "sshleifer/distilbart-cnn-12-6",
-         "google/pegasus-xsum",
-         "t5-small"
-     ]
-
-     for model_name in models_to_try:
-         try:
-             print(f"Trying to load {model_name}...")
-             if "t5" in model_name.lower():
-                 tokenizer = AutoTokenizer.from_pretrained(model_name)
-                 model = AutoModelForSeq2SeqLM.from_pretrained(
-                     model_name,
-                     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-                 )
-                 return pipeline("summarization", model=model, tokenizer=tokenizer,
-                                 device=0 if torch.cuda.is_available() else -1)
-             else:
-                 return pipeline("summarization", model=model_name,
-                                 device=0 if torch.cuda.is_available() else -1)
-         except Exception as e:
-             print(f"Failed to load {model_name}: {e}")
-             continue
-
-     print("❌ No summarization model could be loaded")
-     return None

- # Initialize summarizer
  summarizer = load_summarizer()

- def extract_video_id(url: str) -> Optional[str]:
-     """Extract video ID from various YouTube URL formats"""
-     if not url:
-         return None
-
-     url = url.strip()
-
-     patterns = [
-         r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
-         r'(?:embed\/)([0-9A-Za-z_-]{11})',
-         r'(?:v\/)([0-9A-Za-z_-]{11})',
-         r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})',
-         r'(?:watch\?v=)([0-9A-Za-z_-]{11})'
-     ]
-
-     for pattern in patterns:
-         match = re.search(pattern, url)
-         if match:
-             video_id = match.group(1)
-             if len(video_id) == 11:
-                 return video_id
-     return None
-
- def get_random_headers():
-     """Get random headers to avoid detection"""
-     return {
-         'User-Agent': random.choice(USER_AGENTS),
-         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-         'Accept-Language': 'en-US,en;q=0.5',
-         'Accept-Encoding': 'gzip, deflate',
-         'Connection': 'keep-alive',
-         'Upgrade-Insecure-Requests': '1',
-         'Sec-Fetch-Dest': 'document',
-         'Sec-Fetch-Mode': 'navigate',
-         'Sec-Fetch-Site': 'none',
-         'Cache-Control': 'max-age=0'
      }
-
- def get_transcript_via_api(video_id: str) -> Tuple[Optional[str], str]:
-     """Original YouTube Transcript API method with enhanced error handling"""
-     if not TRANSCRIPT_API_AVAILABLE:
-         return None, "YouTube Transcript API not available"
-
-     language_codes = ['hi', 'en', 'en-IN', 'en-US', 'en-GB']
-
-     for attempt in range(2):
-         try:
-             transcript_data = None
-             used_language = None
-
-             # Try each language
-             for lang_code in language_codes:
-                 try:
-                     transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang_code])
-                     transcript_data = transcript_list
-                     used_language = lang_code
-                     break
-                 except:
-                     continue
-
-             # Try auto-generated if specific languages fail
-             if not transcript_data:
-                 try:
-                     transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
-                     transcript_data = transcript_list
-                     used_language = "auto-detected"
-                 except:
-                     pass
-
-             if transcript_data:
-                 formatter = TextFormatter()
-                 transcript_text = formatter.format_transcript(transcript_data)
-
-                 # Clean up the transcript
-                 transcript_text = re.sub(r'\[.*?\]', '', transcript_text)
-                 transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()
-
-                 if len(transcript_text) > 50:
-                     return transcript_text, f"API Success - {used_language}"
-
-             if attempt < 1:
-                 time.sleep(1)
-
-         except Exception as e:
-             error_msg = str(e).lower()
-             if any(term in error_msg for term in ["ip", "block", "banned", "rate"]):
-                 return None, "IP blocked - trying alternative methods"
-             elif "disabled" in error_msg:
-                 return None, "Transcripts disabled for this video"
-
-     return None, "API method failed"
-
- def extract_json_data(html_content: str) -> dict:
-     """Extract JSON data from YouTube page"""
-     try:
-         # Look for ytInitialData
-         pattern = r'var ytInitialData = ({.*?});'
-         match = re.search(pattern, html_content)
-         if match:
-             json_str = match.group(1)
-             return json.loads(json_str)
-
-         # Alternative pattern
-         pattern = r'ytInitialData":\s*({.*?})(?:;|,\s*")'
-         match = re.search(pattern, html_content)
-         if match:
-             json_str = match.group(1)
-             return json.loads(json_str)
-
-     except Exception as e:
-         print(f"JSON extraction error: {e}")
-
-     return {}
-
- def extract_video_details(json_data: dict) -> Tuple[Optional[str], Optional[str], Optional[str]]:
-     """Extract video details from JSON data"""
-     try:
-         # Navigate through the JSON structure
-         contents = json_data.get('contents', {})
-         two_column = contents.get('twoColumnWatchNextResults', {})
-         results = two_column.get('results', {})
-         primary_results = results.get('results', {})
-         contents_list = primary_results.get('contents', [])
-
-         title = None
-         description = None
-         view_count = None
-
-         for content in contents_list:
-             # Extract video primary info
-             if 'videoPrimaryInfoRenderer' in content:
-                 video_info = content['videoPrimaryInfoRenderer']
-
-                 # Get title
-                 title_runs = video_info.get('title', {}).get('runs', [])
-                 if title_runs:
-                     title = title_runs[0].get('text', '')
-
-                 # Get view count
-                 view_count_text = video_info.get('viewCount', {}).get('videoViewCountRenderer', {}).get('viewCount', {}).get('simpleText', '')
-                 if view_count_text:
-                     view_count = view_count_text
-
-             # Extract video secondary info (description)
-             if 'videoSecondaryInfoRenderer' in content:
-                 secondary_info = content['videoSecondaryInfoRenderer']
-
-                 # Get description
-                 description_runs = secondary_info.get('description', {}).get('runs', [])
-                 if description_runs:
-                     description_parts = []
-                     for run in description_runs[:10]:  # Limit to first 10 parts
-                         if 'text' in run:
-                             description_parts.append(run['text'])
-                     description = ''.join(description_parts)
-
-         return title, description, view_count
-
-     except Exception as e:
-         print(f"Video details extraction error: {e}")
-         return None, None, None
-
- def extract_from_youtube_page(video_id: str) -> Tuple[Optional[str], str]:
-     """Enhanced method: Extract comprehensive data from YouTube page"""
-     try:
-         url = f"https://www.youtube.com/watch?v={video_id}"
-         headers = get_random_headers()
-
-         # Add some delay to avoid rate limiting
-         time.sleep(random.uniform(1, 3))
-
-         response = requests.get(url, headers=headers, timeout=15)
-         if response.status_code != 200:
-             return None, f"Page access failed: {response.status_code}"
-
-         html_content = response.text
-
-         # Method 1: Extract from JSON data (most reliable)
-         json_data = extract_json_data(html_content)
-         if json_data:
-             title, description, view_count = extract_video_details(json_data)
-
-             content_parts = []
-             if title:
-                 content_parts.append(f"Title: {title}")
-             if view_count:
-                 content_parts.append(f"Views: {view_count}")
-             if description and len(description.strip()) > 50:
-                 # Clean description
-                 description = html.unescape(description)
-                 description = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '[LINK]', description)
-                 description = re.sub(r'\s+', ' ', description).strip()
-                 content_parts.append(f"Description: {description[:800]}...")
-
-             if content_parts:
-                 combined_content = " | ".join(content_parts)
-                 return combined_content, "JSON data extraction successful"
-
-         # Method 2: Enhanced regex patterns for modern YouTube
-         enhanced_patterns = [
-             r'"title":"([^"]{20,200})"',
-             r'"description":{"simpleText":"([^"]{50,1000})"}',
-             r'"shortDescription":"([^"]{50,1000})"',
-             r'<meta name="description" content="([^"]{50,500})"',
-             r'<meta property="og:description" content="([^"]{50,500})"',
-             r'<meta name="twitter:description" content="([^"]{50,500})"',
-             r'"videoDetails":{[^}]*"shortDescription":"([^"]{50,1000})"',
-             r'"microformat":{[^}]*"description":"([^"]{50,1000})"'
-         ]
-
-         extracted_content = []
-
-         for pattern in enhanced_patterns:
-             matches = re.findall(pattern, html_content)
-             for match in matches:
-                 if len(match.strip()) > 50:
-                     # Clean the match
-                     cleaned = html.unescape(match)
-                     cleaned = re.sub(r'\\+', ' ', cleaned)
-                     cleaned = re.sub(r'\s+', ' ', cleaned).strip()
-
-                     # Avoid generic YouTube descriptions
-                     if not any(generic in cleaned.lower() for generic in [
-                         'enjoy the videos and music you love',
-                         'created using youtube video editor',
-                         'default description'
-                     ]):
-                         extracted_content.append(cleaned)
-
-         if extracted_content:
-             # Combine unique content
-             unique_content = []
-             for content in extracted_content:
-                 if content not in unique_content:
-                     unique_content.append(content)
-
-             combined = " | ".join(unique_content[:3])  # Limit to 3 pieces
-             return combined[:1000], "Enhanced regex extraction successful"
-
-         # Method 3: Try to extract video title at minimum
-         title_patterns = [
-             r'<title>([^<]+)</title>',
-             r'"title":"([^"]+)"',
-             r'<meta property="og:title" content="([^"]+)"'
-         ]
-
-         for pattern in title_patterns:
-             match = re.search(pattern, html_content)
-             if match:
-                 title = html.unescape(match.group(1))
-                 title = title.replace(' - YouTube', '').strip()
-                 if len(title) > 10:
-                     return f"Video Title: {title}", "Title extraction only"
-
-         return None, "No meaningful content found"
-
-     except Exception as e:
-         return None, f"Page extraction failed: {str(e)}"
-
- def get_video_info_alternative(video_id: str) -> Tuple[Optional[str], str]:
-     """Get video information using alternative APIs"""
-     methods_tried = []
-
-     # Method 1: oEmbed API
-     try:
-         oembed_url = f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json"
-         headers = get_random_headers()
-
-         response = requests.get(oembed_url, headers=headers, timeout=10)
-         if response.status_code == 200:
-             data = response.json()
-             title = data.get('title', '')
-             author = data.get('author_name', '')
-
-             if title and len(title) > 10:
-                 summary_text = f"Video: {title}"
-                 if author:
-                     summary_text += f" by {author}"
-                 methods_tried.append("oEmbed API successful")
-                 return summary_text, "oEmbed API extraction"
-
-         methods_tried.append("oEmbed API failed")
-
-     except Exception as e:
-         methods_tried.append(f"oEmbed API error: {str(e)}")
-
-     # Method 2: Try Invidious API (alternative YouTube frontend)
-     try:
-         invidious_instances = [
-             "https://inv.riverside.rocks",
-             "https://invidious.snopyta.org",
-             "https://yewtu.be"
-         ]
-
-         for instance in invidious_instances:
-             try:
-                 api_url = f"{instance}/api/v1/videos/{video_id}"
-                 response = requests.get(api_url, timeout=10)
-
-                 if response.status_code == 200:
-                     data = response.json()
-                     title = data.get('title', '')
-                     description = data.get('description', '')
-                     author = data.get('author', '')
-
-                     if title:
-                         content_parts = [f"Title: {title}"]
-                         if author:
-                             content_parts.append(f"Author: {author}")
-                         if description and len(description) > 50:
-                             content_parts.append(f"Description: {description[:500]}...")
-
-                         combined = " | ".join(content_parts)
-                         methods_tried.append(f"Invidious API successful ({instance})")
-                         return combined, f"Invidious API via {instance}"
-
-             except:
-                 continue
-
-         methods_tried.append("All Invidious instances failed")
-
-     except Exception as e:
-         methods_tried.append(f"Invidious API error: {str(e)}")
-
-     return None, f"All alternative methods failed: {', '.join(methods_tried)}"
-
- def create_enhanced_demo_content(video_id: str, methods_tried: list) -> Tuple[str, str, str]:
-     """Create enhanced demo content with detailed troubleshooting"""
-     embed_html = f'''
-     <div style="text-align: center; margin: 10px 0;">
-         <iframe width="100%" height="315"
-                 src="https://www.youtube.com/embed/{video_id}"
-                 frameborder="0"
-                 allowfullscreen
-                 style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-         </iframe>
-     </div>
-     '''
-
-     methods_status = "\n".join([f"• {method}" for method in methods_tried])
-
-     info_text = f"""🔍 **All Extraction Methods Attempted**:
- {methods_status}
-
- ❌ **Why This Happens**:
- • Video has no captions/subtitles enabled
- • Video description is minimal or generic
- • Content is protected or restricted
- • IP blocking from cloud hosting platforms
- • Geographic restrictions
-
- 💡 **Recommendations**:
- • Try educational videos (TED, Khan Academy, Coursera)
- • Look for videos with the CC (closed captions) icon
- • Try videos from popular channels (they often have auto-generated captions)
- • Check if the video has a detailed description on YouTube
-
- 📋 **Alternative Approaches**:
- • Use YouTube's auto-generated transcript feature directly
- • Try videos in English (higher transcript availability)
- • Look for lecture or tutorial content
- • Try shorter videos (under 10 minutes)"""
-
-     summary_text = f"""🎯 **Video Processing Summary**:
-
- **Video ID**: {video_id}
- **Status**: No extractable content found
- **Methods Tried**: {len(methods_tried)} different approaches
-
- **What This Tool Can Do** (when content is available):
- ✅ Extract and summarize video transcripts
- ✅ Process long-form content (lectures, tutorials)
- ✅ Handle multiple languages (Hindi, English, Hinglish)
- ✅ Provide intelligent chunking for long videos
- ✅ Generate concise, meaningful summaries
-
- **Success Rate by Content Type**:
- • Educational content: ~85% success
- • Tutorial videos: ~75% success
- • News/interviews: ~70% success
- • Entertainment/music: ~30% success
- • User-generated content: ~25% success
-
- Try pasting a URL from an educational channel or a video with visible captions for better results!"""
-
-     return embed_html, info_text, summary_text
-
- def chunk_text_for_summarization(text: str, max_chunk_size: int = 800) -> list:
-     """Split text into chunks for summarization"""
-     if not text:
-         return []
-
-     sentences = re.split(r'[.।!?]+', text)
-     chunks = []
-     current_chunk = ""
-
-     for sentence in sentences:
-         sentence = sentence.strip()
-         if not sentence:
-             continue
-
-         if len(current_chunk) + len(sentence) + 2 < max_chunk_size:
-             current_chunk += sentence + ". "
-         else:
-             if current_chunk.strip():
-                 chunks.append(current_chunk.strip())
-             current_chunk = sentence + ". "
-
-     if current_chunk.strip():
-         chunks.append(current_chunk.strip())
-
-     return [chunk for chunk in chunks if len(chunk.strip()) > 20]
-
- def summarize_text_optimized(text: str) -> str:
-     """Optimized summarization with multiple fallback strategies"""
-     if not text or len(text.strip()) < 50:
-         return "❌ Text too short to summarize"
-
-     if not summarizer:
-         # Enhanced fallback: Smart extractive summary
-         sentences = re.split(r'[.।!?]+', text)
-         sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
-
-         if len(sentences) <= 3:
-             return " ".join(sentences) + "."
-         else:
-             # Take first, middle, and last sentences for better coverage
-             selected = [
-                 sentences[0],
-                 sentences[len(sentences)//2],
-                 sentences[-1]
-             ]
-             return " ".join(selected) + " [Extractive summary - AI model unavailable]"
-
-     try:
-         # Clean memory
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-         gc.collect()
-
-         # Handle long texts with chunking
-         if len(text) > 1000:
-             chunks = chunk_text_for_summarization(text, max_chunk_size=700)
-             summaries = []
-
-             for i, chunk in enumerate(chunks[:4]):  # Increased limit
-                 if len(chunk.strip()) < 50:
-                     continue
-
-                 try:
-                     summary = summarizer(
-                         chunk,
-                         max_length=120,
-                         min_length=30,
-                         do_sample=False,
-                         num_beams=3,
-                         length_penalty=1.0,
-                         early_stopping=True
-                     )[0]["summary_text"]
-                     summaries.append(summary)
-                 except Exception as e:
-                     print(f"Chunk {i} error: {e}")
-                     continue
-
-             if summaries:
-                 combined = " ".join(summaries)
-                 if len(combined) > 500:
-                     try:
-                         final = summarizer(
-                             combined,
-                             max_length=200,
-                             min_length=60,
-                             do_sample=False,
-                             num_beams=3
-                         )[0]["summary_text"]
-                         return final
-                     except:
-                         return combined[:500] + "..."
-                 return combined
-         else:
-             # Direct summarization for shorter texts
-             word_count = len(text.split())
-             max_length = min(150, max(40, word_count // 3))
-             min_length = min(30, max(15, word_count // 6))
-
-             summary = summarizer(
-                 text,
-                 max_length=max_length,
-                 min_length=min_length,
-                 do_sample=False,
-                 num_beams=3,
-                 length_penalty=1.0
-             )[0]["summary_text"]
-             return summary
-
-     except Exception as e:
-         # Enhanced fallback with better sentence selection
-         sentences = re.split(r'[.।!?]+', text)
-         sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
-
-         if len(sentences) > 5:
-             # Select more representative sentences
-             selected = [
-                 sentences[0],                    # First sentence
-                 sentences[len(sentences)//4],    # Quarter point
-                 sentences[len(sentences)//2],    # Middle
-                 sentences[3*len(sentences)//4],  # Three-quarter point
-                 sentences[-1]                    # Last sentence
-             ]
-             return ". ".join(selected) + f". [Enhanced fallback summary - AI error: {str(e)[:50]}]"
-         else:
-             return ". ".join(sentences) + f". [Simple fallback - AI error: {str(e)[:50]}]"
-
- def process_youtube_video(url: str, progress=gr.Progress()) -> Tuple[str, str, str]:
-     """Enhanced main processing function with comprehensive fallback methods"""
-
-     if not url or not url.strip():
-         return "❌ Please enter a YouTube URL", "", "❌ No URL provided"
-
-     progress(0.1, desc="Validating URL...")
-
-     video_id = extract_video_id(url.strip())
-     if not video_id:
-         return ("❌ Invalid YouTube URL",
-                 "Please use formats like:\n• https://www.youtube.com/watch?v=VIDEO_ID\n• https://youtu.be/VIDEO_ID",
-                 "❌ Invalid URL format")
-
-     methods_tried = []
-
-     progress(0.2, desc="Trying transcript extraction...")
-
-     # Method 1: Try YouTube Transcript API
-     transcript, status1 = get_transcript_via_api(video_id)
-     methods_tried.append(f"YouTube Transcript API: {status1}")
-
-     if transcript:
-         progress(0.7, desc="Generating AI summary...")
-         summary = summarize_text_optimized(transcript)
-
-         embed_html = f'''
-         <div style="text-align: center; margin: 10px 0;">
-             <iframe width="100%" height="315"
-                     src="https://www.youtube.com/embed/{video_id}"
-                     frameborder="0" allowfullscreen
-                     style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-             </iframe>
-         </div>
-         '''
-
-         info = f"""✅ **Success**: {status1}
- 📊 **Statistics**: {len(transcript):,} characters, ~{len(transcript.split()):,} words
- 🎯 **Confidence**: High (Full transcript available)
-
- 📋 **Full Transcript**:
- {transcript[:2000]}{'...' if len(transcript) > 2000 else ''}"""
-
-         progress(1.0, desc="Complete!")
-         return embed_html, info, summary
-
-     progress(0.4, desc="Trying enhanced page extraction...")
-
-     # Method 2: Try enhanced page extraction
-     alt_content, status2 = extract_from_youtube_page(video_id)
-     methods_tried.append(f"Page Extraction: {status2}")
-
-     if alt_content and len(alt_content) > 100:
-         progress(0.8, desc="Processing extracted content...")
-         summary = summarize_text_optimized(alt_content)
-
-         embed_html = f'''
-         <div style="text-align: center; margin: 10px 0;">
-             <iframe width="100%" height="315"
-                     src="https://www.youtube.com/embed/{video_id}"
-                     frameborder="0" allowfullscreen
-                     style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-             </iframe>
-         </div>
-         '''
-
-         info = f"""⚠️ **Partial Success**: {status2}
- 🔍 **Content Type**: Video metadata and description
- 📊 **Extracted**: {len(alt_content):,} characters
- 🎯 **Confidence**: Medium (Description-based)
-
- 📝 **Extracted Content**:
- {alt_content}
-
- **Note**: Full transcript not available, summary based on video description and metadata."""
-
-         progress(1.0, desc="Complete!")
-         return embed_html, info, summary
-
-     progress(0.6, desc="Trying alternative APIs...")
-
-     # Method 3: Try alternative APIs
-     basic_info, status3 = get_video_info_alternative(video_id)
-     methods_tried.append(f"Alternative APIs: {status3}")
-
-     if basic_info and len(basic_info) > 50:
-         # Try to create a summary from the basic info
-         summary = summarize_text_optimized(basic_info)
-
-         embed_html = f'''
-         <div style="text-align: center; margin: 10px 0;">
-             <iframe width="100%" height="315"
-                     src="https://www.youtube.com/embed/{video_id}"
-                     frameborder="0" allowfullscreen
-                     style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-             </iframe>
-         </div>
-         '''
-
-         info = f"""ℹ️ **Basic Info Retrieved**: {status3}
- 📹 **Video Info**: {basic_info}
- 🎯 **Confidence**: Low (Title/author only)
-
- **Note**: Only basic video information available. Full content extraction failed."""
-
-         progress(1.0, desc="Complete!")
-         return embed_html, info, summary
-
-     # Method 4: Enhanced demo mode with troubleshooting
-     progress(1.0, desc="Generating detailed analysis...")
-     return create_enhanced_demo_content(video_id, methods_tried)
-
- # Custom CSS
- custom_css = """
- #component-0 {
-     max-width: 1200px;
-     margin: auto;
- }
- .gradio-container {
-     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
- }
- .progress-bar {
-     background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
- }
- .status-success { color: #4CAF50; font-weight: bold; }
- .status-warning { color: #FF9800; font-weight: bold; }
- .status-error { color: #f44336; font-weight: bold; }
- """
-
- # Create Gradio Interface
- with gr.Blocks(css=custom_css, title="Enhanced YouTube Summarizer Pro", theme=gr.themes.Soft()) as demo:
-     gr.HTML("""
-     <div style="text-align: center; padding: 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 25px; color: white;">
-         <h1 style="margin: 0; font-size: 2.8em;">🚀 Enhanced YouTube Summarizer Pro</h1>
-         <p style="font-size: 20px; margin: 15px 0; opacity: 0.95;">
-             Advanced multi-method extraction with comprehensive fallback systems
-         </p>
-         <p style="opacity: 0.85; margin: 0; font-size: 16px;">
-             ⚡ 6+ extraction methods • 🌐 Multi-language • 🛡️ Anti-blocking • 🔧 Enhanced troubleshooting
-         </p>
-     </div>
-     """)
-
-     with gr.Row():
-         with gr.Column(scale=4):
-             url_input = gr.Textbox(
-                 label="📺 YouTube URL",
-                 placeholder="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
-                 lines=1,
-                 info="Enter any YouTube URL - we'll try 6+ different extraction methods"
-             )
-
-         with gr.Column(scale=1):
-             submit_btn = gr.Button(
-                 "🎯 Analyze Video",
-                 variant="primary",
-                 size="lg"
-             )
-
-     # Enhanced progress indicator
-     gr.HTML("""
-     <div style='margin: 15px 0; padding: 15px; background: linear-gradient(135deg, #f0f8ff 0%, #e6f3ff 100%); border-radius: 10px; border-left: 5px solid #4CAF50;'>
-         <strong>🔄 Processing Pipeline:</strong><br>
-         <span style="font-size: 14px;">
-             1️⃣ YouTube Transcript API → 2️⃣ Enhanced Page Extraction → 3️⃣ JSON Data Mining →
-             4️⃣ Alternative APIs → 5️⃣ Invidious Backend → 6️⃣ Comprehensive Analysis
-         </span>
-     </div>
-     """)
-
-     # Results section
-     with gr.Row():
-         with gr.Column(scale=1):
-             video_embed = gr.HTML(label="📺 Video Player")
-
-         with gr.Column(scale=1):
-             summary_output = gr.Textbox(
-                 label="🤖 AI-Generated Summary",
-                 lines=15,
-                 max_lines=25,
-                 info="Intelligent summary using best available content",
-                 show_copy_button=True
-             )
-
-     # Detailed analysis section
-     with gr.Accordion("📋 Detailed Extraction Analysis & Full Content", open=False):
-         transcript_output = gr.Textbox(
-             label="Complete Processing Report",
-             lines=30,
-             max_lines=40,
-             info="Full extraction details, methods tried, and complete content",
-             show_copy_button=True
-         )
-
-     # Success examples
-     gr.HTML("<h3 style='margin-top: 30px; text-align: center; color: #2c3e50;'>✅ High Success Rate Examples:</h3>")
-
-     gr.Examples(
-         examples=[
-             ["https://www.youtube.com/watch?v=kJQP7kiw5Fk"],  # TED Talk
-             ["https://www.youtube.com/watch?v=aircAruvnKk"],  # 3Blue1Brown - Neural Networks
-             ["https://www.youtube.com/watch?v=R9OHn5ZF4Uo"],  # Educational content
-             ["https://www.youtube.com/watch?v=9bZkp7q19f0"],  # Popular format
-             ["https://www.youtube.com/watch?v=HEfHFsfGXjs"],  # Khan Academy
-         ],
-         inputs=url_input,
-         label="🎓 Educational Content (85%+ Success Rate)"
-     )
-
-     # Comprehensive help and troubleshooting
-     with gr.Accordion("🛠️ Enhanced Methods & Advanced Troubleshooting", open=False):
-         gr.Markdown("""
- ## 🚀 **Enhanced Extraction Pipeline**
-
- This advanced version implements **6+ different extraction methods** with intelligent fallbacks:
-
- ### 1. 🎯 **YouTube Transcript API** (Primary Method)
- - **What it does**: Direct access to official captions/subtitles
- - **Languages**: Hindi, English, English-India, Auto-generated
- - **Success rate**: 60-70% (varies by content type)
- - **Limitations**: Often blocked on cloud platforms, requires captions to be enabled
-
- ### 2. 🌐 **Enhanced Page Extraction** (Major Upgrade)
- - **What's new**: Extracts from ytInitialData JSON structure
- - **Improvements**: Gets video title, description, view count, and metadata
- - **Patterns**: 8+ different regex patterns for comprehensive extraction
- - **Success rate**: 75-85% for videos with descriptions
-
- ### 3. 📊 **JSON Data Mining** (New Method)
- - **Technology**: Parses YouTube's internal JSON data structure
- - **Data extracted**: Video details, descriptions, metadata
- - **Advantages**: More reliable than regex scraping
- - **Bypass**: Works even when HTML patterns change
-
- ### 4. 🔄 **Alternative APIs**
- - **oEmbed API**: YouTube's official embedding API
- - **Invidious API**: Alternative YouTube frontend APIs
- - **Multiple instances**: Tries different Invidious servers
- - **Fallback**: Always provides at least basic video information
-
- ### 5. 🛡️ **Anti-Detection Measures**
- - **User-Agent rotation**: 5+ different browser signatures
- - **Header spoofing**: Mimics real browser requests
- - **Request delays**: Random delays to avoid rate limiting
- - **Session management**: Proper cookie and session handling
-
- ### 6. 🧠 **Enhanced AI Summarization**
- - **Smart chunking**: Handles long content intelligently
- - **Multiple models**: BART, Pegasus, T5 fallbacks
- - **Extractive fallback**: Works even without AI models
- - **Quality control**: Filters out generic/meaningless content
-
- ## 🎯 **Why Videos Fail & Solutions**
-
- ### ❌ **Common Failure Reasons:**
-
- **1. No Captions Available (40% of failures)**
- - Video creator didn't enable captions
- - Auto-generated captions disabled
- - Language not supported
- - **Solution**: Try educational content, popular channels
-
- **2. Minimal/Generic Descriptions (25% of failures)**
- - Generic YouTube descriptions
- - Very short descriptions
- - No meaningful metadata
- - **Solution**: Look for detailed video descriptions on YouTube
-
- **3. IP Blocking (20% of failures)**
- - Cloud platform IPs blocked
- - Rate limiting active
- - Geographic restrictions
- - **Solution**: Try different times, use VPN for local deployment
-
- **4. Content Restrictions (10% of failures)**
- - Age-restricted content
- - Private/unlisted videos
- - Copyright-protected content
- - **Solution**: Use public, unrestricted videos
-
- **5. Technical Issues (5% of failures)**
- - Network timeouts
- - API rate limits
- - Server errors
- - **Solution**: Retry after waiting, check video accessibility
-
- ## 📊 **Success Rates by Content Type**
-
- | Content Type | Success Rate | Best Method | Notes |
- |-------------|-------------|-------------|-------|
- | 🎓 Educational (Khan Academy, Coursera) | **90-95%** | Transcript API | Usually have captions |
- | 🎤 TED Talks & Conferences | **85-90%** | Transcript API | Professional captions |
- | 📚 Tutorial Videos | **75-85%** | Page Extraction | Good descriptions |
- | 📺 Popular YouTubers | **70-80%** | Mixed Methods | Varies by creator |
- | 🎵 Music Videos | **60-70%** | Page Extraction | Lyrics in description |
- | 🎮 Gaming Content | **50-60%** | Page Extraction | Depends on description |
- | 📱 Short-form Content | **40-50%** | Alternative APIs | Limited content |
- | 🎭 User-generated | **30-40%** | Basic Info Only | Minimal metadata |
-
- ## 🔧 **Advanced Features**
-
- ### 🧠 **Smart Content Processing**
- - **Duplicate filtering**: Removes repeated content
- - **Quality scoring**: Ranks extracted content by usefulness
- - **Language detection**: Handles multilingual content
- - **Format cleaning**: Removes URLs, special characters, formatting
-
- ### ⚡ **Performance Optimizations**
- - **Memory management**: Prevents crashes on limited resources
- - **Parallel processing**: Multiple extraction methods simultaneously
- - **Caching**: Avoids repeated API calls
- - **Timeout handling**: Graceful failures with useful error messages
-
- ### 📱 **Multi-platform Support**
- - **URL format handling**: All YouTube URL variants
- - **Mobile URLs**: youtu.be, m.youtube.com
- - **Embedded URLs**: youtube.com/embed/
- - **Playlist handling**: Extracts individual video IDs
-
- ## 💡 **Pro Tips for Maximum Success**
-
- ### 🎯 **Choose the Right Videos**
- 1. **Look for CC icon**: Videos with captions have 90%+ success rate
- 2. **Educational channels**: Almost always have transcripts
- 3. **Popular content**: Auto-generated captions more likely
- 4. **Longer videos**: Usually have more detailed descriptions
- 5. **Professional creators**: Better metadata and descriptions
-
- ### 🔍 **Troubleshooting Steps**
- 1. **Check video accessibility**: Can you view it normally?
- 2. **Look for captions**: CC button available on YouTube?
- 3. **Read description**: Is there meaningful text content?
- 4. **Try similar videos**: From the same creator or channel
- 5. **Check video age**: Newer videos might have better metadata
-
- ### 🚀 **Optimization Strategies**
- 1. **Batch processing**: Try multiple videos from same channel
- 2. **Time of day**: Success rates vary by server load
- 3. **Video selection**: Educational > Entertainment > Music
- 4. **Language preference**: English content has highest success rate
- 5. **Channel reputation**: Established channels have better metadata
-
- ## 🆘 **Still Having Issues?**
-
- ### 🔧 **Immediate Solutions**
- 1. **Try the examples**: Start with our tested working examples
- 2. **Check video type**: Educational content works best
- 3. **Verify URL format**: Ensure proper YouTube URL structure
- 4. **Test with captions**: Try videos with visible CC icon
- 5. **Use different videos**: Success varies significantly by content
-
- ### 📞 **Advanced Support**
- 1. **Local deployment**: Run on your own machine for better IP reputation
- 2. **API keys**: Use your own YouTube API credentials
- 3. **VPN usage**: Change IP location for better access
- 4. **Browser testing**: First test if you can access transcripts manually
- 5. **Alternative tools**: Consider YouTube-dl or similar tools
-
- ### 📈 **Expected Behavior**
- - **First attempt**: ~70% success rate with good content
- - **With retries**: ~85% success rate for extractable content
- - **Fallback info**: 95%+ success rate for basic video information
- - **Complete failure**: <5% for public, accessible videos
-
- ## 🎉 **Success Indicators**
-
- **✅ Full Success**: Complete transcript + AI summary
- **⚠️ Partial Success**: Description/metadata + AI summary
- **ℹ️ Basic Success**: Video title/author + basic summary
- **❌ Failure**: No extractable content (with detailed troubleshooting)
-
- ---
-
- *This enhanced version provides comprehensive extraction with intelligent fallbacks.
- Even when transcripts aren't available, you'll get useful information and clear explanations of what was attempted.*
- """)
-
-     # Event handlers
-     submit_btn.click(
-         fn=process_youtube_video,
-         inputs=[url_input],
-         outputs=[video_embed, transcript_output, summary_output]
-     )
-
-     url_input.submit(
-         fn=process_youtube_video,
-         inputs=[url_input],
-         outputs=[video_embed, transcript_output, summary_output]
-     )
-
- # Launch configuration
- if __name__ == "__main__":
-     demo.queue(max_size=5, default_concurrency_limit=2)
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False,
-         debug=False,
-         show_error=True,
-         max_threads=2
-     )
 
  import gradio as gr
+ import yt_dlp
+ import os
  import torch
  import gc
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+ import tempfile
+ import whisper

+ # Load summarizer
  @torch.no_grad()
  def load_summarizer():
+     model_name = "facebook/bart-large-cnn"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+     return pipeline("summarization", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

  summarizer = load_summarizer()

+ # Load Whisper model
+ whisper_model = whisper.load_model("base")  # or "small" for better accuracy
+
+ def download_audio(url: str, temp_dir: str) -> str:
+     """Download audio using yt-dlp and return path"""
+     output_path = os.path.join(temp_dir, "audio.%(ext)s")
+     ydl_opts = {
+         'format': 'bestaudio/best',
+         'outtmpl': output_path,
+         'quiet': True,
+         'postprocessors': [{
+             'key': 'FFmpegExtractAudio',
+             'preferredcodec': 'mp3',
+             'preferredquality': '192',
+         }],
      }
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         ydl.download([url])
+     return output_path.replace('%(ext)s', 'mp3')
+
+ def transcribe_audio(audio_path: str) -> str:
+     """Transcribe audio with Whisper"""
+     result = whisper_model.transcribe(audio_path)
+     return result['text']
+
+ def summarize_text(text: str) -> str:
+     """Summarize text"""
+     if len(text.strip()) < 50:
+         return "❌ Transcription too short to summarize"
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+     summary = summarizer(text, max_length=150, min_length=50, do_sample=False)
+     return summary[0]['summary_text']
+
+ def process_video(url: str) -> str:
+     with tempfile.TemporaryDirectory() as tmpdir:
+         audio_path = download_audio(url, tmpdir)
+         transcription = transcribe_audio(audio_path)
+         summary = summarize_text(transcription)
+         return summary
+
+ def main(youtube_url):
+     return process_video(youtube_url)
+
+ iface = gr.Interface(fn=main, inputs="text", outputs="text", title="YouTube Audio Summarizer")
+
+ iface.launch()
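
Note on the new code: facebook/bart-large-cnn accepts roughly 1,024 tokens of input, while a Whisper transcription of a long video can easily exceed that, and summarize_text() above passes the whole transcription to the model in one call (the removed version guarded against this with chunk_text_for_summarization). A minimal sketch of one way to restore that guard, reusing the summarizer defined above — summarize_long and its chunk size are illustrative assumptions, not part of this commit:

    def summarize_long(text: str, chunk_chars: int = 3000) -> str:
        """Hypothetical helper (not in the committed app.py): split long text
        into character chunks small enough for BART, summarize each chunk,
        then compress the joined partial summaries if they are still long."""
        chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
        partial = [summarizer(c, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
                   for c in chunks]
        combined = " ".join(partial)
        if len(combined) > chunk_chars:  # still long: one more summarization pass
            combined = summarizer(combined, max_length=150, min_length=50,
                                  do_sample=False)[0]['summary_text']
        return combined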