Spaces:
Sleeping
Sleeping
import gradio as gr | |
import re | |
import requests | |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM | |
import torch | |
import gc | |
import time | |
from urllib.parse import urlparse, parse_qs | |
import json | |
from typing import Optional, Tuple | |
import random | |
# Try to import YouTube Transcript API, but don't fail if it's not available | |
try: | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from youtube_transcript_api.formatters import TextFormatter | |
TRANSCRIPT_API_AVAILABLE = True | |
except ImportError: | |
TRANSCRIPT_API_AVAILABLE = False | |
print("β οΈ YouTube Transcript API not available, using alternative methods") | |
print("π Loading models for enhanced YouTube Summarizer...") | |
# List of User-Agent strings to rotate | |
USER_AGENTS = [ | |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', | |
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', | |
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', | |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0', | |
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0' | |
] | |
def load_summarizer(): | |
"""Load summarization model with fallback options""" | |
models_to_try = [ | |
"facebook/bart-large-cnn", | |
"sshleifer/distilbart-cnn-12-6", | |
"google/pegasus-xsum", | |
"t5-small" | |
] | |
for model_name in models_to_try: | |
try: | |
print(f"Trying to load {model_name}...") | |
if "t5" in model_name.lower(): | |
# T5 models need different handling | |
tokenizer = AutoTokenizer.from_pretrained(model_name) | |
model = AutoModelForSeq2SeqLM.from_pretrained( | |
model_name, | |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32 | |
) | |
return pipeline("summarization", model=model, tokenizer=tokenizer, | |
device=0 if torch.cuda.is_available() else -1) | |
else: | |
return pipeline("summarization", model=model_name, | |
device=0 if torch.cuda.is_available() else -1) | |
except Exception as e: | |
print(f"Failed to load {model_name}: {e}") | |
continue | |
print("β No summarization model could be loaded") | |
return None | |
# Initialize summarizer | |
summarizer = load_summarizer() | |
def extract_video_id(url: str) -> Optional[str]: | |
"""Extract video ID from various YouTube URL formats""" | |
if not url: | |
return None | |
url = url.strip() | |
patterns = [ | |
r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', | |
r'(?:embed\/)([0-9A-Za-z_-]{11})', | |
r'(?:v\/)([0-9A-Za-z_-]{11})', | |
r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})', | |
r'(?:watch\?v=)([0-9A-Za-z_-]{11})' | |
] | |
for pattern in patterns: | |
match = re.search(pattern, url) | |
if match: | |
video_id = match.group(1) | |
if len(video_id) == 11: | |
return video_id | |
return None | |
def get_random_headers(): | |
"""Get random headers to avoid detection""" | |
return { | |
'User-Agent': random.choice(USER_AGENTS), | |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', | |
'Accept-Language': 'en-US,en;q=0.5', | |
'Accept-Encoding': 'gzip, deflate', | |
'Connection': 'keep-alive', | |
'Upgrade-Insecure-Requests': '1', | |
} | |
def get_transcript_via_api(video_id: str) -> Tuple[Optional[str], str]: | |
"""Original YouTube Transcript API method with enhanced error handling""" | |
if not TRANSCRIPT_API_AVAILABLE: | |
return None, "YouTube Transcript API not available" | |
language_codes = ['hi', 'en', 'en-IN', 'en-US', 'en-GB'] | |
for attempt in range(2): # Reduced attempts for faster fallback | |
try: | |
transcript_data = None | |
used_language = None | |
# Try each language | |
for lang_code in language_codes: | |
try: | |
transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang_code]) | |
transcript_data = transcript_list | |
used_language = lang_code | |
break | |
except: | |
continue | |
# Try auto-generated if specific languages fail | |
if not transcript_data: | |
try: | |
transcript_list = YouTubeTranscriptApi.get_transcript(video_id) | |
transcript_data = transcript_list | |
used_language = "auto-detected" | |
except: | |
pass | |
if transcript_data: | |
formatter = TextFormatter() | |
transcript_text = formatter.format_transcript(transcript_data) | |
# Clean up the transcript | |
transcript_text = re.sub(r'\[.*?\]', '', transcript_text) | |
transcript_text = re.sub(r'\s+', ' ', transcript_text).strip() | |
if len(transcript_text) > 50: | |
return transcript_text, f"API Success - {used_language}" | |
if attempt < 1: | |
time.sleep(1) | |
except Exception as e: | |
error_msg = str(e).lower() | |
if any(term in error_msg for term in ["ip", "block", "banned", "rate"]): | |
return None, "IP blocked - trying alternative methods" | |
elif "disabled" in error_msg: | |
return None, "Transcripts disabled for this video" | |
return None, "API method failed" | |
def extract_from_youtube_page(video_id: str) -> Tuple[Optional[str], str]: | |
"""Alternative method: Extract data from YouTube page HTML""" | |
try: | |
url = f"https://www.youtube.com/watch?v={video_id}" | |
headers = get_random_headers() | |
response = requests.get(url, headers=headers, timeout=10) | |
if response.status_code != 200: | |
return None, f"Page access failed: {response.status_code}" | |
html_content = response.text | |
# Look for video metadata in the page | |
patterns = [ | |
r'"videoDetails":\s*{[^}]*"shortDescription":"([^"]*)"', | |
r'"description":\s*{"simpleText":"([^"]*)"', | |
r'<meta name="description" content="([^"]*)"', | |
r'"content":"([^"]*?)","lengthText"' | |
] | |
for pattern in patterns: | |
match = re.search(pattern, html_content) | |
if match: | |
description = match.group(1) | |
# Clean up the description | |
description = description.replace('\\n', ' ').replace('\\', '') | |
description = re.sub(r'\s+', ' ', description).strip() | |
if len(description) > 100: # Ensure meaningful content | |
return description, "Extracted from video description" | |
return None, "No usable content found in page" | |
except Exception as e: | |
return None, f"Page extraction failed: {str(e)}" | |
def get_video_info_alternative(video_id: str) -> Tuple[Optional[str], str]: | |
"""Get video information using alternative methods""" | |
try: | |
# Try oEmbed API (usually works even when other methods fail) | |
oembed_url = f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json" | |
headers = get_random_headers() | |
response = requests.get(oembed_url, headers=headers, timeout=5) | |
if response.status_code == 200: | |
data = response.json() | |
title = data.get('title', '') | |
author = data.get('author_name', '') | |
if title: | |
# Create a basic summary from title and author | |
summary_text = f"Video: {title}" | |
if author: | |
summary_text += f" by {author}" | |
return summary_text, "Basic info from oEmbed API" | |
return None, "oEmbed API failed" | |
except Exception as e: | |
return None, f"Alternative info extraction failed: {str(e)}" | |
def create_demo_content(video_id: str) -> Tuple[str, str, str]: | |
"""Create demo content when transcript is not available""" | |
embed_html = f''' | |
<div style="text-align: center; margin: 10px 0;"> | |
<iframe width="100%" height="315" | |
src="https://www.youtube.com/embed/{video_id}" | |
frameborder="0" | |
allowfullscreen | |
style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);"> | |
</iframe> | |
</div> | |
''' | |
info_text = """βΉοΈ **Transcript Unavailable**: This video doesn't have accessible captions or transcripts. | |
π **What we tried**: | |
β’ YouTube Transcript API (multiple languages) | |
β’ Alternative data extraction methods | |
β’ Video metadata extraction | |
π‘ **Suggestions**: | |
β’ Try a video with captions/subtitles enabled | |
β’ Look for educational content (usually has better transcripts) | |
β’ Try popular channels (often have auto-generated captions) | |
π **Working Video Examples**: | |
β’ TED Talks | |
β’ Educational channels (Khan Academy, Crash Course) | |
β’ Tutorial videos | |
β’ News broadcasts""" | |
summary_text = """π― **Demo Mode**: Since transcript extraction failed, here's what this tool can do: | |
**AI Summarization Features**: | |
β’ Intelligent text chunking for long videos | |
β’ Multi-language support (Hindi, English, Hinglish) | |
β’ Key point extraction | |
β’ Automatic content optimization | |
**When transcripts are available, you'll get**: | |
β’ Comprehensive video summary | |
β’ Key topics and themes | |
β’ Main points and conclusions | |
β’ Time-efficient content overview | |
Try with a video that has captions enabled for full functionality!""" | |
return embed_html, info_text, summary_text | |
def chunk_text_for_summarization(text: str, max_chunk_size: int = 800) -> list: | |
"""Split text into chunks for summarization""" | |
if not text: | |
return [] | |
sentences = re.split(r'[.ΰ₯€!?]+', text) | |
chunks = [] | |
current_chunk = "" | |
for sentence in sentences: | |
sentence = sentence.strip() | |
if not sentence: | |
continue | |
if len(current_chunk) + len(sentence) + 2 < max_chunk_size: | |
current_chunk += sentence + ". " | |
else: | |
if current_chunk.strip(): | |
chunks.append(current_chunk.strip()) | |
current_chunk = sentence + ". " | |
if current_chunk.strip(): | |
chunks.append(current_chunk.strip()) | |
return [chunk for chunk in chunks if len(chunk.strip()) > 20] | |
def summarize_text_optimized(text: str) -> str: | |
"""Optimized summarization with multiple fallback strategies""" | |
if not text or len(text.strip()) < 50: | |
return "β Text too short to summarize" | |
if not summarizer: | |
# Fallback: Simple extractive summary | |
sentences = re.split(r'[.ΰ₯€!?]+', text) | |
sentences = [s.strip() for s in sentences if len(s.strip()) > 20] | |
if len(sentences) <= 3: | |
return " ".join(sentences[:2]) + "." | |
else: | |
# Take first, middle, and last sentences | |
selected = [sentences[0], sentences[len(sentences)//2], sentences[-1]] | |
return " ".join(selected) + " [Simple extractive summary - AI model unavailable]" | |
try: | |
# Clean memory | |
if torch.cuda.is_available(): | |
torch.cuda.empty_cache() | |
gc.collect() | |
# Handle long texts with chunking | |
if len(text) > 1000: | |
chunks = chunk_text_for_summarization(text, max_chunk_size=700) | |
summaries = [] | |
for i, chunk in enumerate(chunks[:3]): # Limit chunks | |
if len(chunk.strip()) < 50: | |
continue | |
try: | |
summary = summarizer( | |
chunk, | |
max_length=100, | |
min_length=20, | |
do_sample=False, | |
num_beams=2, | |
length_penalty=1.0, | |
early_stopping=True | |
)[0]["summary_text"] | |
summaries.append(summary) | |
except Exception as e: | |
print(f"Chunk {i} error: {e}") | |
continue | |
if summaries: | |
combined = " ".join(summaries) | |
if len(combined) > 400: | |
try: | |
final = summarizer( | |
combined, | |
max_length=150, | |
min_length=50, | |
do_sample=False, | |
num_beams=2 | |
)[0]["summary_text"] | |
return final | |
except: | |
return combined[:400] + "..." | |
return combined | |
else: | |
# Direct summarization for shorter texts | |
word_count = len(text.split()) | |
max_length = min(120, max(30, word_count // 3)) | |
min_length = min(25, max(10, word_count // 6)) | |
summary = summarizer( | |
text, | |
max_length=max_length, | |
min_length=min_length, | |
do_sample=False, | |
num_beams=2, | |
length_penalty=1.0 | |
)[0]["summary_text"] | |
return summary | |
except Exception as e: | |
# Final fallback: extractive summary | |
sentences = text.split('.')[:3] | |
return ". ".join(sentences) + f". [Fallback summary due to: {str(e)}]" | |
def process_youtube_video(url: str, progress=gr.Progress()) -> Tuple[str, str, str]: | |
"""Enhanced main processing function with multiple fallback methods""" | |
if not url or not url.strip(): | |
return "β Please enter a YouTube URL", "", "β No URL provided" | |
progress(0.1, desc="Validating URL...") | |
video_id = extract_video_id(url.strip()) | |
if not video_id: | |
return ("β Invalid YouTube URL", | |
"Please use formats like:\nβ’ https://www.youtube.com/watch?v=VIDEO_ID\nβ’ https://youtu.be/VIDEO_ID", | |
"β Invalid URL format") | |
progress(0.2, desc="Trying transcript extraction...") | |
# Method 1: Try YouTube Transcript API | |
transcript, status1 = get_transcript_via_api(video_id) | |
if transcript: | |
progress(0.7, desc="Generating summary...") | |
summary = summarize_text_optimized(transcript) | |
embed_html = f''' | |
<div style="text-align: center; margin: 10px 0;"> | |
<iframe width="100%" height="315" | |
src="https://www.youtube.com/embed/{video_id}" | |
frameborder="0" allowfullscreen | |
style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);"> | |
</iframe> | |
</div> | |
''' | |
info = f"""β **Success**: {status1} | |
π **Statistics**: {len(transcript):,} characters, ~{len(transcript.split()):,} words | |
π **Transcript**: | |
{transcript}""" | |
progress(1.0, desc="Complete!") | |
return embed_html, info, summary | |
progress(0.4, desc="Trying alternative methods...") | |
# Method 2: Try page extraction | |
alt_content, status2 = extract_from_youtube_page(video_id) | |
if alt_content: | |
progress(0.8, desc="Processing alternative content...") | |
summary = summarize_text_optimized(alt_content) | |
embed_html = f''' | |
<div style="text-align: center; margin: 10px 0;"> | |
<iframe width="100%" height="315" | |
src="https://www.youtube.com/embed/{video_id}" | |
frameborder="0" allowfullscreen | |
style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);"> | |
</iframe> | |
</div> | |
''' | |
info = f"""β οΈ **Limited Success**: {status2} | |
π **Method**: Alternative extraction | |
π **Content**: {alt_content} | |
**Note**: Full transcript not available, using alternative content.""" | |
progress(1.0, desc="Complete!") | |
return embed_html, info, summary | |
progress(0.6, desc="Trying basic video info...") | |
# Method 3: Try basic video info | |
basic_info, status3 = get_video_info_alternative(video_id) | |
if basic_info: | |
embed_html = f''' | |
<div style="text-align: center; margin: 10px 0;"> | |
<iframe width="100%" height="315" | |
src="https://www.youtube.com/embed/{video_id}" | |
frameborder="0" allowfullscreen | |
style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);"> | |
</iframe> | |
</div> | |
''' | |
info = f"""βΉοΈ **Basic Info Retrieved**: {status3} | |
πΉ **Video Info**: {basic_info} | |
**Note**: Transcript not available, showing basic video information.""" | |
summary = f"Video information: {basic_info}. Full transcript and detailed summary not available due to access restrictions." | |
progress(1.0, desc="Complete!") | |
return embed_html, info, summary | |
# Method 4: Demo mode | |
progress(1.0, desc="Showing demo mode...") | |
return create_demo_content(video_id) | |
# Custom CSS | |
custom_css = """ | |
#component-0 { | |
max-width: 1100px; | |
margin: auto; | |
} | |
.gradio-container { | |
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; | |
} | |
.progress-bar { | |
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); | |
} | |
""" | |
# Create Gradio Interface | |
with gr.Blocks(css=custom_css, title="Enhanced YouTube Summarizer", theme=gr.themes.Soft()) as demo: | |
gr.HTML(""" | |
<div style="text-align: center; padding: 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 25px; color: white;"> | |
<h1 style="margin: 0; font-size: 2.8em;">π Enhanced YouTube Summarizer</h1> | |
<p style="font-size: 20px; margin: 15px 0; opacity: 0.95;"> | |
Multi-method AI summarization with IP blocking workarounds | |
</p> | |
<p style="opacity: 0.85; margin: 0; font-size: 16px;"> | |
β‘ Multiple extraction methods β’ π Multi-language β’ π‘οΈ Anti-blocking features | |
</p> | |
</div> | |
""") | |
with gr.Row(): | |
with gr.Column(scale=4): | |
url_input = gr.Textbox( | |
label="πΊ YouTube URL", | |
placeholder="https://www.youtube.com/watch?v=dQw4w9WgXcQ", | |
lines=1, | |
info="Enter any YouTube URL - we'll try multiple methods to get content" | |
) | |
with gr.Column(scale=1): | |
submit_btn = gr.Button( | |
"π― Analyze Video", | |
variant="primary", | |
size="lg" | |
) | |
# Progress and status | |
gr.HTML("<div style='margin: 10px 0; padding: 10px; background: #f0f8ff; border-radius: 8px; border-left: 4px solid #4CAF50;'><strong>π Processing Methods:</strong> YouTube API β Page Extraction β Video Info β Demo Mode</div>") | |
# Results | |
with gr.Row(): | |
with gr.Column(scale=1): | |
video_embed = gr.HTML(label="πΊ Video Player") | |
with gr.Column(scale=1): | |
summary_output = gr.Textbox( | |
label="π€ AI Summary", | |
lines=12, | |
max_lines=18, | |
info="AI-generated summary using available content", | |
show_copy_button=True | |
) | |
# Full details | |
with gr.Accordion("π Processing Details & Full Content", open=False): | |
transcript_output = gr.Textbox( | |
label="Complete Processing Log", | |
lines=25, | |
max_lines=35, | |
info="Full extraction details and content", | |
show_copy_button=True | |
) | |
# Working examples | |
gr.HTML("<h3 style='margin-top: 30px; text-align: center;'>β Try these working examples:</h3>") | |
gr.Examples( | |
examples=[ | |
["https://www.youtube.com/watch?v=kJQP7kiw5Fk"], # TED Talk | |
["https://www.youtube.com/watch?v=aircAruvnKk"], # 3Blue1Brown | |
["https://www.youtube.com/watch?v=R9OHn5ZF4Uo"], # Educational | |
["https://youtu.be/9bZkp7q19f0"], # Short format | |
], | |
inputs=url_input, | |
label="Educational Videos (Higher Success Rate)" | |
) | |
# Comprehensive help | |
with gr.Accordion("π οΈ Methods & Troubleshooting Guide", open=False): | |
gr.Markdown(""" | |
## π **Multiple Extraction Methods** | |
This enhanced version tries **4 different approaches** in sequence: | |
### 1. π― **YouTube Transcript API** (Primary) | |
- Direct access to official captions/subtitles | |
- Supports multiple languages (Hi, En, Auto-generated) | |
- **Limitation**: Often blocked on cloud platforms | |
### 2. π **Page Content Extraction** (Fallback #1) | |
- Scrapes video description and metadata from page HTML | |
- Uses rotating user agents to avoid detection | |
- **Works when**: Video has detailed description | |
### 3. π **oEmbed API** (Fallback #2) | |
- Gets basic video information (title, author) | |
- Usually works even when other methods fail | |
- **Provides**: Limited but useful summary | |
### 4. π **Demo Mode** (Final Fallback) | |
- Shows video player and explains tool capabilities | |
- Demonstrates what would happen with working transcript | |
- **Always works**: Never fails completely | |
## π« **IP Blocking Solutions** | |
**Why it happens:** | |
- YouTube blocks cloud provider IPs (AWS, Google Cloud, HuggingFace) | |
- Anti-bot measures to prevent automated access | |
- Rate limiting and geographic restrictions | |
**Our solutions:** | |
- Multiple extraction methods with different approaches | |
- Random user agent rotation | |
- Graceful degradation with useful fallbacks | |
- Clear explanations when methods fail | |
## π **Success Rate by Video Type** | |
**Highest Success (90%+):** | |
- Educational channels (Khan Academy, Crash Course) | |
- TED Talks and conferences | |
- Tutorial and how-to videos | |
- News broadcasts | |
**Medium Success (60-80%):** | |
- Popular YouTubers with good descriptions | |
- Music videos with lyrics in description | |
- Gaming videos with detailed explanations | |
**Lower Success (30-50%):** | |
- Short clips without captions | |
- User-generated content without descriptions | |
- Videos in less common languages | |
- Private or restricted content | |
## π‘ **Pro Tips for Best Results** | |
1. **Choose videos with captions**: Look for CC icon on YouTube | |
2. **Educational content works best**: Formal channels have better transcripts | |
3. **Try multiple videos**: Success varies by content type | |
4. **Check video description**: Rich descriptions help alternative methods | |
5. **Use popular channels**: They often have auto-generated captions | |
## π§ **Technical Features** | |
- **Smart chunking**: Handles long videos efficiently | |
- **Memory optimization**: Prevents crashes on limited resources | |
- **Multi-language support**: Hindi, English, Hinglish detection | |
- **Error recovery**: Continues processing despite partial failures | |
- **Progress tracking**: Real-time status updates | |
## π **Still Having Issues?** | |
1. **Try different videos**: Success varies significantly | |
2. **Check video accessibility**: Must be public with some form of text content | |
3. **Wait and retry**: IP blocks are often temporary | |
4. **Use local deployment**: Download and run on your own machine | |
5. **Report issues**: Let us know which videos consistently fail | |
""") | |
# Event handlers | |
submit_btn.click( | |
fn=process_youtube_video, | |
inputs=[url_input], | |
outputs=[video_embed, transcript_output, summary_output] | |
) | |
url_input.submit( | |
fn=process_youtube_video, | |
inputs=[url_input], | |
outputs=[video_embed, transcript_output, summary_output] | |
) | |
# Launch configuration | |
if __name__ == "__main__": | |
demo.queue(max_size=3, default_concurrency_limit=1) | |
demo.launch( | |
server_name="0.0.0.0", | |
server_port=7860, | |
share=False, | |
debug=False, | |
show_error=True, | |
max_threads=1 | |
) |