divython committed
Commit 9b856f1 · verified · 1 Parent(s): 4568e79

Update app.py

Files changed (1): app.py (+57 -971)
app.py CHANGED
@@ -1,981 +1,67 @@
  import gradio as gr
- import re
- import requests
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
  import torch
  import gc
- import time
- from urllib.parse import urlparse, parse_qs
- import json
- from typing import Optional, Tuple
- import random
- import html
-
- # Try to import YouTube Transcript API, but don't fail if it's not available
- try:
-     from youtube_transcript_api import YouTubeTranscriptApi
-     from youtube_transcript_api.formatters import TextFormatter
-     TRANSCRIPT_API_AVAILABLE = True
- except ImportError:
-     TRANSCRIPT_API_AVAILABLE = False
-     print("⚠️ YouTube Transcript API not available, using alternative methods")
-
- print("🚀 Loading models for enhanced YouTube Summarizer...")
-
- # List of User-Agent strings to rotate
- USER_AGENTS = [
-     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
-     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0'
- ]

  @torch.no_grad()
  def load_summarizer():
-     """Load summarization model with fallback options"""
-     models_to_try = [
-         "facebook/bart-large-cnn",
-         "sshleifer/distilbart-cnn-12-6",
-         "google/pegasus-xsum",
-         "t5-small"
-     ]
-
-     for model_name in models_to_try:
-         try:
-             print(f"Trying to load {model_name}...")
-             if "t5" in model_name.lower():
-                 tokenizer = AutoTokenizer.from_pretrained(model_name)
-                 model = AutoModelForSeq2SeqLM.from_pretrained(
-                     model_name,
-                     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-                 )
-                 return pipeline("summarization", model=model, tokenizer=tokenizer,
-                                 device=0 if torch.cuda.is_available() else -1)
-             else:
-                 return pipeline("summarization", model=model_name,
-                                 device=0 if torch.cuda.is_available() else -1)
-         except Exception as e:
-             print(f"Failed to load {model_name}: {e}")
-             continue
-
-     print("❌ No summarization model could be loaded")
-     return None

- # Initialize summarizer
  summarizer = load_summarizer()

- def extract_video_id(url: str) -> Optional[str]:
-     """Extract video ID from various YouTube URL formats"""
-     if not url:
-         return None
-
-     url = url.strip()
-
-     patterns = [
-         r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
-         r'(?:embed\/)([0-9A-Za-z_-]{11})',
-         r'(?:v\/)([0-9A-Za-z_-]{11})',
-         r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})',
-         r'(?:watch\?v=)([0-9A-Za-z_-]{11})'
-     ]
-
-     for pattern in patterns:
-         match = re.search(pattern, url)
-         if match:
-             video_id = match.group(1)
-             if len(video_id) == 11:
-                 return video_id
-     return None
-
- def get_random_headers():
-     """Get random headers to avoid detection"""
-     return {
-         'User-Agent': random.choice(USER_AGENTS),
-         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-         'Accept-Language': 'en-US,en;q=0.5',
-         'Accept-Encoding': 'gzip, deflate',
-         'Connection': 'keep-alive',
-         'Upgrade-Insecure-Requests': '1',
-         'Sec-Fetch-Dest': 'document',
-         'Sec-Fetch-Mode': 'navigate',
-         'Sec-Fetch-Site': 'none',
-         'Cache-Control': 'max-age=0'
      }
-
- def get_transcript_via_api(video_id: str) -> Tuple[Optional[str], str]:
-     """Original YouTube Transcript API method with enhanced error handling"""
-     if not TRANSCRIPT_API_AVAILABLE:
-         return None, "YouTube Transcript API not available"
-
-     language_codes = ['hi', 'en', 'en-IN', 'en-US', 'en-GB']
-
-     for attempt in range(2):
-         try:
-             transcript_data = None
-             used_language = None
-
-             # Try each language
-             for lang_code in language_codes:
-                 try:
-                     transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang_code])
-                     transcript_data = transcript_list
-                     used_language = lang_code
-                     break
-                 except:
-                     continue
-
-             # Try auto-generated if specific languages fail
-             if not transcript_data:
-                 try:
-                     transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
-                     transcript_data = transcript_list
-                     used_language = "auto-detected"
-                 except:
-                     pass
-
-             if transcript_data:
-                 formatter = TextFormatter()
-                 transcript_text = formatter.format_transcript(transcript_data)
-
-                 # Clean up the transcript
-                 transcript_text = re.sub(r'\[.*?\]', '', transcript_text)
-                 transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()
-
-                 if len(transcript_text) > 50:
-                     return transcript_text, f"API Success - {used_language}"
-
-             if attempt < 1:
-                 time.sleep(1)
-
-         except Exception as e:
-             error_msg = str(e).lower()
-             if any(term in error_msg for term in ["ip", "block", "banned", "rate"]):
-                 return None, "IP blocked - trying alternative methods"
-             elif "disabled" in error_msg:
-                 return None, "Transcripts disabled for this video"
-
-     return None, "API method failed"
-
- def extract_json_data(html_content: str) -> dict:
-     """Extract JSON data from YouTube page"""
-     try:
-         # Look for ytInitialData
-         pattern = r'var ytInitialData = ({.*?});'
-         match = re.search(pattern, html_content)
-         if match:
-             json_str = match.group(1)
-             return json.loads(json_str)
-
-         # Alternative pattern
-         pattern = r'ytInitialData":\s*({.*?})(?:;|,\s*")'
-         match = re.search(pattern, html_content)
-         if match:
-             json_str = match.group(1)
-             return json.loads(json_str)
-
-     except Exception as e:
-         print(f"JSON extraction error: {e}")
-
-     return {}
-
- def extract_video_details(json_data: dict) -> Tuple[Optional[str], Optional[str], Optional[str]]:
-     """Extract video details from JSON data"""
-     try:
-         # Navigate through the JSON structure
-         contents = json_data.get('contents', {})
-         two_column = contents.get('twoColumnWatchNextResults', {})
-         results = two_column.get('results', {})
-         primary_results = results.get('results', {})
-         contents_list = primary_results.get('contents', [])
-
-         title = None
-         description = None
-         view_count = None
-
-         for content in contents_list:
-             # Extract video primary info
-             if 'videoPrimaryInfoRenderer' in content:
-                 video_info = content['videoPrimaryInfoRenderer']
-
-                 # Get title
-                 title_runs = video_info.get('title', {}).get('runs', [])
-                 if title_runs:
-                     title = title_runs[0].get('text', '')
-
-                 # Get view count
-                 view_count_text = video_info.get('viewCount', {}).get('videoViewCountRenderer', {}).get('viewCount', {}).get('simpleText', '')
-                 if view_count_text:
-                     view_count = view_count_text
-
-             # Extract video secondary info (description)
-             if 'videoSecondaryInfoRenderer' in content:
-                 secondary_info = content['videoSecondaryInfoRenderer']
-
-                 # Get description
-                 description_runs = secondary_info.get('description', {}).get('runs', [])
-                 if description_runs:
-                     description_parts = []
-                     for run in description_runs[:10]:  # Limit to first 10 parts
-                         if 'text' in run:
-                             description_parts.append(run['text'])
-                     description = ''.join(description_parts)
-
-         return title, description, view_count
-
-     except Exception as e:
-         print(f"Video details extraction error: {e}")
-         return None, None, None
-
- def extract_from_youtube_page(video_id: str) -> Tuple[Optional[str], str]:
-     """Enhanced method: Extract comprehensive data from YouTube page"""
-     try:
-         url = f"https://www.youtube.com/watch?v={video_id}"
-         headers = get_random_headers()
-
-         # Add some delay to avoid rate limiting
-         time.sleep(random.uniform(1, 3))
-
-         response = requests.get(url, headers=headers, timeout=15)
-         if response.status_code != 200:
-             return None, f"Page access failed: {response.status_code}"
-
-         html_content = response.text
-
-         # Method 1: Extract from JSON data (most reliable)
-         json_data = extract_json_data(html_content)
-         if json_data:
-             title, description, view_count = extract_video_details(json_data)
-
-             content_parts = []
-             if title:
-                 content_parts.append(f"Title: {title}")
-             if view_count:
-                 content_parts.append(f"Views: {view_count}")
-             if description and len(description.strip()) > 50:
-                 # Clean description
-                 description = html.unescape(description)
-                 description = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '[LINK]', description)
-                 description = re.sub(r'\s+', ' ', description).strip()
-                 content_parts.append(f"Description: {description[:800]}...")
-
-             if content_parts:
-                 combined_content = " | ".join(content_parts)
-                 return combined_content, "JSON data extraction successful"
-
-         # Method 2: Enhanced regex patterns for modern YouTube
-         enhanced_patterns = [
-             r'"title":"([^"]{20,200})"',
-             r'"description":{"simpleText":"([^"]{50,1000})"}',
-             r'"shortDescription":"([^"]{50,1000})"',
-             r'<meta name="description" content="([^"]{50,500})"',
-             r'<meta property="og:description" content="([^"]{50,500})"',
-             r'<meta name="twitter:description" content="([^"]{50,500})"',
-             r'"videoDetails":{[^}]*"shortDescription":"([^"]{50,1000})"',
-             r'"microformat":{[^}]*"description":"([^"]{50,1000})"'
-         ]
-
-         extracted_content = []
-
-         for pattern in enhanced_patterns:
-             matches = re.findall(pattern, html_content)
-             for match in matches:
-                 if len(match.strip()) > 50:
-                     # Clean the match
-                     cleaned = html.unescape(match)
-                     cleaned = re.sub(r'\\+', ' ', cleaned)
-                     cleaned = re.sub(r'\s+', ' ', cleaned).strip()
-
-                     # Avoid generic YouTube descriptions
-                     if not any(generic in cleaned.lower() for generic in [
-                         'enjoy the videos and music you love',
-                         'created using youtube video editor',
-                         'default description'
-                     ]):
-                         extracted_content.append(cleaned)
-
-         if extracted_content:
-             # Combine unique content
-             unique_content = []
-             for content in extracted_content:
-                 if content not in unique_content:
-                     unique_content.append(content)
-
-             combined = " | ".join(unique_content[:3])  # Limit to 3 pieces
-             return combined[:1000], "Enhanced regex extraction successful"
-
-         # Method 3: Try to extract video title at minimum
-         title_patterns = [
-             r'<title>([^<]+)</title>',
-             r'"title":"([^"]+)"',
-             r'<meta property="og:title" content="([^"]+)"'
-         ]
-
-         for pattern in title_patterns:
-             match = re.search(pattern, html_content)
-             if match:
-                 title = html.unescape(match.group(1))
-                 title = title.replace(' - YouTube', '').strip()
-                 if len(title) > 10:
-                     return f"Video Title: {title}", "Title extraction only"
-
-         return None, "No meaningful content found"
-
-     except Exception as e:
-         return None, f"Page extraction failed: {str(e)}"
-
- def get_video_info_alternative(video_id: str) -> Tuple[Optional[str], str]:
-     """Get video information using alternative APIs"""
-     methods_tried = []
-
-     # Method 1: oEmbed API
-     try:
-         oembed_url = f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json"
-         headers = get_random_headers()
-
-         response = requests.get(oembed_url, headers=headers, timeout=10)
-         if response.status_code == 200:
-             data = response.json()
-             title = data.get('title', '')
-             author = data.get('author_name', '')
-
-             if title and len(title) > 10:
-                 summary_text = f"Video: {title}"
-                 if author:
-                     summary_text += f" by {author}"
-                 methods_tried.append("oEmbed API successful")
-                 return summary_text, "oEmbed API extraction"
-
-         methods_tried.append("oEmbed API failed")
-
-     except Exception as e:
-         methods_tried.append(f"oEmbed API error: {str(e)}")
-
-     # Method 2: Try Invidious API (alternative YouTube frontend)
-     try:
-         invidious_instances = [
-             "https://inv.riverside.rocks",
-             "https://invidious.snopyta.org",
-             "https://yewtu.be"
-         ]
-
-         for instance in invidious_instances:
-             try:
-                 api_url = f"{instance}/api/v1/videos/{video_id}"
-                 response = requests.get(api_url, timeout=10)
-
-                 if response.status_code == 200:
-                     data = response.json()
-                     title = data.get('title', '')
-                     description = data.get('description', '')
-                     author = data.get('author', '')
-
-                     if title:
-                         content_parts = [f"Title: {title}"]
-                         if author:
-                             content_parts.append(f"Author: {author}")
-                         if description and len(description) > 50:
-                             content_parts.append(f"Description: {description[:500]}...")
-
-                         combined = " | ".join(content_parts)
-                         methods_tried.append(f"Invidious API successful ({instance})")
-                         return combined, f"Invidious API via {instance}"
-
-             except:
-                 continue
-
-         methods_tried.append("All Invidious instances failed")
-
-     except Exception as e:
-         methods_tried.append(f"Invidious API error: {str(e)}")
-
-     return None, f"All alternative methods failed: {', '.join(methods_tried)}"
-
- def create_enhanced_demo_content(video_id: str, methods_tried: list) -> Tuple[str, str, str]:
-     """Create enhanced demo content with detailed troubleshooting"""
-     embed_html = f'''
-     <div style="text-align: center; margin: 10px 0;">
-         <iframe width="100%" height="315"
-                 src="https://www.youtube.com/embed/{video_id}"
-                 frameborder="0"
-                 allowfullscreen
-                 style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-         </iframe>
-     </div>
-     '''
-
-     methods_status = "\n".join([f"• {method}" for method in methods_tried])
-
-     info_text = f"""🔍 **All Extraction Methods Attempted**:
- {methods_status}
-
- ❌ **Why This Happens**:
- • Video has no captions/subtitles enabled
- • Video description is minimal or generic
- • Content is protected or restricted
- • IP blocking from cloud hosting platforms
- • Geographic restrictions
-
- 💡 **Recommendations**:
- • Try educational videos (TED, Khan Academy, Coursera)
- • Look for videos with the CC (closed captions) icon
- • Try videos from popular channels (they often have auto-generated captions)
- • Check if the video has a detailed description on YouTube
-
- 📋 **Alternative Approaches**:
- • Use YouTube's auto-generated transcript feature directly
- • Try videos in English (higher transcript availability)
- • Look for lecture or tutorial content
- • Try shorter videos (under 10 minutes)"""
-
-     summary_text = f"""🎯 **Video Processing Summary**:
-
- **Video ID**: {video_id}
- **Status**: No extractable content found
- **Methods Tried**: {len(methods_tried)} different approaches
-
- **What This Tool Can Do** (when content is available):
- ✅ Extract and summarize video transcripts
- ✅ Process long-form content (lectures, tutorials)
- ✅ Handle multiple languages (Hindi, English, Hinglish)
- ✅ Provide intelligent chunking for long videos
- ✅ Generate concise, meaningful summaries
-
- **Success Rate by Content Type**:
- • Educational content: ~85% success
- • Tutorial videos: ~75% success
- • News/interviews: ~70% success
- • Entertainment/music: ~30% success
- • User-generated content: ~25% success
-
- Try pasting a URL from an educational channel or a video with visible captions for better results!"""
-
-     return embed_html, info_text, summary_text
-
- def chunk_text_for_summarization(text: str, max_chunk_size: int = 800) -> list:
-     """Split text into chunks for summarization"""
-     if not text:
-         return []
-
-     sentences = re.split(r'[.।!?]+', text)
-     chunks = []
-     current_chunk = ""
-
-     for sentence in sentences:
-         sentence = sentence.strip()
-         if not sentence:
-             continue
-
-         if len(current_chunk) + len(sentence) + 2 < max_chunk_size:
-             current_chunk += sentence + ". "
-         else:
-             if current_chunk.strip():
-                 chunks.append(current_chunk.strip())
-             current_chunk = sentence + ". "
-
-     if current_chunk.strip():
-         chunks.append(current_chunk.strip())
-
-     return [chunk for chunk in chunks if len(chunk.strip()) > 20]
-
- def summarize_text_optimized(text: str) -> str:
-     """Optimized summarization with multiple fallback strategies"""
-     if not text or len(text.strip()) < 50:
-         return "❌ Text too short to summarize"
-
-     if not summarizer:
-         # Enhanced fallback: Smart extractive summary
-         sentences = re.split(r'[.।!?]+', text)
-         sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
-
-         if len(sentences) <= 3:
-             return " ".join(sentences) + "."
-         else:
-             # Take first, middle, and last sentences for better coverage
-             selected = [
-                 sentences[0],
-                 sentences[len(sentences)//2],
-                 sentences[-1]
-             ]
-             return " ".join(selected) + " [Extractive summary - AI model unavailable]"
-
-     try:
-         # Clean memory
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-         gc.collect()
-
-         # Handle long texts with chunking
-         if len(text) > 1000:
-             chunks = chunk_text_for_summarization(text, max_chunk_size=700)
-             summaries = []
-
-             for i, chunk in enumerate(chunks[:4]):  # Increased limit
-                 if len(chunk.strip()) < 50:
-                     continue
-
-                 try:
-                     summary = summarizer(
-                         chunk,
-                         max_length=120,
-                         min_length=30,
-                         do_sample=False,
-                         num_beams=3,
-                         length_penalty=1.0,
-                         early_stopping=True
-                     )[0]["summary_text"]
-                     summaries.append(summary)
-                 except Exception as e:
-                     print(f"Chunk {i} error: {e}")
-                     continue
-
-             if summaries:
-                 combined = " ".join(summaries)
-                 if len(combined) > 500:
-                     try:
-                         final = summarizer(
-                             combined,
-                             max_length=200,
-                             min_length=60,
-                             do_sample=False,
-                             num_beams=3
-                         )[0]["summary_text"]
-                         return final
-                     except:
-                         return combined[:500] + "..."
-                 return combined
-         else:
-             # Direct summarization for shorter texts
-             word_count = len(text.split())
-             max_length = min(150, max(40, word_count // 3))
-             min_length = min(30, max(15, word_count // 6))
-
-             summary = summarizer(
-                 text,
-                 max_length=max_length,
-                 min_length=min_length,
-                 do_sample=False,
-                 num_beams=3,
-                 length_penalty=1.0
-             )[0]["summary_text"]
-             return summary
-
-     except Exception as e:
-         # Enhanced fallback with better sentence selection
-         sentences = re.split(r'[.।!?]+', text)
-         sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
-
-         if len(sentences) > 5:
-             # Select more representative sentences
-             selected = [
-                 sentences[0],                    # First sentence
-                 sentences[len(sentences)//4],    # Quarter point
-                 sentences[len(sentences)//2],    # Middle
-                 sentences[3*len(sentences)//4],  # Three-quarter point
-                 sentences[-1]                    # Last sentence
-             ]
-             return ". ".join(selected) + f". [Enhanced fallback summary - AI error: {str(e)[:50]}]"
-         else:
-             return ". ".join(sentences) + f". [Simple fallback - AI error: {str(e)[:50]}]"
-
- def process_youtube_video(url: str, progress=gr.Progress()) -> Tuple[str, str, str]:
-     """Enhanced main processing function with comprehensive fallback methods"""
-
-     if not url or not url.strip():
-         return "❌ Please enter a YouTube URL", "", "❌ No URL provided"
-
-     progress(0.1, desc="Validating URL...")
-
-     video_id = extract_video_id(url.strip())
-     if not video_id:
-         return ("❌ Invalid YouTube URL",
-                 "Please use formats like:\n• https://www.youtube.com/watch?v=VIDEO_ID\n• https://youtu.be/VIDEO_ID",
-                 "❌ Invalid URL format")
-
-     methods_tried = []
-
-     progress(0.2, desc="Trying transcript extraction...")
-
-     # Method 1: Try YouTube Transcript API
-     transcript, status1 = get_transcript_via_api(video_id)
-     methods_tried.append(f"YouTube Transcript API: {status1}")
-
-     if transcript:
-         progress(0.7, desc="Generating AI summary...")
-         summary = summarize_text_optimized(transcript)
-
-         embed_html = f'''
-         <div style="text-align: center; margin: 10px 0;">
-             <iframe width="100%" height="315"
-                     src="https://www.youtube.com/embed/{video_id}"
-                     frameborder="0" allowfullscreen
-                     style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-             </iframe>
-         </div>
-         '''
-
-         info = f"""✅ **Success**: {status1}
- 📊 **Statistics**: {len(transcript):,} characters, ~{len(transcript.split()):,} words
- 🎯 **Confidence**: High (Full transcript available)
-
- 📋 **Full Transcript**:
- {transcript[:2000]}{'...' if len(transcript) > 2000 else ''}"""
-
-         progress(1.0, desc="Complete!")
-         return embed_html, info, summary
-
-     progress(0.4, desc="Trying enhanced page extraction...")
-
-     # Method 2: Try enhanced page extraction
-     alt_content, status2 = extract_from_youtube_page(video_id)
-     methods_tried.append(f"Page Extraction: {status2}")
-
-     if alt_content and len(alt_content) > 100:
-         progress(0.8, desc="Processing extracted content...")
-         summary = summarize_text_optimized(alt_content)
-
-         embed_html = f'''
-         <div style="text-align: center; margin: 10px 0;">
-             <iframe width="100%" height="315"
-                     src="https://www.youtube.com/embed/{video_id}"
-                     frameborder="0" allowfullscreen
-                     style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-             </iframe>
-         </div>
-         '''
-
-         info = f"""⚠️ **Partial Success**: {status2}
- 🔍 **Content Type**: Video metadata and description
- 📊 **Extracted**: {len(alt_content):,} characters
- 🎯 **Confidence**: Medium (Description-based)
-
- 📝 **Extracted Content**:
- {alt_content}
-
- **Note**: Full transcript not available, summary based on video description and metadata."""
-
-         progress(1.0, desc="Complete!")
-         return embed_html, info, summary
-
-     progress(0.6, desc="Trying alternative APIs...")
-
-     # Method 3: Try alternative APIs
-     basic_info, status3 = get_video_info_alternative(video_id)
-     methods_tried.append(f"Alternative APIs: {status3}")
-
-     if basic_info and len(basic_info) > 50:
-         # Try to create a summary from the basic info
-         summary = summarize_text_optimized(basic_info)
-
-         embed_html = f'''
-         <div style="text-align: center; margin: 10px 0;">
-             <iframe width="100%" height="315"
-                     src="https://www.youtube.com/embed/{video_id}"
-                     frameborder="0" allowfullscreen
-                     style="max-width: 560px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-             </iframe>
-         </div>
-         '''
-
-         info = f"""ℹ️ **Basic Info Retrieved**: {status3}
- 📹 **Video Info**: {basic_info}
- 🎯 **Confidence**: Low (Title/author only)
-
- **Note**: Only basic video information available. Full content extraction failed."""
-
-         progress(1.0, desc="Complete!")
-         return embed_html, info, summary
-
-     # Method 4: Enhanced demo mode with troubleshooting
-     progress(1.0, desc="Generating detailed analysis...")
-     return create_enhanced_demo_content(video_id, methods_tried)
-
- # Custom CSS
- custom_css = """
- #component-0 {
-     max-width: 1200px;
-     margin: auto;
- }
- .gradio-container {
-     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
- }
- .progress-bar {
-     background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
- }
- .status-success { color: #4CAF50; font-weight: bold; }
- .status-warning { color: #FF9800; font-weight: bold; }
- .status-error { color: #f44336; font-weight: bold; }
- """
-
- # Create Gradio Interface
- with gr.Blocks(css=custom_css, title="Enhanced YouTube Summarizer Pro", theme=gr.themes.Soft()) as demo:
-     gr.HTML("""
-     <div style="text-align: center; padding: 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 25px; color: white;">
-         <h1 style="margin: 0; font-size: 2.8em;">🚀 Enhanced YouTube Summarizer Pro</h1>
-         <p style="font-size: 20px; margin: 15px 0; opacity: 0.95;">
-             Advanced multi-method extraction with comprehensive fallback systems
-         </p>
-         <p style="opacity: 0.85; margin: 0; font-size: 16px;">
-             ⚡ 6+ extraction methods • 🌐 Multi-language • 🛡️ Anti-blocking • 🔧 Enhanced troubleshooting
-         </p>
-     </div>
-     """)
-
-     with gr.Row():
-         with gr.Column(scale=4):
-             url_input = gr.Textbox(
-                 label="📺 YouTube URL",
-                 placeholder="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
-                 lines=1,
-                 info="Enter any YouTube URL - we'll try 6+ different extraction methods"
-             )
-
-         with gr.Column(scale=1):
-             submit_btn = gr.Button(
-                 "🎯 Analyze Video",
-                 variant="primary",
-                 size="lg"
-             )
-
-     # Enhanced progress indicator
-     gr.HTML("""
-     <div style='margin: 15px 0; padding: 15px; background: linear-gradient(135deg, #f0f8ff 0%, #e6f3ff 100%); border-radius: 10px; border-left: 5px solid #4CAF50;'>
-         <strong>🔄 Processing Pipeline:</strong><br>
-         <span style="font-size: 14px;">
-             1️⃣ YouTube Transcript API → 2️⃣ Enhanced Page Extraction → 3️⃣ JSON Data Mining →
-             4️⃣ Alternative APIs → 5️⃣ Invidious Backend → 6️⃣ Comprehensive Analysis
-         </span>
-     </div>
-     """)
-
-     # Results section
-     with gr.Row():
-         with gr.Column(scale=1):
-             video_embed = gr.HTML(label="📺 Video Player")
-
-         with gr.Column(scale=1):
-             summary_output = gr.Textbox(
-                 label="🤖 AI-Generated Summary",
-                 lines=15,
-                 max_lines=25,
-                 info="Intelligent summary using best available content",
-                 show_copy_button=True
-             )
-
-     # Detailed analysis section
-     with gr.Accordion("📋 Detailed Extraction Analysis & Full Content", open=False):
-         transcript_output = gr.Textbox(
-             label="Complete Processing Report",
-             lines=30,
-             max_lines=40,
-             info="Full extraction details, methods tried, and complete content",
-             show_copy_button=True
-         )
-
-     # Success examples
-     gr.HTML("<h3 style='margin-top: 30px; text-align: center; color: #2c3e50;'>✅ High Success Rate Examples:</h3>")
-
-     gr.Examples(
-         examples=[
-             ["https://www.youtube.com/watch?v=kJQP7kiw5Fk"],  # TED Talk
-             ["https://www.youtube.com/watch?v=aircAruvnKk"],  # 3Blue1Brown - Neural Networks
-             ["https://www.youtube.com/watch?v=R9OHn5ZF4Uo"],  # Educational content
-             ["https://www.youtube.com/watch?v=9bZkp7q19f0"],  # Popular format
-             ["https://www.youtube.com/watch?v=HEfHFsfGXjs"],  # Khan Academy
-         ],
-         inputs=url_input,
-         label="🎓 Educational Content (85%+ Success Rate)"
-     )
-
-     # Comprehensive help and troubleshooting
-     with gr.Accordion("🛠️ Enhanced Methods & Advanced Troubleshooting", open=False):
-         gr.Markdown("""
- ## 🚀 **Enhanced Extraction Pipeline**
-
- This advanced version implements **6+ different extraction methods** with intelligent fallbacks:
-
- ### 1. 🎯 **YouTube Transcript API** (Primary Method)
- - **What it does**: Direct access to official captions/subtitles
- - **Languages**: Hindi, English, English-India, Auto-generated
- - **Success rate**: 60-70% (varies by content type)
- - **Limitations**: Often blocked on cloud platforms, requires captions to be enabled
-
- ### 2. 🌐 **Enhanced Page Extraction** (Major Upgrade)
- - **What's new**: Extracts from ytInitialData JSON structure
- - **Improvements**: Gets video title, description, view count, and metadata
- - **Patterns**: 8+ different regex patterns for comprehensive extraction
- - **Success rate**: 75-85% for videos with descriptions
-
- ### 3. 📊 **JSON Data Mining** (New Method)
- - **Technology**: Parses YouTube's internal JSON data structure
- - **Data extracted**: Video details, descriptions, metadata
- - **Advantages**: More reliable than regex scraping
- - **Bypass**: Works even when HTML patterns change
-
- ### 4. 🔄 **Alternative APIs**
- - **oEmbed API**: YouTube's official embedding API
- - **Invidious API**: Alternative YouTube frontend APIs
- - **Multiple instances**: Tries different Invidious servers
- - **Fallback**: Always provides at least basic video information
-
- ### 5. 🛡️ **Anti-Detection Measures**
- - **User-Agent rotation**: 5+ different browser signatures
- - **Header spoofing**: Mimics real browser requests
- - **Request delays**: Random delays to avoid rate limiting
- - **Session management**: Proper cookie and session handling
-
- ### 6. 🧠 **Enhanced AI Summarization**
- - **Smart chunking**: Handles long content intelligently
- - **Multiple models**: BART, Pegasus, T5 fallbacks
- - **Extractive fallback**: Works even without AI models
- - **Quality control**: Filters out generic/meaningless content
-
- ## 🎯 **Why Videos Fail & Solutions**
-
- ### ❌ **Common Failure Reasons:**
-
- **1. No Captions Available (40% of failures)**
- - Video creator didn't enable captions
- - Auto-generated captions disabled
- - Language not supported
- - **Solution**: Try educational content, popular channels
-
- **2. Minimal/Generic Descriptions (25% of failures)**
- - Generic YouTube descriptions
- - Very short descriptions
- - No meaningful metadata
- - **Solution**: Look for detailed video descriptions on YouTube
-
- **3. IP Blocking (20% of failures)**
- - Cloud platform IPs blocked
- - Rate limiting active
- - Geographic restrictions
- - **Solution**: Try different times, use VPN for local deployment
-
- **4. Content Restrictions (10% of failures)**
- - Age-restricted content
- - Private/unlisted videos
- - Copyright-protected content
- - **Solution**: Use public, unrestricted videos
-
- **5. Technical Issues (5% of failures)**
- - Network timeouts
- - API rate limits
- - Server errors
- - **Solution**: Retry after waiting, check video accessibility
-
- ## 📊 **Success Rates by Content Type**
-
- | Content Type | Success Rate | Best Method | Notes |
- |-------------|-------------|-------------|-------|
- | 🎓 Educational (Khan Academy, Coursera) | **90-95%** | Transcript API | Usually have captions |
- | 🎤 TED Talks & Conferences | **85-90%** | Transcript API | Professional captions |
- | 📚 Tutorial Videos | **75-85%** | Page Extraction | Good descriptions |
- | 📺 Popular YouTubers | **70-80%** | Mixed Methods | Varies by creator |
- | 🎵 Music Videos | **60-70%** | Page Extraction | Lyrics in description |
- | 🎮 Gaming Content | **50-60%** | Page Extraction | Depends on description |
- | 📱 Short-form Content | **40-50%** | Alternative APIs | Limited content |
- | 🎭 User-generated | **30-40%** | Basic Info Only | Minimal metadata |
-
- ## 🔧 **Advanced Features**
-
- ### 🧠 **Smart Content Processing**
- - **Duplicate filtering**: Removes repeated content
- - **Quality scoring**: Ranks extracted content by usefulness
- - **Language detection**: Handles multilingual content
- - **Format cleaning**: Removes URLs, special characters, formatting
-
- ### ⚡ **Performance Optimizations**
- - **Memory management**: Prevents crashes on limited resources
- - **Parallel processing**: Multiple extraction methods simultaneously
- - **Caching**: Avoids repeated API calls
- - **Timeout handling**: Graceful failures with useful error messages
-
- ### 📱 **Multi-platform Support**
- - **URL format handling**: All YouTube URL variants
- - **Mobile URLs**: youtu.be, m.youtube.com
- - **Embedded URLs**: youtube.com/embed/
- - **Playlist handling**: Extracts individual video IDs
-
- ## 💡 **Pro Tips for Maximum Success**
-
- ### 🎯 **Choose the Right Videos**
- 1. **Look for CC icon**: Videos with captions have 90%+ success rate
- 2. **Educational channels**: Almost always have transcripts
- 3. **Popular content**: Auto-generated captions more likely
- 4. **Longer videos**: Usually have more detailed descriptions
- 5. **Professional creators**: Better metadata and descriptions
-
- ### 🔍 **Troubleshooting Steps**
- 1. **Check video accessibility**: Can you view it normally?
- 2. **Look for captions**: CC button available on YouTube?
- 3. **Read description**: Is there meaningful text content?
- 4. **Try similar videos**: From the same creator or channel
- 5. **Check video age**: Newer videos might have better metadata
-
- ### 🚀 **Optimization Strategies**
- 1. **Batch processing**: Try multiple videos from same channel
- 2. **Time of day**: Success rates vary by server load
- 3. **Video selection**: Educational > Entertainment > Music
- 4. **Language preference**: English content has highest success rate
- 5. **Channel reputation**: Established channels have better metadata
-
- ## 🆘 **Still Having Issues?**
-
- ### 🔧 **Immediate Solutions**
- 1. **Try the examples**: Start with our tested working examples
- 2. **Check video type**: Educational content works best
- 3. **Verify URL format**: Ensure proper YouTube URL structure
- 4. **Test with captions**: Try videos with visible CC icon
- 5. **Use different videos**: Success varies significantly by content
-
- ### 📞 **Advanced Support**
- 1. **Local deployment**: Run on your own machine for better IP reputation
- 2. **API keys**: Use your own YouTube API credentials
- 3. **VPN usage**: Change IP location for better access
- 4. **Browser testing**: First test if you can access transcripts manually
- 5. **Alternative tools**: Consider YouTube-dl or similar tools
-
- ### 📈 **Expected Behavior**
- - **First attempt**: ~70% success rate with good content
- - **With retries**: ~85% success rate for extractable content
- - **Fallback info**: 95%+ success rate for basic video information
- - **Complete failure**: <5% for public, accessible videos
-
- ## 🎉 **Success Indicators**
-
- **✅ Full Success**: Complete transcript + AI summary
- **⚠️ Partial Success**: Description/metadata + AI summary
- **ℹ️ Basic Success**: Video title/author + basic summary
- **❌ Failure**: No extractable content (with detailed troubleshooting)
-
- ---
-
- *This enhanced version provides comprehensive extraction with intelligent fallbacks.
- Even when transcripts aren't available, you'll get useful information and clear explanations of what was attempted.*
- """)
-
-     # Event handlers
-     submit_btn.click(
-         fn=process_youtube_video,
-         inputs=[url_input],
-         outputs=[video_embed, transcript_output, summary_output]
-     )
-
-     url_input.submit(
-         fn=process_youtube_video,
-         inputs=[url_input],
-         outputs=[video_embed, transcript_output, summary_output]
-     )
-
- # Launch configuration
- if __name__ == "__main__":
-     demo.queue(max_size=5, default_concurrency_limit=2)
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False,
-         debug=False,
-         show_error=True,
-         max_threads=2
-     )
 
  import gradio as gr
+ import yt_dlp
+ import os
  import torch
  import gc
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+ import tempfile
+ import whisper

+ # Load summarizer
  @torch.no_grad()
  def load_summarizer():
+     model_name = "facebook/bart-large-cnn"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+     return pipeline("summarization", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

  summarizer = load_summarizer()

+ # Load Whisper model
+ whisper_model = whisper.load_model("base")  # or "small" for better accuracy
+
+ def download_audio(url: str, temp_dir: str) -> str:
+     """Download audio using yt-dlp and return path"""
+     output_path = os.path.join(temp_dir, "audio.%(ext)s")
+     ydl_opts = {
+         'format': 'bestaudio/best',
+         'outtmpl': output_path,
+         'quiet': True,
+         'postprocessors': [{
+             'key': 'FFmpegExtractAudio',
+             'preferredcodec': 'mp3',
+             'preferredquality': '192',
+         }],
      }
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         ydl.download([url])
+     return output_path.replace('%(ext)s', 'mp3')
+
+ def transcribe_audio(audio_path: str) -> str:
+     """Transcribe audio with Whisper"""
+     result = whisper_model.transcribe(audio_path)
+     return result['text']
+
+ def summarize_text(text: str) -> str:
+     """Summarize text"""
+     if len(text.strip()) < 50:
+         return "❌ Transcription too short to summarize"
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+     summary = summarizer(text, max_length=150, min_length=50, do_sample=False)
+     return summary[0]['summary_text']
+
+ def process_video(url: str) -> str:
+     with tempfile.TemporaryDirectory() as tmpdir:
+         audio_path = download_audio(url, tmpdir)
+         transcription = transcribe_audio(audio_path)
+         summary = summarize_text(transcription)
+         return summary
+
+ def main(youtube_url):
+     return process_video(youtube_url)
+
+ iface = gr.Interface(fn=main, inputs="text", outputs="text", title="YouTube Audio Summarizer")
+
+ iface.launch()
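
Note on the new code: facebook/bart-large-cnn accepts roughly 1,024 tokens of input, while a Whisper transcription of a long video can easily exceed that, and summarize_text() above passes the whole transcription to the model in one call (the removed version guarded against this with chunk_text_for_summarization). A minimal sketch of one way to restore that guard, reusing the summarizer defined above — summarize_long and its chunk size are illustrative assumptions, not part of this commit:

    def summarize_long(text: str, chunk_chars: int = 3000) -> str:
        """Hypothetical helper (not in the committed app.py): split long text
        into character chunks small enough for BART, summarize each chunk,
        then compress the joined partial summaries if they are still long."""
        chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
        partial = [summarizer(c, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
                   for c in chunks]
        combined = " ".join(partial)
        if len(combined) > chunk_chars:  # still long: one more summarization pass
            combined = summarizer(combined, max_length=150, min_length=50,
                                  do_sample=False)[0]['summary_text']
        return combined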