divython committed on
Commit 943d5b9 · verified · 1 Parent(s): c64a626

Update app.py

Files changed (1)
  1. app.py +303 -78

app.py CHANGED
@@ -1,19 +1,32 @@
  import gradio as gr
- import pytube
- from transformers import pipeline
- import os
  import re

- # Initialize pipelines
- asr = pipeline("automatic-speech-recognition", model="openai/whisper-base", chunk_length_s=30)
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

  def extract_video_id(url):
      """Extract video ID from various YouTube URL formats"""
      patterns = [
          r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
          r'(?:embed\/)([0-9A-Za-z_-]{11})',
-         r'(?:v\/)([0-9A-Za-z_-]{11})'
      ]
      for pattern in patterns:
          match = re.search(pattern, url)
@@ -21,105 +34,317 @@ def extract_video_id(url):
              return match.group(1)
      return None

- def summarize_youtube(url):
      try:
-         # Clean up any existing audio file
-         if os.path.exists("audio.mp4"):
-             os.remove("audio.mp4")
-
-         # Create YouTube object with error handling
-         yt = pytube.YouTube(url, use_oauth=False, allow_oauth_cache=False)

-         # Get audio stream with better filtering
-         audio_streams = yt.streams.filter(only_audio=True, file_extension='mp4')
-         if not audio_streams:
-             # Fallback to any audio stream
-             audio_streams = yt.streams.filter(only_audio=True)

-         if not audio_streams:
-             return "❌ Error: No audio streams available", "Could not extract audio from video", "No summary available"

-         stream = audio_streams.first()

-         # Download with proper filename
-         audio_file = stream.download(filename="audio")
-
-         # Transcribe
-         result = asr(audio_file)
-         transcript = result["text"]

-         # Clean up audio file
-         if os.path.exists(audio_file):
-             os.remove(audio_file)

-         # Check transcript length for summarization
-         if len(transcript.split()) < 10:
-             return "❌ Error: Transcript too short", transcript, "Cannot summarize - transcript too brief"

-         # Summarize with better parameters
-         max_chunk = 1024  # BART's max input length
-         if len(transcript) > max_chunk:
-             # Split transcript into chunks if too long
-             words = transcript.split()
-             chunks = [' '.join(words[i:i+200]) for i in range(0, len(words), 200)]
              summaries = []

-             for chunk in chunks[:3]:  # Limit to first 3 chunks to avoid timeout
-                 if len(chunk.strip()) > 50:
-                     chunk_summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
-                     summaries.append(chunk_summary)

-             summary = " ".join(summaries)
          else:
-             summary = summarizer(transcript, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
-
-         # Create embed HTML
-         v_id = extract_video_id(url)
-         if v_id:
-             embed_html = f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{v_id}" frameborder="0" allowfullscreen></iframe>'
-         else:
-             embed_html = "❌ Could not extract video ID"
-
-         return embed_html, transcript, summary
-
-     except pytube.exceptions.RegexMatchError:
-         return "❌ Error: Invalid YouTube URL", "Please check the URL format", "No summary available"
-     except pytube.exceptions.VideoUnavailable:
-         return "❌ Error: Video unavailable", "Video may be private or deleted", "No summary available"
      except Exception as e:
-         return f"❌ Error: {str(e)}", "An error occurred during processing", "No summary available"

- # Build Gradio app
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("## 🎓 Multi-lingual YouTube Summarizer (Hindi / Hinglish / English)")
-     gr.Markdown("Enter a YouTube URL to get an AI-generated summary of the video content.")

      with gr.Row():
-         with gr.Column():
              url_input = gr.Textbox(
-                 label="YouTube URL",
                  placeholder="https://www.youtube.com/watch?v=...",
-                 lines=1
              )
-             btn = gr.Button("🚀 Summarize Video", variant="primary")

      with gr.Row():
          with gr.Column():
-             vid = gr.HTML(label="Video Player")
          with gr.Column():
-             with gr.Accordion("📝 Transcript", open=False):
-                 txt = gr.Textbox(label="Full Transcript", lines=10, max_lines=15)
-             summ = gr.Textbox(label="📋 Summary", lines=5)

-     btn.click(summarize_youtube, inputs=url_input, outputs=[vid, txt, summ])

-     # Add examples
      gr.Examples(
          examples=[
-             ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],  # Replace with actual examples
          ],
-         inputs=url_input
      )

  if __name__ == "__main__":
-     demo.launch(share=True)

  import gradio as gr
  import re
+ import requests
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+ from youtube_transcript_api import YouTubeTranscriptApi
+ import torch
+ import gc

+ # Optimize for HuggingFace Spaces - Use smaller models and efficient loading
+ print("🚀 Loading models for HuggingFace Spaces...")
+
+ # Use smaller, efficient models
+ @torch.no_grad()
+ def load_summarizer():
+     model_name = "facebook/bart-large-cnn"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
+     return pipeline("summarization", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
+
+ # Initialize summarizer
+ summarizer = load_summarizer()
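+ # Note: @torch.no_grad() above only wraps model *loading*; generation itself
+ # already runs gradient-free inside the pipeline. float16 halves GPU memory,
+ # while the CPU path keeps float32 (half precision is slow on most CPUs).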

  def extract_video_id(url):
      """Extract video ID from various YouTube URL formats"""
      patterns = [
          r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
          r'(?:embed\/)([0-9A-Za-z_-]{11})',
+         r'(?:v\/)([0-9A-Za-z_-]{11})',
+         r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})'
      ]
      for pattern in patterns:
          match = re.search(pattern, url)
          if match:
              return match.group(1)
      return None
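+ # Example: both URL styles resolve to the same ID:
+ #   extract_video_id("https://youtu.be/dQw4w9WgXcQ")                -> "dQw4w9WgXcQ"
+ #   extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") -> "dQw4w9WgXcQ"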
 
+ def get_youtube_transcript(video_id):
+     """Get transcript using YouTube Transcript API - Most reliable for HF Spaces"""
      try:
+         # Priority order for languages (Hindi, English variants)
+         language_codes = ['hi', 'en', 'en-IN', 'en-US', 'en-GB']

+         transcript_data = None
+         used_language = None

+         # Try each language
+         for lang_code in language_codes:
+             try:
+                 transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang_code])
+                 transcript_data = transcript_list
+                 used_language = lang_code
+                 break
+             except:
+                 continue

+         # If specific languages fail, try auto-generated
+         if not transcript_data:
+             try:
+                 transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
+                 transcript_data = transcript_list
+                 used_language = "auto-detected"
+             except Exception as e:
+                 return None, f"No transcript available: {str(e)}"

+         # Process transcript
+         if transcript_data:
+             transcript_text = ' '.join([item['text'].replace('\n', ' ') for item in transcript_data])
+             # Clean up common transcript artifacts
+             transcript_text = re.sub(r'\[.*?\]', '', transcript_text)  # Remove [Music], [Applause] etc.
+             transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()  # Clean whitespace
+
+             return transcript_text, f"Transcript found in: {used_language}"

+         return None, "No transcript data found"

+     except Exception as e:
+         return None, f"Transcript API Error: {str(e)}"
+
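+ # Example (the video must have captions or auto-generated subtitles):
+ #   text, status = get_youtube_transcript("dQw4w9WgXcQ")
+ #   status -> "Transcript found in: en" (or "auto-detected" via the fallback)
+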
+ def chunk_text_for_summarization(text, max_chunk_size=800):
+     """Split text into chunks for summarization"""
+     sentences = text.replace('।', '.').split('.')  # Handle Hindi sentences
+     chunks = []
+     current_chunk = ""
+
+     for sentence in sentences:
+         sentence = sentence.strip()
+         if not sentence:
+             continue
+
+         # Check if adding this sentence would exceed limit
+         if len(current_chunk) + len(sentence) + 1 < max_chunk_size:
+             current_chunk += sentence + ". "
+         else:
+             if current_chunk:
+                 chunks.append(current_chunk.strip())
+             current_chunk = sentence + ". "
+
+     # Add the last chunk
+     if current_chunk:
+         chunks.append(current_chunk.strip())
+
+     return chunks
+
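+ # Example: sentences are re-joined with ". " until a chunk would hit the limit,
+ # and the Devanagari danda is normalized to a period first:
+ #   chunk_text_for_summarization("Hello there. कैसे हैं।") -> ["Hello there. कैसे हैं."]
+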
+ def summarize_text_optimized(text):
+     """Optimized summarization for HuggingFace Spaces"""
+     if not text or len(text.strip()) < 100:
+         return "Text too short to summarize (minimum 100 characters required)"
+
+     try:
+         # Clean memory before processing
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         gc.collect()

+         # For very long texts, chunk them
+         if len(text) > 1500:
+             chunks = chunk_text_for_summarization(text, max_chunk_size=900)
              summaries = []

+             # Process first 3 chunks to avoid timeout
+             for i, chunk in enumerate(chunks[:3]):
+                 if len(chunk.strip()) < 50:
+                     continue
+
+                 try:
+                     summary = summarizer(
+                         chunk,
+                         max_length=120,
+                         min_length=30,
+                         do_sample=False,
+                         num_beams=2,  # Reduced for speed
+                         length_penalty=1.0
+                     )[0]["summary_text"]
+                     summaries.append(summary)
+                 except Exception as chunk_error:
+                     print(f"Error processing chunk {i}: {chunk_error}")
+                     continue

+             if summaries:
+                 combined_summary = " ".join(summaries)
+                 # If combined summary is still too long, summarize it again
+                 if len(combined_summary) > 600:
+                     try:
+                         final_summary = summarizer(
+                             combined_summary,
+                             max_length=200,
+                             min_length=80,
+                             do_sample=False,
+                             num_beams=2
+                         )[0]["summary_text"]
+                         return final_summary
+                     except:
+                         return combined_summary
+                 return combined_summary
+             else:
+                 return "Could not generate summary from chunks"
          else:
+             # For shorter texts, direct summarization
+             summary = summarizer(
+                 text,
+                 max_length=150,
+                 min_length=50,
+                 do_sample=False,
+                 num_beams=2,
+                 length_penalty=1.0
+             )[0]["summary_text"]
+             return summary
+
      except Exception as e:
+         return f"Summarization error: {str(e)}"
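+ # Sizing note: ~900-character chunks stay well under BART's 1024-token input
+ # limit (English text averages roughly 4 characters per token).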
 
+ def process_youtube_video(url):
+     """Main processing function optimized for HuggingFace Spaces"""
+
+     # Input validation
+     if not url or not url.strip():
+         return "❌ Please enter a YouTube URL", "", "No summary available"
+
+     # Extract video ID
+     video_id = extract_video_id(url.strip())
+     if not video_id:
+         return "❌ Invalid YouTube URL format", "Please check the URL format", "No summary available"
+
+     # Update progress
+     progress_msg = "🔍 Extracting video transcript..."
+
+     # Get transcript
+     transcript, status = get_youtube_transcript(video_id)
+
+     if not transcript:
+         return (
+             "❌ Could not extract transcript",
+             f"Status: {status}\n\nThis video might not have captions/subtitles available.",
+             "Cannot generate summary without transcript"
+         )
+
+     # Generate summary
+     progress_msg = "🤖 Generating AI summary..."
+     summary = summarize_text_optimized(transcript)
+
+     # Create video embed
+     embed_html = f'''
+     <div style="text-align: center;">
+         <iframe width="560" height="315"
+                 src="https://www.youtube.com/embed/{video_id}"
+                 frameborder="0"
+                 allowfullscreen
+                 style="max-width: 100%; border-radius: 10px;">
+         </iframe>
+     </div>
+     '''
+
+     # Format transcript info
+     transcript_info = f"""📊 Processing Status: ✅ Success
+ 🎯 Method: YouTube Transcript API
+ 🌐 Language: {status}
+ 📝 Transcript Length: {len(transcript)} characters
+ 📄 Word Count: ~{len(transcript.split())} words
+
+ 📋 Full Transcript:
+ {transcript}"""
+
+     return embed_html, transcript_info, summary
+
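+ # Local smoke test outside Gradio (assumes the video has captions):
+ #   html, info, summary = process_youtube_video("https://youtu.be/dQw4w9WgXcQ")
+ #   print(summary)
+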
+ # Custom CSS for better UI
+ custom_css = """
+ #component-0 {
+     max-width: 900px;
+     margin: auto;
+ }
+ .gradio-container {
+     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+ }
+ """
+
+ # Create Gradio Interface optimized for HuggingFace Spaces
+ with gr.Blocks(css=custom_css, title="YouTube Video Summarizer", theme=gr.themes.Soft()) as demo:
+     gr.HTML("""
+     <div style="text-align: center; padding: 20px;">
+         <h1>🎓 YouTube Video Summarizer</h1>
+         <p style="font-size: 18px; color: #666;">
+             AI-powered summarization for Hindi, Hinglish & English videos
+         </p>
+         <p style="color: #888;">
+             Optimized for HuggingFace Spaces • Uses YouTube Transcript API
+         </p>
+     </div>
+     """)

      with gr.Row():
+         with gr.Column(scale=2):
              url_input = gr.Textbox(
+                 label="📺 YouTube URL",
                  placeholder="https://www.youtube.com/watch?v=...",
+                 lines=1,
+                 info="Paste any YouTube video URL here"
+             )
+
+         with gr.Column(scale=1):
+             submit_btn = gr.Button(
+                 "🚀 Summarize Video",
+                 variant="primary",
+                 size="lg"
              )

+     # Results section
      with gr.Row():
          with gr.Column():
+             video_embed = gr.HTML(label="📺 Video Player")
+
          with gr.Column():
+             summary_output = gr.Textbox(
+                 label="📋 AI Summary",
+                 lines=8,
+                 max_lines=12,
+                 info="AI-generated summary of the video content"
+             )
+
+     # Expandable transcript section
+     with gr.Accordion("📝 Full Transcript & Details", open=False):
+         transcript_output = gr.Textbox(
+             label="Complete Transcript",
+             lines=15,
+             max_lines=25,
+             info="Full video transcript with processing details"
+         )

+     # Examples section
+     gr.HTML("<h3 style='margin-top: 30px;'>🎯 Try these examples:</h3>")

      gr.Examples(
          examples=[
+             ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
+             ["https://youtu.be/dQw4w9WgXcQ"],
          ],
+         inputs=url_input,
+         label="Sample URLs"
+     )
+
+     # Info section
+     with gr.Accordion("ℹ️ How it works", open=False):
+         gr.Markdown("""
+         ### 🔧 How this tool works:
+
+         1. **Extract Video ID**: Parses the YouTube URL to get the video identifier
+         2. **Fetch Transcript**: Uses YouTube Transcript API to get captions/subtitles
+         3. **AI Summarization**: Processes text through the BART model for intelligent summarization
+         4. **Multi-language Support**: Handles Hindi, Hinglish, and English content
+
+         ### 📋 Supported Languages:
+         - 🇮🇳 **Hindi**: Full support for Hindi captions
+         - 🌐 **Hinglish**: Mixed Hindi-English content
+         - 🇺🇸 **English**: All English variants
+
+         ### ⚡ Optimizations for HuggingFace Spaces:
+         - Efficient model loading with memory management
+         - Chunked processing for long videos
+         - GPU acceleration when available
+         - Automatic text cleanup and formatting
+
+         ### ⚠️ Limitations:
+         - Requires videos to have captions/subtitles
+         - Processing time depends on transcript length
+         - Very long videos are chunked to prevent timeouts
+         """)
+
+     # Event handlers
+     submit_btn.click(
+         fn=process_youtube_video,
+         inputs=[url_input],
+         outputs=[video_embed, transcript_output, summary_output]
+     )
+
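+     # Pressing Enter in the textbox triggers the same handler as the button.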
+     url_input.submit(
+         fn=process_youtube_video,
+         inputs=[url_input],
+         outputs=[video_embed, transcript_output, summary_output]
      )

+ # Launch configuration for HuggingFace Spaces
  if __name__ == "__main__":
+     demo.queue(concurrency_count=2)  # Limit concurrent users for stability
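+     # Note: concurrency_count and enable_queue are Gradio 3.x arguments; on
+     # Gradio 4+ this would be demo.queue(default_concurrency_limit=2), with
+     # no enable_queue flag passed to launch().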
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,  # No share link needed in HF Spaces
+         debug=False,  # Disable debug in production
+         enable_queue=True,
+         show_error=True
+     )