import os

import cv2
import gradio as gr
from dotenv import load_dotenv
import spaces

from main import (
    run,
    detect_scenes,
    extract_keyframes,
    generate_scene_caption,
    generate_video_summary,
    generate_video_summary_groq,
    vqa_matches,
    semantic_matches,
    remove_scenes,
)

# Load environment variables
load_dotenv()
if not os.getenv("HF_TOKEN"):
    raise ValueError("❌ Error: HF_TOKEN not found in .env file")


def process_video(video_path, query, progress=gr.Progress()):
    """Scene-filtering tab: remove scenes matching the query."""
    try:
        os.makedirs("outputs", exist_ok=True)
        output_path = os.path.join("outputs", "trimmed_video.mp4")

        # 1) Detect scenes
        progress(0.0, desc="Detecting scenes...")
        scenes = detect_scenes(video_path)

        # 2) Extract keyframes
        progress(0.2, desc="Extracting keyframes...")
        keyframes = extract_keyframes(video_path, scenes)

        # 3) Caption each keyframe
        progress(0.4, desc="Generating captions...")
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # 4) VQA + semantic filtering
        progress(0.6, desc="Analyzing scenes...")
        vqa_mask = vqa_matches(keyframes, query)
        sem_idxs, _ = semantic_matches(captions, query)

        # 5) Build removal list (union of VQA hits and semantic hits)
        to_remove = sorted({i for i, flag in enumerate(vqa_mask) if flag} | set(sem_idxs))

        # 6) Trim via ffmpeg
        progress(0.8, desc="Processing video...")
        if to_remove:
            remove_scenes(video_path, scenes, to_remove, output_path)

            # Verify the output video exists
            if not os.path.exists(output_path):
                return None, "❌ Error: Failed to create output video"

            # Check that the video is valid
            cap = cv2.VideoCapture(output_path)
            if not cap.isOpened():
                return None, "❌ Error: Generated video is invalid"
            cap.release()

            stats = [
                "✅ Processing complete!",
                f"📊 Total scenes: {len(scenes)}",
                f"🗑️ Scenes removed: {len(to_remove)}",
                f"🎬 Scenes kept: {len(scenes) - len(to_remove)}",
                "\n🔍 Scene captions:",
                *[f"[Scene {i}]: {caption}" for i, caption in enumerate(captions)],
            ]
            return output_path, "\n".join(stats)
        else:
            return None, "⚠️ No matching scenes found; no trimming done."
    except Exception as e:
        return None, f"❌ Error: {e}"


def generate_video_description(video_path, progress=gr.Progress()):
    """Video-description tab: full scene-by-scene summary."""
    try:
        progress(0.0, desc="Detecting scenes...")
        scenes = detect_scenes(video_path)

        progress(0.3, desc="Extracting keyframes...")
        keyframes = extract_keyframes(video_path, scenes)

        progress(0.6, desc="Captioning scenes...")
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # Build & return the summary paragraph
        summary = generate_video_summary(captions)
        return summary
    except Exception as e:
        return f"❌ Error: {e}"


@spaces.GPU
def get_frame_description(video_path, frame_number=None):
    """Frame-analysis tab: show scene descriptions.

    frame_number is unused; it defaults to None so the single-input
    click handler below can call this function.
    """
    try:
        # Get scenes and keyframes
        scenes = detect_scenes(video_path)
        keyframes = extract_keyframes(video_path, scenes)

        # Generate captions for all scenes
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # Format the output with timestamps
        output = []
        for i, ((start, end), caption) in enumerate(zip(scenes, captions)):
            start_time = start.get_seconds()
            end_time = end.get_seconds()
            output.append(f"Scene {i + 1} ({start_time:.1f}s - {end_time:.1f}s):\n{caption}\n")
        return "\n".join(output)
    except Exception as e:
        return f"❌ Error: {e}"


# ─── Gradio UI ────────────────────────────────────────────────────────────────
with gr.Blocks(theme=gr.themes.Soft(), css="""
    footer {visibility: hidden}
    .custom-footer {
        text-align: center;
        margin-top: 2em;
        margin-bottom: 1em;
        color: #666;
    }
    .description {
        color: #666;
        font-size: 0.9em;
        line-height: 1.5;
    }
    .tech-stack {
        background: var(--background-fill-secondary);
        padding: 1em;
        border-radius: 8px;
        margin: 1em 0;
        border: 1px solid var(--border-color-primary);
        color: var(--body-text-color);
    }
""") as demo:
    gr.Markdown("""
    # Videoxity

    A powerful playground for video analysis and manipulation using state-of-the-art Vision-Language models.

    This application demonstrates what modern computer vision and natural language processing models can do for video, and offers a foundation for developers to build on and optimize. Whether you're exploring scene detection, content filtering, or video summarization, Videoxity gives you the tools to experiment with video understanding.

    ⚠️ Note: This demo runs entirely on CPU. For faster processing, run it locally or duplicate the Space.
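
    To run it locally, clone the repository and create a `.env` file with your Hugging Face token; the app exits at startup if `HF_TOKEN` is missing. A minimal `.env` sketch (placeholder value):

    ```
    HF_TOKEN=your_hf_token_here
    ```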

    Technical Stack:
    - Scene Detection: PySceneDetect with ContentDetector
    - Vision Models: BLIP (Image Captioning & VQA)
    - Language Models: Groq LLM (Llama 3.1)
    - Video Processing: OpenCV & FFmpeg
    - Embeddings: BGE-Small for semantic search
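
    The same pipeline that powers the UI can be scripted directly. A minimal sketch, assuming the helpers exported by `main.py` and a hypothetical local file `input.mp4`:

    ```python
    from main import (
        detect_scenes,
        extract_keyframes,
        generate_scene_caption,
        generate_video_summary,
    )

    scenes = detect_scenes("input.mp4")                 # list of (start, end) scene boundaries
    keyframes = extract_keyframes("input.mp4", scenes)  # pairs whose second element is the frame image
    captions = [generate_scene_caption(f) for _, f in keyframes]  # one BLIP caption per scene
    print(generate_video_summary(captions))             # LLM-written narrative summary
    ```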
""") with gr.Tabs(): # 1) Scene Filtering with gr.TabItem("Frames to Cut"): gr.Markdown(""" ### Remove specific scenes from your video Upload a video and describe which scenes you want to remove. The BLIP Vision-Language model will analyze each scene and cut out the matching ones. Examples: - "Remove the part where there is a cat in the video" - "Cut out the scene where people are dancing" """) with gr.Row(): with gr.Column(): vid1 = gr.Video( label="Upload Video", format="mp4", interactive=True ) qry1 = gr.Textbox( label="Scenes to Remove", placeholder="e.g., 'Remove the part where there is a cat in the video'", lines=2 ) btn1 = gr.Button("Process Video", variant="primary") with gr.Column(): outVid = gr.Video( label="Processed Video", format="mp4", interactive=True ) outTxt = gr.Textbox(label="Results", lines=10) btn1.click( fn=process_video, inputs=[vid1, qry1], outputs=[outVid, outTxt] ) # 2) Video Description with gr.TabItem("Video Description"): gr.Markdown(""" ### Generate a comprehensive description of your video Get BLIP-generated scene descriptions and a Llama 3.1-powered narrative summary of your video. """) with gr.Row(): with gr.Column(): vid2 = gr.Video(label="Upload Video") btn2 = gr.Button("Generate Description", variant="primary") with gr.Column(): outDesc = gr.Textbox( label="Video Description", lines=15, show_copy_button=True ) btn2.click( fn=generate_video_description, inputs=[vid2], outputs=[outDesc] ) # 3) Frame Analysis with gr.TabItem("Frame Analysis"): gr.Markdown(""" ### Analyze scenes in your video Get detailed scene descriptions using BLIP's image captioning model, with precise timestamps for each scene. """) with gr.Row(): with gr.Column(): vid3 = gr.Video(label="Upload Video") btn3 = gr.Button("Analyze Scenes", variant="primary") with gr.Column(): outFrm = gr.Textbox( label="Scene Descriptions", lines=15, show_copy_button=True ) btn3.click( fn=get_frame_description, inputs=[vid3], outputs=[outFrm] ) # Add custom centered footer gr.Markdown(""" """, elem_classes=["custom-footer"]) if __name__ == "__main__": demo.launch(share=True, show_error=True, show_api=False)