import os
import cv2
import gradio as gr
from dotenv import load_dotenv
import spaces
from main import (
    run,
    detect_scenes,
    extract_keyframes,
    generate_scene_caption,
    generate_video_summary,
    generate_video_summary_groq,
    vqa_matches,
    semantic_matches,
    remove_scenes,
)

# Load environment variables
load_dotenv()
if not os.getenv("HF_TOKEN"):
    raise ValueError("❌ Error: HF_TOKEN not found in .env file")


def process_video(video_path, query, progress=gr.Progress()):
    """Scene-filtering tab: remove scenes matching the query."""
    try:
        os.makedirs("outputs", exist_ok=True)
        output_path = os.path.join("outputs", "trimmed_video.mp4")

        # 1) Detect scenes
        progress(0.0, desc="Detecting scenes...")
        scenes = detect_scenes(video_path)

        # 2) Extract keyframes
        progress(0.2, desc="Extracting keyframes...")
        keyframes = extract_keyframes(video_path, scenes)

        # 3) Caption each keyframe
        progress(0.4, desc="Generating captions...")
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # 4) VQA + semantic filtering
        # vqa_mask: per-scene booleans; sem_idxs: indices of semantically matching scenes
        progress(0.6, desc="Analyzing scenes...")
        vqa_mask = vqa_matches(keyframes, query)
        sem_idxs, _ = semantic_matches(captions, query)

        # 5) Build removal list (union of VQA hits and semantic matches)
        to_remove = sorted({i for i, flag in enumerate(vqa_mask) if flag} | set(sem_idxs))

        # 6) Trim via ffmpeg
        progress(0.8, desc="Processing video...")
        if to_remove:
            remove_scenes(video_path, scenes, to_remove, output_path)

            # Verify the output video exists
            if not os.path.exists(output_path):
                return None, "❌ Error: Failed to create output video"

            # Check that the generated video is readable
            cap = cv2.VideoCapture(output_path)
            if not cap.isOpened():
                return None, "❌ Error: Generated video is invalid"
            cap.release()

            stats = [
                "✅ Processing complete!",
                f"📊 Total scenes: {len(scenes)}",
                f"🗑️ Scenes removed: {len(to_remove)}",
                f"🎬 Scenes kept: {len(scenes) - len(to_remove)}",
                "\n🔍 Scene captions:",
                *[f"[Scene {i}]: {caption}" for i, caption in enumerate(captions)],
            ]
            return output_path, "\n".join(stats)
        else:
            return None, "⚠️ No matching scenes found; no trimming done."
    except Exception as e:
        return None, f"❌ Error: {e}"


def generate_video_description(video_path, progress=gr.Progress()):
    """Video-description tab: full scene-by-scene summary."""
    try:
        progress(0.0, desc="Detecting scenes...")
        scenes = detect_scenes(video_path)

        progress(0.3, desc="Extracting keyframes...")
        keyframes = extract_keyframes(video_path, scenes)

        progress(0.6, desc="Captioning scenes...")
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # Build and return the summary paragraph
        summary = generate_video_summary(captions)
        return summary
    except Exception as e:
        return f"❌ Error: {e}"


@spaces.GPU
def get_frame_description(video_path, frame_number):
    """Frame-analysis tab: show scene descriptions.

    Note: frame_number is currently unused; the full scene breakdown is returned.
    """
    try:
        # Get scenes and keyframes
        scenes = detect_scenes(video_path)
        keyframes = extract_keyframes(video_path, scenes)

        # Generate captions for all scenes
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # Format the output with timestamps
        output = []
        for i, ((start, end), caption) in enumerate(zip(scenes, captions)):
            start_time = start.get_seconds()
            end_time = end.get_seconds()
            output.append(f"Scene {i + 1} ({start_time:.1f}s - {end_time:.1f}s):\n{caption}\n")
        return "\n".join(output)
    except Exception as e:
        return f"❌ Error: {e}"


# ─── Gradio UI ────────────────────────────────────────────────────────────────
with gr.Blocks(theme=gr.themes.Soft(), css="""
footer {visibility: hidden}
.custom-footer {
    text-align: center;
    margin-top: 2em;
    margin-bottom: 1em;
    color: #666;
}
.description {
    color: #666;
    font-size: 0.9em;
    line-height: 1.5;
}
.tech-stack {
    background: var(--background-fill-secondary);
    padding: 1em;
    border-radius: 8px;
    margin: 1em 0;
    border: 1px solid var(--border-color-primary);
    color: var(--body-text-color);
}
""") as demo:
    gr.Markdown("""
    # Videoxity

    A powerful playground for video analysis and manipulation using state-of-the-art Vision-Language models.