Spaces:

fau
/

videoxity

Running

File size: 9,411 Bytes

import os
import cv2
import gradio as gr
from dotenv import load_dotenv
import spaces

from main import (
    run,
    detect_scenes,
    extract_keyframes,
    generate_scene_caption,
    generate_video_summary,
    generate_video_summary_groq,
    vqa_matches,
    semantic_matches,
    remove_scenes,
)

# Populate the process environment through python-dotenv, then refuse to start
# when the Hugging Face token is absent or empty.
load_dotenv()
if os.environ.get("HF_TOKEN", "") == "":
    raise ValueError("❌ Error: HF_TOKEN not found in .env file")


def process_video(video_path, query, progress=gr.Progress()):
    """Scene-filtering tab: remove the scenes that match ``query``.

    The pipeline detects scenes, extracts one keyframe per scene, captions
    each keyframe, and then combines the VQA mask and the semantic caption
    matches into the set of scene indices to cut.

    Args:
        video_path: Path of the uploaded video as handed over by the Gradio
            video component; falsy when nothing was uploaded.
        query: Free-text description of the scenes to remove.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        A ``(output_path, message)`` tuple. ``output_path`` is the trimmed
        video path on success and ``None`` otherwise; ``message`` is the
        human-readable status or error text shown in the results box.
        Exceptions raised by the pipeline are caught and reported through
        ``message`` rather than propagated.
    """
    # Reject unusable input up front instead of surfacing an opaque error
    # from deep inside the pipeline.
    if not video_path:
        return None, "⚠️ Please upload a video first."
    if not query or not query.strip():
        return None, "⚠️ Please describe the scenes to remove."

    try:
        os.makedirs("outputs", exist_ok=True)
        output_path = os.path.join("outputs", "trimmed_video.mp4")

        # 1) Detect scenes
        progress(0.0, desc="Detecting scenes...")
        scenes = detect_scenes(video_path)

        # 2) Extract keyframes
        progress(0.2, desc="Extracting keyframes...")
        keyframes = extract_keyframes(video_path, scenes)

        # 3) Caption each keyframe
        progress(0.4, desc="Generating captions...")
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # 4) VQA + semantic filtering
        progress(0.6, desc="Analyzing scenes...")
        vqa_mask = vqa_matches(keyframes, query)
        sem_idxs, _ = semantic_matches(captions, query)

        # 5) Build removal list: union of VQA hits and semantic hits.
        # NOTE(review): assumes both helpers index scenes in keyframe order.
        to_remove = sorted(
            {i for i, flag in enumerate(vqa_mask) if flag} | set(sem_idxs)
        )

        # 6) Trim via ffmpeg
        progress(0.8, desc="Processing video...")
        if not to_remove:
            return None, "⚠️ No matching scenes found; no trimming done."

        # The output path is fixed, so a file left over from a previous run
        # would make the existence check below pass even if trimming failed.
        if os.path.exists(output_path):
            os.remove(output_path)

        remove_scenes(video_path, scenes, to_remove, output_path)

        # Verify the output video
        if not os.path.exists(output_path):
            return None, "❌ Error: Failed to create output video"

        # Check that the video can be opened; always release the handle,
        # including on the failure path.
        capture = cv2.VideoCapture(output_path)
        try:
            is_valid = capture.isOpened()
        finally:
            capture.release()
        if not is_valid:
            return None, "❌ Error: Generated video is invalid"

        stats = [
            "✅ Processing complete!",
            f"📊 Total scenes: {len(scenes)}",
            f"🗑️ Scenes removed: {len(to_remove)}",
            f"🎬 Scenes kept: {len(scenes)-len(to_remove)}",
            "\n🔍 Scene captions:",
            *[f"[Scene {i}]: {caption}" for i, caption in enumerate(captions)],
        ]
        progress(1.0, desc="Done")
        return output_path, "\n".join(stats)
    except Exception as e:
        # UI boundary: report the failure in the results box instead of
        # letting the exception escape to Gradio.
        return None, f"❌ Error: {e}"


def generate_video_description(video_path, progress=gr.Progress()):
    """Video-description tab: caption every scene and summarise the result.

    Args:
        video_path: Path of the video to describe.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        The text produced by ``generate_video_summary`` for the scene
        captions, or an error string prefixed with "❌ Error:" when any
        stage raises.
    """
    try:
        progress(0.0, desc="Detecting scenes...")
        scene_list = detect_scenes(video_path)

        progress(0.3, desc="Extracting keyframes...")
        scene_frames = extract_keyframes(video_path, scene_list)

        progress(0.6, desc="Captioning scenes...")
        scene_captions = []
        for _, image in scene_frames:
            scene_captions.append(generate_scene_caption(image))

        # Hand the per-scene captions to the summariser and return its text.
        return generate_video_summary(scene_captions)
    except Exception as err:
        return f"❌ Error: {err}"


@spaces.GPU
def get_frame_description(video_path, frame_number=None):
    """Frame-analysis tab: list a timestamped caption for every scene.

    Args:
        video_path: Path of the video to analyse; falsy when nothing was
            uploaded.
        frame_number: Unused. Kept for backward compatibility and made
            optional because the UI event handler passes only the video, so
            a required second parameter made every call fail with a
            missing-argument error.

    Returns:
        A text block with one "Scene N (start - end)" entry per scene followed
        by its caption, or a status/error string when no video was provided
        or a stage raises.
    """
    if not video_path:
        return "⚠️ Please upload a video first."

    try:
        # Get scenes and keyframes
        scenes = detect_scenes(video_path)
        keyframes = extract_keyframes(video_path, scenes)

        # Generate captions for all scenes
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # Format the output with timestamps; each scene is a (start, end)
        # pair whose members expose get_seconds().
        output = []
        for i, ((start, end), caption) in enumerate(zip(scenes, captions)):
            start_time = start.get_seconds()
            end_time = end.get_seconds()
            output.append(
                f"Scene {i+1} ({start_time:.1f}s - {end_time:.1f}s):\n{caption}\n"
            )

        return "\n".join(output)
    except Exception as e:
        # UI boundary: show the failure in the output box.
        return f"❌ Error: {e}"

# ─── Gradio UI ────────────────────────────────────────────────────────────────

# Assemble the Gradio Blocks app bound to `demo`: Soft theme plus custom CSS
# that hides the default footer and styles the description, tech-stack and
# custom-footer sections used in the Markdown below.
with gr.Blocks(theme=gr.themes.Soft(), css="""
    footer {visibility: hidden}
    .custom-footer {
        text-align: center;
        margin-top: 2em;
        margin-bottom: 1em;
        color: #666;
    }
    .description {
        color: #666;
        font-size: 0.9em;
        line-height: 1.5;
    }
    .tech-stack {
        background: var(--background-fill-secondary);
        padding: 1em;
        border-radius: 8px;
        margin: 1em 0;
        border: 1px solid var(--border-color-primary);
        color: var(--body-text-color);
    }
""") as demo:
    # Page header: title, short description and the technical-stack summary.
    gr.Markdown("""
    # Videoxity
    
    A powerful playground for video analysis and manipulation using state-of-the-art Vision-Language models.
    
    <div class="description">
    This application demonstrates the capabilities of modern computer vision and natural language processing models in video processing, offering a foundation for developers to build upon and optimize. 
    Whether you're exploring scene detection, content filtering, or video summarization, Videoxity provides the tools to experiment with and enhance video understanding.
    
    ⚠️ Note: This demo is running entirely on CPU. For faster processing, either run it locally or duplicate the space.
    </div>
    
    <div class="tech-stack">
    <strong>Technical Stack:</strong>
    - Scene Detection: PySceneDetect with ContentDetector
    - Vision Models: BLIP (Image Captioning & VQA)
    - Language Models: Groq LLM (Llama 3.1)
    - Video Processing: OpenCV & FFmpeg
    - Embeddings: BGE-Small for semantic search
    </div>
    """)

    # Three tabs, one per handler defined above in this file.
    with gr.Tabs():
        # 1) Scene Filtering
        with gr.TabItem("Frames to Cut"):
            gr.Markdown("""
            ### Remove specific scenes from your video
            Upload a video and describe which scenes you want to remove. The BLIP Vision-Language model will analyze each scene and cut out the matching ones.
            
            Examples:
            - "Remove the part where there is a cat in the video"
            - "Cut out the scene where people are dancing"
            """)
            with gr.Row():
                # Left column: inputs (video + removal query) and the trigger.
                with gr.Column():
                    vid1 = gr.Video(
                        label="Upload Video",
                        format="mp4",
                        interactive=True
                    )
                    qry1 = gr.Textbox(
                        label="Scenes to Remove",
                        placeholder="e.g., 'Remove the part where there is a cat in the video'",
                        lines=2
                    )
                    btn1 = gr.Button("Process Video", variant="primary")
                # Right column: trimmed video and the textual results.
                with gr.Column():
                    outVid = gr.Video(
                        label="Processed Video",
                        format="mp4",
                        interactive=True
                    )
                    outTxt = gr.Textbox(label="Results", lines=10)
            # process_video receives (video, query) and fills (video, text).
            btn1.click(
                fn=process_video,
                inputs=[vid1, qry1],
                outputs=[outVid, outTxt]
            )

        # 2) Video Description
        with gr.TabItem("Video Description"):
            gr.Markdown("""
            ### Generate a comprehensive description of your video
            Get BLIP-generated scene descriptions and a Llama 3.1-powered narrative summary of your video.
            """)
            with gr.Row():
                with gr.Column():
                    vid2 = gr.Video(label="Upload Video")
                    btn2 = gr.Button("Generate Description", variant="primary")
                with gr.Column():
                    outDesc = gr.Textbox(
                        label="Video Description",
                        lines=15,
                        show_copy_button=True
                    )
            # generate_video_description receives the video and fills the text box.
            btn2.click(
                fn=generate_video_description,
                inputs=[vid2],
                outputs=[outDesc]
            )

        # 3) Frame Analysis
        with gr.TabItem("Frame Analysis"):
            gr.Markdown("""
            ### Analyze scenes in your video
            Get detailed scene descriptions using BLIP's image captioning model, with precise timestamps for each scene.
            """)
            with gr.Row():
                with gr.Column():
                    vid3 = gr.Video(label="Upload Video")
                    btn3 = gr.Button("Analyze Scenes", variant="primary")
                with gr.Column():
                    outFrm = gr.Textbox(
                        label="Scene Descriptions",
                        lines=15,
                        show_copy_button=True
                    )
            # Only the video is passed as input here, so the handler must be
            # callable with a single argument.
            btn3.click(
                fn=get_frame_description,
                inputs=[vid3],
                outputs=[outFrm]
            )

    # Add custom centered footer
    gr.Markdown("""
    <div class="custom-footer">
    Made with ❤️
    </div>
    """, elem_classes=["custom-footer"])

if __name__ == "__main__":
    # Start the app only when this file is executed as a script.
    launch_options = {
        "share": True,
        "show_error": True,
        "show_api": False,
    }
    demo.launch(**launch_options)