# videoxity/app.py
import os
import cv2
import gradio as gr
from dotenv import load_dotenv
import spaces
from main import (
    detect_scenes,
    extract_keyframes,
    generate_scene_caption,
    generate_video_summary,
    vqa_matches,
    semantic_matches,
    remove_scenes,
)
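# Expected behavior of the main.py helpers, inferred from their use below
# (main.py is the source of truth):
#   detect_scenes(path)               -> list of (start, end) PySceneDetect FrameTimecode pairs
#   extract_keyframes(path, scenes)   -> one (index, frame) pair per scene
#   generate_scene_caption(frame)     -> BLIP caption string for a keyframe
#   generate_video_summary(captions)  -> narrative summary string (Groq Llama 3.1)
#   vqa_matches(keyframes, query)     -> per-scene boolean mask from BLIP VQA
#   semantic_matches(captions, query) -> (matching indices, extra) tuple; only the indices are used here
#   remove_scenes(path, scenes, idxs, out_path) -> writes the trimmed video via ffmpeg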
# Load environment variables
load_dotenv()
if not os.getenv("HF_TOKEN"):
raise ValueError("❌ Error: HF_TOKEN not found in .env file")
def process_video(video_path, query, progress=gr.Progress()):
"""Scene‐filtering tab: remove scenes matching the query."""
try:
os.makedirs("outputs", exist_ok=True)
output_path = os.path.join("outputs", "trimmed_video.mp4")
# 1) Detect scenes
progress(0.0, desc="Detecting scenes...")
scenes = detect_scenes(video_path)
# 2) Extract keyframes
progress(0.2, desc="Extracting keyframes...")
keyframes = extract_keyframes(video_path, scenes)
# 3) Caption each keyframe
progress(0.4, desc="Generating captions...")
captions = [generate_scene_caption(frame) for _, frame in keyframes]
# 4) VQA + semantic filtering
progress(0.6, desc="Analyzing scenes...")
vqa_mask = vqa_matches(keyframes, query)
        sem_idxs, _ = semantic_matches(captions, query)
# 5) Build removal list
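        # Union the scenes flagged by the VQA boolean mask with those found via semantic caption search.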
to_remove = sorted({i for i, flag in enumerate(vqa_mask) if flag} | set(sem_idxs))
# 6) Trim via ffmpeg
progress(0.8, desc="Processing video...")
if to_remove:
remove_scenes(video_path, scenes, to_remove, output_path)
# Verify the output video
if not os.path.exists(output_path):
return None, "❌ Error: Failed to create output video"
# Check if video is valid
cap = cv2.VideoCapture(output_path)
if not cap.isOpened():
return None, "❌ Error: Generated video is invalid"
cap.release()
stats = [
"✅ Processing complete!",
f"📊 Total scenes: {len(scenes)}",
f"🗑️ Scenes removed: {len(to_remove)}",
f"🎬 Scenes kept: {len(scenes)-len(to_remove)}",
"\n🔍 Scene captions:",
*[f"[Scene {i}]: {cap}" for i, cap in enumerate(captions)]
]
return output_path, "\n".join(stats)
else:
return None, "⚠️ No matching scenes found; no trimming done."
except Exception as e:
return None, f"❌ Error: {e}"
def generate_video_description(video_path, progress=gr.Progress()):
"""Video‐description tab: full scene‐by‐scene summary."""
try:
progress(0.0, desc="Detecting scenes...")
scenes = detect_scenes(video_path)
progress(0.3, desc="Extracting keyframes...")
keyframes = extract_keyframes(video_path, scenes)
progress(0.6, desc="Captioning scenes...")
captions = [generate_scene_caption(frame) for _, frame in keyframes]
        # Build and return the summary paragraph
summary = generate_video_summary(captions)
return summary
except Exception as e:
return f"❌ Error: {e}"
@spaces.GPU
def get_frame_description(video_path):
    """Frame-analysis tab: caption every detected scene with timestamps."""
try:
# Get scenes and keyframes
scenes = detect_scenes(video_path)
keyframes = extract_keyframes(video_path, scenes)
# Generate captions for all scenes
captions = [generate_scene_caption(frame) for _, frame in keyframes]
# Format the output with timestamps
output = []
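        # scenes is a list of (start, end) FrameTimecode pairs; get_seconds() yields float seconds.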
for i, ((start, end), caption) in enumerate(zip(scenes, captions)):
start_time = start.get_seconds()
end_time = end.get_seconds()
output.append(f"Scene {i+1} ({start_time:.1f}s - {end_time:.1f}s):\n{caption}\n")
return "\n".join(output)
except Exception as e:
return f"❌ Error: {e}"
# ─── Gradio UI ────────────────────────────────────────────────────────────────
with gr.Blocks(theme=gr.themes.Soft(), css="""
footer {visibility: hidden}
.custom-footer {
text-align: center;
margin-top: 2em;
margin-bottom: 1em;
color: #666;
}
.description {
color: #666;
font-size: 0.9em;
line-height: 1.5;
}
.tech-stack {
background: var(--background-fill-secondary);
padding: 1em;
border-radius: 8px;
margin: 1em 0;
border: 1px solid var(--border-color-primary);
color: var(--body-text-color);
}
""") as demo:
gr.Markdown("""
# Videoxity
A powerful playground for video analysis and manipulation using state-of-the-art Vision-Language models.
<div class="description">
This application demonstrates the capabilities of modern computer vision and natural language processing models in video processing, offering a foundation for developers to build upon and optimize.
Whether you're exploring scene detection, content filtering, or video summarization, Videoxity provides the tools to experiment with and enhance video understanding.
⚠️ Note: This demo is running entirely on CPU. For faster processing, either run it locally or duplicate the space.
</div>
<div class="tech-stack">
<strong>Technical Stack:</strong>
- Scene Detection: PySceneDetect with ContentDetector
- Vision Models: BLIP (Image Captioning & VQA)
- Language Models: Groq LLM (Llama 3.1)
- Video Processing: OpenCV & FFmpeg
- Embeddings: BGE-Small for semantic search
</div>
""")
with gr.Tabs():
# 1) Scene Filtering
with gr.TabItem("Frames to Cut"):
gr.Markdown("""
### Remove specific scenes from your video
Upload a video and describe which scenes you want to remove. The BLIP Vision-Language model will analyze each scene and cut out the matching ones.
Examples:
- "Remove the part where there is a cat in the video"
- "Cut out the scene where people are dancing"
""")
with gr.Row():
with gr.Column():
vid1 = gr.Video(
label="Upload Video",
format="mp4",
interactive=True
)
qry1 = gr.Textbox(
label="Scenes to Remove",
placeholder="e.g., 'Remove the part where there is a cat in the video'",
lines=2
)
btn1 = gr.Button("Process Video", variant="primary")
with gr.Column():
outVid = gr.Video(
label="Processed Video",
format="mp4",
                        interactive=False
)
outTxt = gr.Textbox(label="Results", lines=10)
btn1.click(
fn=process_video,
inputs=[vid1, qry1],
outputs=[outVid, outTxt]
)
# 2) Video Description
with gr.TabItem("Video Description"):
gr.Markdown("""
### Generate a comprehensive description of your video
Get BLIP-generated scene descriptions and a Llama 3.1-powered narrative summary of your video.
""")
with gr.Row():
with gr.Column():
vid2 = gr.Video(label="Upload Video")
btn2 = gr.Button("Generate Description", variant="primary")
with gr.Column():
outDesc = gr.Textbox(
label="Video Description",
lines=15,
show_copy_button=True
)
btn2.click(
fn=generate_video_description,
inputs=[vid2],
outputs=[outDesc]
)
# 3) Frame Analysis
with gr.TabItem("Frame Analysis"):
gr.Markdown("""
### Analyze scenes in your video
Get detailed scene descriptions using BLIP's image captioning model, with precise timestamps for each scene.
""")
with gr.Row():
with gr.Column():
vid3 = gr.Video(label="Upload Video")
btn3 = gr.Button("Analyze Scenes", variant="primary")
with gr.Column():
outFrm = gr.Textbox(
label="Scene Descriptions",
lines=15,
show_copy_button=True
)
btn3.click(
fn=get_frame_description,
inputs=[vid3],
outputs=[outFrm]
)
# Add custom centered footer
gr.Markdown("""
<div class="custom-footer">
Made with ❤️
</div>
""", elem_classes=["custom-footer"])
if __name__ == "__main__":
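    # share=True creates a temporary public gradio.live link when run locally; Spaces ignores it.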
demo.launch(share=True, show_error=True, show_api=False)