import os
import cv2
import gradio as gr
from dotenv import load_dotenv
import spaces
from main import (
    run,
    detect_scenes,
    extract_keyframes,
    generate_scene_caption,
    generate_video_summary,
    generate_video_summary_groq,
    vqa_matches,
    semantic_matches,
    remove_scenes,
)
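
# NOTE: Expected shapes of the helpers imported from main, inferred from how they
# are used below (assumptions, not verified against main.py):
#   detect_scenes(video_path)             -> list of (start, end) scene boundaries
#                                            (PySceneDetect-style, exposing .get_seconds())
#   extract_keyframes(video_path, scenes) -> list of (scene index or timestamp, frame) pairs
#   generate_scene_caption(frame)         -> caption string for one keyframe
#   vqa_matches(keyframes, query)         -> list of bools, one per scene
#   semantic_matches(captions, query)     -> (list of matching scene indices, scores)
#   remove_scenes(video_path, scenes, indices, output_path) -> writes the trimmed video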

# Load environment variables
load_dotenv()

if not os.getenv("HF_TOKEN"):
    raise ValueError("❌ Error: HF_TOKEN not found in .env file")
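
# (Assumption) If the Groq-backed summary helper generate_video_summary_groq is
# switched in, a GROQ_API_KEY environment variable would likely be required as
# well; only HF_TOKEN is validated here.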


def process_video(video_path, query, progress=gr.Progress()):
    """Scene-filtering tab: remove scenes matching the query."""
    try:
        os.makedirs("outputs", exist_ok=True)
        output_path = os.path.join("outputs", "trimmed_video.mp4")

        # 1) Detect scenes
        progress(0.0, desc="Detecting scenes...")
        scenes = detect_scenes(video_path)

        # 2) Extract keyframes
        progress(0.2, desc="Extracting keyframes...")
        keyframes = extract_keyframes(video_path, scenes)

        # 3) Caption each keyframe
        progress(0.4, desc="Generating captions...")
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # 4) VQA + semantic filtering
        progress(0.6, desc="Analyzing scenes...")
        vqa_mask = vqa_matches(keyframes, query)
        sem_idxs, _ = semantic_matches(captions, query)

        # 5) Build removal list
        to_remove = sorted({i for i, flag in enumerate(vqa_mask) if flag} | set(sem_idxs))
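        # Worked example: vqa_mask = [True, False, True] and sem_idxs = [1]
        # give to_remove = [0, 1, 2]: a scene is cut if either the VQA check
        # or the semantic caption match flags it.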

        # 6) Trim via ffmpeg
        progress(0.8, desc="Processing video...")
        if to_remove:
            remove_scenes(video_path, scenes, to_remove, output_path)

            # Verify the output video exists
            if not os.path.exists(output_path):
                return None, "❌ Error: Failed to create output video"

            # Check that the output video can actually be opened
            cap = cv2.VideoCapture(output_path)
            if not cap.isOpened():
                cap.release()
                return None, "❌ Error: Generated video is invalid"
            cap.release()

            stats = [
                "✅ Processing complete!",
                f"📊 Total scenes: {len(scenes)}",
                f"🗑️ Scenes removed: {len(to_remove)}",
                f"🎬 Scenes kept: {len(scenes) - len(to_remove)}",
                "\n🔍 Scene captions:",
                *[f"[Scene {i}]: {caption}" for i, caption in enumerate(captions)],
            ]
            return output_path, "\n".join(stats)
        else:
            return None, "⚠️ No matching scenes found; no trimming done."
    except Exception as e:
        return None, f"❌ Error: {e}"


def generate_video_description(video_path, progress=gr.Progress()):
    """Video-description tab: full scene-by-scene summary."""
    try:
        progress(0.0, desc="Detecting scenes...")
        scenes = detect_scenes(video_path)

        progress(0.3, desc="Extracting keyframes...")
        keyframes = extract_keyframes(video_path, scenes)

        progress(0.6, desc="Captioning scenes...")
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # Build & return the summary paragraph
        summary = generate_video_summary(captions)
        return summary
    except Exception as e:
        return f"❌ Error: {e}"


def get_frame_description(video_path, frame_number=None):
    """Frame-analysis tab: show scene descriptions with timestamps.

    frame_number is currently unused; the UI only passes the video, so it
    defaults to None to keep the click handler's single input valid.
    """
    try:
        # Get scenes and keyframes
        scenes = detect_scenes(video_path)
        keyframes = extract_keyframes(video_path, scenes)

        # Generate captions for all scenes
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # Format the output with timestamps
        output = []
        for i, ((start, end), caption) in enumerate(zip(scenes, captions)):
            start_time = start.get_seconds()
            end_time = end.get_seconds()
            output.append(f"Scene {i + 1} ({start_time:.1f}s - {end_time:.1f}s):\n{caption}\n")
        return "\n".join(output)
    except Exception as e:
        return f"❌ Error: {e}"


# ─── Gradio UI ────────────────────────────────────────────────────────────────
with gr.Blocks(theme=gr.themes.Soft(), css="""
    footer {visibility: hidden}
    .custom-footer {
        text-align: center;
        margin-top: 2em;
        margin-bottom: 1em;
        color: #666;
    }
    .description {
        color: #666;
        font-size: 0.9em;
        line-height: 1.5;
    }
    .tech-stack {
        background: var(--background-fill-secondary);
        padding: 1em;
        border-radius: 8px;
        margin: 1em 0;
        border: 1px solid var(--border-color-primary);
        color: var(--body-text-color);
    }
""") as demo:
gr.Markdown(""" | |
# Videoxity | |
A powerful playground for video analysis and manipulation using state-of-the-art Vision-Language models. | |
<div class="description"> | |
This application demonstrates the capabilities of modern computer vision and natural language processing models in video processing, offering a foundation for developers to build upon and optimize. | |
Whether you're exploring scene detection, content filtering, or video summarization, Videoxity provides the tools to experiment with and enhance video understanding. | |
⚠️ Note: This demo is running entirely on CPU. For faster processing, either run it locally or duplicate the space. | |
</div> | |
<div class="tech-stack"> | |
<strong>Technical Stack:</strong> | |
- Scene Detection: PySceneDetect with ContentDetector | |
- Vision Models: BLIP (Image Captioning & VQA) | |
- Language Models: Groq LLM (Llama 3.1) | |
- Video Processing: OpenCV & FFmpeg | |
- Embeddings: BGE-Small for semantic search | |
</div> | |
""") | |

    with gr.Tabs():
        # 1) Scene Filtering
        with gr.TabItem("Frames to Cut"):
            gr.Markdown("""
            ### Remove specific scenes from your video
            Upload a video and describe which scenes you want to remove. The BLIP vision-language model will analyze each scene and cut out the matching ones.

            Examples:
            - "Remove the part where there is a cat in the video"
            - "Cut out the scene where people are dancing"
            """)
            with gr.Row():
                with gr.Column():
                    vid1 = gr.Video(
                        label="Upload Video",
                        format="mp4",
                        interactive=True
                    )
                    qry1 = gr.Textbox(
                        label="Scenes to Remove",
                        placeholder="e.g., 'Remove the part where there is a cat in the video'",
                        lines=2
                    )
                    btn1 = gr.Button("Process Video", variant="primary")
                with gr.Column():
                    outVid = gr.Video(
                        label="Processed Video",
                        format="mp4",
                        interactive=True
                    )
                    outTxt = gr.Textbox(label="Results", lines=10)

            btn1.click(
                fn=process_video,
                inputs=[vid1, qry1],
                outputs=[outVid, outTxt]
            )

        # 2) Video Description
        with gr.TabItem("Video Description"):
            gr.Markdown("""
            ### Generate a comprehensive description of your video
            Get BLIP-generated scene descriptions and a Llama 3.1-powered narrative summary of your video.
            """)
            with gr.Row():
                with gr.Column():
                    vid2 = gr.Video(label="Upload Video")
                    btn2 = gr.Button("Generate Description", variant="primary")
                with gr.Column():
                    outDesc = gr.Textbox(
                        label="Video Description",
                        lines=15,
                        show_copy_button=True
                    )

            btn2.click(
                fn=generate_video_description,
                inputs=[vid2],
                outputs=[outDesc]
            )

        # 3) Frame Analysis
        with gr.TabItem("Frame Analysis"):
            gr.Markdown("""
            ### Analyze scenes in your video
            Get detailed scene descriptions using BLIP's image captioning model, with precise timestamps for each scene.
            """)
            with gr.Row():
                with gr.Column():
                    vid3 = gr.Video(label="Upload Video")
                    btn3 = gr.Button("Analyze Scenes", variant="primary")
                with gr.Column():
                    outFrm = gr.Textbox(
                        label="Scene Descriptions",
                        lines=15,
                        show_copy_button=True
                    )

            btn3.click(
                fn=get_frame_description,
                inputs=[vid3],
                outputs=[outFrm]
            )

    # Custom centered footer
    gr.Markdown("""
    <div class="custom-footer">
    Made with ❤️
    </div>
    """, elem_classes=["custom-footer"])


if __name__ == "__main__":
    demo.launch(share=True, show_error=True, show_api=False)
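
# To run locally (assuming the usual Spaces layout, with main.py and a
# requirements.txt next to this file -- file names are assumptions):
#   pip install -r requirements.txt
#   echo "HF_TOKEN=<your Hugging Face token>" > .env
#   python app.py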