# videoxity/app.py
import os
import cv2
import gradio as gr
from dotenv import load_dotenv
import spaces
from main import (
    detect_scenes,
    extract_keyframes,
    generate_scene_caption,
    generate_video_summary,
    vqa_matches,
    semantic_matches,
    remove_scenes,
)
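# Expected behavior of the main.py helpers, inferred from their use below
# (main.py is the source of truth):
#   detect_scenes(path)               -> list of (start, end) PySceneDetect FrameTimecode pairs
#   extract_keyframes(path, scenes)   -> one (index, frame) pair per scene
#   generate_scene_caption(frame)     -> BLIP caption string for a keyframe
#   generate_video_summary(captions)  -> narrative summary string (Groq Llama 3.1)
#   vqa_matches(keyframes, query)     -> per-scene boolean mask from BLIP VQA
#   semantic_matches(captions, query) -> (matching indices, extra) tuple; only the indices are used here
#   remove_scenes(path, scenes, idxs, out_path) -> writes the trimmed video via ffmpeg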
# Load environment variables
load_dotenv()
if not os.getenv("HF_TOKEN"):
raise ValueError("❌ Error: HF_TOKEN not found in .env file")
def process_video(video_path, query, progress=gr.Progress()):
"""Scene‐filtering tab: remove scenes matching the query."""
try:
os.makedirs("outputs", exist_ok=True)
output_path = os.path.join("outputs", "trimmed_video.mp4")
# 1) Detect scenes
progress(0.0, desc="Detecting scenes...")
scenes = detect_scenes(video_path)
# 2) Extract keyframes
progress(0.2, desc="Extracting keyframes...")
keyframes = extract_keyframes(video_path, scenes)
# 3) Caption each keyframe
progress(0.4, desc="Generating captions...")
captions = [generate_scene_caption(frame) for _, frame in keyframes]
# 4) VQA + semantic filtering
progress(0.6, desc="Analyzing scenes...")
vqa_mask = vqa_matches(keyframes, query)
        sem_idxs, _ = semantic_matches(captions, query)
# 5) Build removal list
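        # Union the scenes flagged by the VQA boolean mask with those found via semantic caption search.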
to_remove = sorted({i for i, flag in enumerate(vqa_mask) if flag} | set(sem_idxs))
# 6) Trim via ffmpeg
progress(0.8, desc="Processing video...")
if to_remove:
remove_scenes(video_path, scenes, to_remove, output_path)
# Verify the output video
if not os.path.exists(output_path):
return None, "❌ Error: Failed to create output video"
# Check if video is valid
cap = cv2.VideoCapture(output_path)
if not cap.isOpened():
return None, "❌ Error: Generated video is invalid"
cap.release()
stats = [
"✅ Processing complete!",
f"📊 Total scenes: {len(scenes)}",
f"🗑️ Scenes removed: {len(to_remove)}",
f"🎬 Scenes kept: {len(scenes)-len(to_remove)}",
"\n🔍 Scene captions:",
*[f"[Scene {i}]: {cap}" for i, cap in enumerate(captions)]
]
return output_path, "\n".join(stats)
else:
return None, "⚠️ No matching scenes found; no trimming done."
except Exception as e:
return None, f"❌ Error: {e}"
def generate_video_description(video_path, progress=gr.Progress()):
"""Video‐description tab: full scene‐by‐scene summary."""
try:
progress(0.0, desc="Detecting scenes...")
scenes = detect_scenes(video_path)
progress(0.3, desc="Extracting keyframes...")
keyframes = extract_keyframes(video_path, scenes)
progress(0.6, desc="Captioning scenes...")
captions = [generate_scene_caption(frame) for _, frame in keyframes]
        # Build and return the summary paragraph
summary = generate_video_summary(captions)
return summary
except Exception as e:
return f"❌ Error: {e}"
@spaces.GPU
def get_frame_description(video_path):
    """Frame-analysis tab: caption every detected scene with timestamps."""
try:
# Get scenes and keyframes
scenes = detect_scenes(video_path)
keyframes = extract_keyframes(video_path, scenes)
# Generate captions for all scenes
captions = [generate_scene_caption(frame) for _, frame in keyframes]
# Format the output with timestamps
output = []
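        # scenes is a list of (start, end) FrameTimecode pairs; get_seconds() yields float seconds.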
for i, ((start, end), caption) in enumerate(zip(scenes, captions)):
start_time = start.get_seconds()
end_time = end.get_seconds()
output.append(f"Scene {i+1} ({start_time:.1f}s - {end_time:.1f}s):\n{caption}\n")
return "\n".join(output)
except Exception as e:
return f"❌ Error: {e}"
# ─── Gradio UI ────────────────────────────────────────────────────────────────
with gr.Blocks(theme=gr.themes.Soft(), css="""
footer {visibility: hidden}
.custom-footer {
text-align: center;
margin-top: 2em;
margin-bottom: 1em;
color: #666;
}
.description {
color: #666;
font-size: 0.9em;
line-height: 1.5;
}
.tech-stack {
background: var(--background-fill-secondary);
padding: 1em;
border-radius: 8px;
margin: 1em 0;
border: 1px solid var(--border-color-primary);
color: var(--body-text-color);
}
""") as demo:
gr.Markdown("""
# Videoxity
A powerful playground for video analysis and manipulation using state-of-the-art Vision-Language models.
<div class="description">
This application demonstrates the capabilities of modern computer vision and natural language processing models in video processing, offering a foundation for developers to build upon and optimize.
Whether you're exploring scene detection, content filtering, or video summarization, Videoxity provides the tools to experiment with and enhance video understanding.
⚠️ Note: This demo is running entirely on CPU. For faster processing, either run it locally or duplicate the space.
</div>
<div class="tech-stack">
<strong>Technical Stack:</strong>
- Scene Detection: PySceneDetect with ContentDetector
- Vision Models: BLIP (Image Captioning & VQA)
- Language Models: Groq LLM (Llama 3.1)
- Video Processing: OpenCV & FFmpeg
- Embeddings: BGE-Small for semantic search
</div>
""")
with gr.Tabs():
# 1) Scene Filtering
with gr.TabItem("Frames to Cut"):
gr.Markdown("""
### Remove specific scenes from your video
Upload a video and describe which scenes you want to remove. The BLIP Vision-Language model will analyze each scene and cut out the matching ones.
Examples:
- "Remove the part where there is a cat in the video"
- "Cut out the scene where people are dancing"
""")
with gr.Row():
with gr.Column():
vid1 = gr.Video(
label="Upload Video",
format="mp4",
interactive=True
)
qry1 = gr.Textbox(
label="Scenes to Remove",
placeholder="e.g., 'Remove the part where there is a cat in the video'",
lines=2
)
btn1 = gr.Button("Process Video", variant="primary")
with gr.Column():
outVid = gr.Video(
label="Processed Video",
format="mp4",
                        interactive=False
)
outTxt = gr.Textbox(label="Results", lines=10)
btn1.click(
fn=process_video,
inputs=[vid1, qry1],
outputs=[outVid, outTxt]
)
# 2) Video Description
with gr.TabItem("Video Description"):
gr.Markdown("""
### Generate a comprehensive description of your video
Get BLIP-generated scene descriptions and a Llama 3.1-powered narrative summary of your video.
""")
with gr.Row():
with gr.Column():
vid2 = gr.Video(label="Upload Video")
btn2 = gr.Button("Generate Description", variant="primary")
with gr.Column():
outDesc = gr.Textbox(
label="Video Description",
lines=15,
show_copy_button=True
)
btn2.click(
fn=generate_video_description,
inputs=[vid2],
outputs=[outDesc]
)
# 3) Frame Analysis
with gr.TabItem("Frame Analysis"):
gr.Markdown("""
### Analyze scenes in your video
Get detailed scene descriptions using BLIP's image captioning model, with precise timestamps for each scene.
""")
with gr.Row():
with gr.Column():
vid3 = gr.Video(label="Upload Video")
btn3 = gr.Button("Analyze Scenes", variant="primary")
with gr.Column():
outFrm = gr.Textbox(
label="Scene Descriptions",
lines=15,
show_copy_button=True
)
btn3.click(
fn=get_frame_description,
inputs=[vid3],
outputs=[outFrm]
)
# Add custom centered footer
gr.Markdown("""
<div class="custom-footer">
Made with ❤️
</div>
""", elem_classes=["custom-footer"])
if __name__ == "__main__":
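    # share=True creates a temporary public gradio.live link when run locally; Spaces ignores it.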
demo.launch(share=True, show_error=True, show_api=False)