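"""Videoxity main pipeline.

Detects scenes in a video, captions a keyframe from each scene, summarizes the
video with a Groq-hosted LLM, and trims out the scenes that match a
natural-language filtering query.

Usage:
    python main.py <video.mp4> "your query here"
"""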
import ast
import os
import sys
import cv2
import subprocess
from tqdm import tqdm
from PIL import Image
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from transformers import pipeline
from scenedetect import SceneManager, open_video, ContentDetector
from sentence_transformers import SentenceTransformer, util
# ─── 1. AUTH & MODELS ────────────────────────────────────────────────────────────
# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not HF_TOKEN:
    print("❌ Error: HF_TOKEN not found in .env file")
    sys.exit(1)
if not GROQ_API_KEY:
    print("❌ Error: GROQ_API_KEY not found in .env file")
    sys.exit(1)
# Initialize models with proper configurations
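# BLIP captioning model: produces a short description of each keyframe.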
captioner = pipeline(
"image-to-text",
model="Salesforce/blip-image-captioning-base",
device="cpu"
)
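# BLIP VQA model: answers yes/no questions about keyframes (see vqa_matches).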
vl_pipeline = pipeline(
"visual-question-answering",
model="Salesforce/blip-vqa-base",
device="cpu"
)
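# GPT-2 text generator for elaborating captions; not called by the main pipeline.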
elaborator = pipeline(
"text-generation",
model="gpt2-medium",
device="cpu",
max_new_tokens=500, # Use max_new_tokens instead of max_length
do_sample=True,
top_p=0.9,
temperature=0.7
)
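# Sentence embedder for semantic caption/query matching (see semantic_matches).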
embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")
# ─── 2. HELPERS ──────────────────────────────────────────────────────────────────
def run_ffmpeg(cmd):
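    """Run ffmpeg with shared flags prepended; print stderr and exit on failure."""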
full = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"] + cmd
p = subprocess.Popen(full, stderr=subprocess.PIPE)
_, err = p.communicate()
if p.returncode != 0:
print("❌ FFmpeg error:\n", err.decode())
sys.exit(1)
# ─── 3. SCENE DETECTION & KEYFRAMES ──────────────────────────────────────────────
def detect_scenes(video_path, thresh=15.0):
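    """Detect scene boundaries with PySceneDetect's ContentDetector and return the scene list."""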
v = open_video(video_path)
mgr = SceneManager()
mgr.add_detector(ContentDetector(threshold=thresh))
mgr.detect_scenes(v)
return mgr.get_scene_list()
def get_removal_indices_groq(captions, query):
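    """Ask the Groq LLM which caption indexes (1-based) match the removal query."""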
llm = ChatGroq(
model="llama-3.1-8b-instant",
temperature=0.2,
max_tokens=500
)
prompt = ChatPromptTemplate.from_messages([
(
"system",
"You are a helpful assistant for video analysis. The user will give you a list of scene captions, "
"each labeled with an index like [1], [2], ..., and a filtering instruction like 'remove food scenes'.\n\n"
"Return ONLY the list of indexes that should be removed — e.g., [2, 5, 9]\n"
"⚠️ Do not explain, describe, or add any commentary. Your response MUST be a valid Python list of integers."
),
(
"human",
"Filtering instruction: {query}\n\nCaptions:\n{captions}"
)
])
chain = prompt | llm
captions_formatted = "\n".join(f"[{i+1}] {cap.strip()}" for i, cap in enumerate(captions))
    try:
        response = chain.invoke({"query": query, "captions": captions_formatted})
        to_remove = ast.literal_eval(response.content.strip())
        if not isinstance(to_remove, list) or not all(isinstance(i, int) for i in to_remove):
            raise ValueError("Expected a list of integers")
    except Exception as e:
        print(f"❌ LLM returned invalid output: {e}")
        to_remove = []
    return to_remove
def groq_llm(prompt):
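    """Call the Groq LLM with a plain prompt string and return its stripped text response."""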
llm = ChatGroq(
model="llama-3.1-8b-instant",
temperature=0.2,
max_tokens=500
)
return llm.invoke(prompt).content.strip()
def extract_keyframes(video_path, scenes):
    """Grab the middle frame of each scene as its keyframe."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    for start, end in scenes:
        mid = (start.get_frames() + end.get_frames()) // 2
        cap.set(cv2.CAP_PROP_POS_FRAMES, mid)
        ok, img = cap.read()
        if ok:
            frames.append((mid, img))
    cap.release()
    return frames
# ─── 4. DESCRIPTIONS & SUMMARY ───────────────────────────────────────────────────
def generate_scene_caption(frame):
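    """Convert a BGR frame to RGB and caption it with the BLIP captioning model."""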
img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
return captioner(img)[0]["generated_text"]
def generate_video_summary_groq(captions):
"""Generate a video summary using Groq LLM."""
llm = ChatGroq(
model="llama-3.1-8b-instant",
temperature=0.2,
max_tokens=500
)
prompt = ChatPromptTemplate.from_messages([
(
"system",
"You are a helpful assistant for video analysis. The user will give you a list of scene captions from a video. "
"Your task is to write a concise, narrative summary of what happens in the video, focusing only on the events shown. "
"Make it engaging and easy to understand. Do not include any titles, links, or external references."
),
(
"human",
"Here are the scene captions from the video in order:\n{captions}\n\nPlease provide a narrative summary."
)
])
chain = prompt | llm
captions_formatted = "\n".join(f"[{i+1}] {cap.strip()}" for i, cap in enumerate(captions))
try:
response = chain.invoke({"captions": captions_formatted})
summary = response.content.strip()
# Format the final output
return f"""🎬 Video Summary:
{summary}
📊 Total Scenes: {len(captions)}
🔍 Key Moments:
{chr(10).join(f"• {cap}" for cap in captions[:5])}
..."""
except Exception as e:
print(f"❌ Error generating summary with Groq: {e}")
return "❌ Error: Failed to generate video summary"
def generate_video_summary(captions):
"""
Generate a video summary using Groq LLM.
"""
return generate_video_summary_groq(captions)
def filter_scenes_with_llm(captions, query, llm):
"""
Uses an LLM to determine which scenes to remove based on captions and a user query.
Args:
captions (List[str]): List of scene/frame captions.
query (str): User intent, e.g. "Remove scenes with Trump".
llm (callable): Function to call your LLM, e.g. `llm(prompt)`.
Returns:
List[int]: List of 0-based frame indexes to remove.
"""
formatted = "\n".join([f"{i+1}. {cap}" for i, cap in enumerate(captions)])
prompt = f"""
You're an intelligent video assistant.
The user wants to: **{query}**
Below are numbered captions for each scene in a video:
{formatted}
👉 Return a Python list of only the scene numbers that should be removed based on the user query.
👉 ONLY return the list like this: [3, 5, 11]. No explanation.
"""
# Run LLM
response = llm(prompt)
    try:
        result = ast.literal_eval(response.strip())
        if not isinstance(result, list) or not all(isinstance(i, int) for i in result):
            raise ValueError("Expected a list of integers")
        return [i - 1 for i in result]  # convert 1-based scene numbers to 0-based indexes
    except (ValueError, SyntaxError):
        print("⚠️ Failed to parse LLM output:", response)
        return []
# ─── 5. FILTERING ───────────────────────────────────────────────────────────────
def group_indices(indices):
"""Group consecutive indices together as chunks."""
if not indices:
return []
indices = sorted(indices)
groups = [[indices[0]]]
for i in indices[1:]:
if i == groups[-1][-1] + 1:
groups[-1].append(i)
else:
groups.append([i])
return groups
def vqa_matches(keyframes, question):
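    """Flag each keyframe True if the VQA model answers 'yes' to the question."""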
flags = []
for _,frame in keyframes:
img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
ans = vl_pipeline({"image": img, "question": question})
flags.append("yes" in ans[0]["answer"].lower())
return flags
def semantic_matches(captions, prompt, thresh=0.8):
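    """Return indexes of captions whose similarity to the prompt meets the threshold, plus all scores."""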
embs = embedder.encode(captions, convert_to_tensor=True)
q = embedder.encode(prompt, convert_to_tensor=True)
sims = util.cos_sim(q, embs)[0]
return [i for i,s in enumerate(sims) if s>=thresh], sims.tolist()
# ─── 6. TRIMMING ────────────────────────────────────────────────────────────────
def remove_scenes(video_path, scenes, to_remove, out="trimmed.mp4"):
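    """Re-encode and concatenate the kept segments, writing the trimmed video to `out`."""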
times = [(float(s.get_seconds()), float(e.get_seconds())) for s,e in scenes]
# Group deletions
remove_groups = group_indices(to_remove)
# Threshold: max N consecutive scenes to allow trimming
MAX_REMOVE_GROUP_SIZE = 4
    # Adjust `to_remove`: only allow small groups or isolated removals
    filtered_remove = []
    for group in remove_groups:
        if len(group) <= MAX_REMOVE_GROUP_SIZE:
            filtered_remove.extend(group)
    # Keep the final three scenes so the video retains its ending
    if len(scenes) > 3:
        last_scene_idx = len(scenes) - 1
        for i in range(last_scene_idx - 2, last_scene_idx + 1):
            if i in filtered_remove:
                filtered_remove.remove(i)
    print(f"🧩 Filtered scenes to remove (after capping long chunks): {filtered_remove}")
# Final list of segments to keep
keep = [t for i,t in enumerate(times) if i not in filtered_remove]
# Create a temporary directory for segments
os.makedirs("temp_segments", exist_ok=True)
try:
parts = []
for i,(ss,tt) in enumerate(keep):
fn = os.path.join("temp_segments", f"segment_{i}.mp4")
# Use proper encoding settings to maintain frame integrity
run_ffmpeg([
"-i", video_path,
"-ss", str(ss),
"-to", str(tt),
"-c:v", "libx264", # Use H.264 codec
"-preset", "medium", # Balance between speed and quality
"-crf", "23", # Constant Rate Factor for quality
"-c:a", "aac", # Audio codec
"-b:a", "128k", # Audio bitrate
"-movflags", "+faststart", # Enable fast start for web playback
fn
])
parts.append(fn)
# Create concat file
with open("parts.txt", "w") as f:
for p in parts:
f.write(f"file '{p}'\n")
# Concatenate segments with proper encoding
run_ffmpeg([
"-f", "concat",
"-safe", "0",
"-i", "parts.txt",
"-c:v", "libx264",
"-preset", "medium",
"-crf", "23",
"-c:a", "aac",
"-b:a", "128k",
"-movflags", "+faststart",
out
])
finally:
# Cleanup
for p in parts:
if os.path.exists(p):
os.remove(p)
if os.path.exists("parts.txt"):
os.remove("parts.txt")
if os.path.exists("temp_segments"):
os.rmdir("temp_segments")
# ─── 7. MAIN PIPELINE ──────────────────────────────────────────────────────────
def run(video, query):
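    """End-to-end pipeline: detect scenes, caption keyframes, summarize, and trim scenes matching the query."""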
print(f"\n🎥 Video: {video}\n🔎 Query: '{query}'\n")
scenes = detect_scenes(video)
print(f"🔢 {len(scenes)} scenes detected.")
keyframes = extract_keyframes(video, scenes)
print(f"🖼️ {len(keyframes)} keyframes extracted.\n")
captions = [generate_scene_caption(f) for _, f in tqdm(keyframes, desc="Generating captions")]
summary = generate_video_summary(captions)
print("\n--- Video Summary ---")
print(summary)
# 🧠 Let the LLM decide which scenes to remove based on captions
to_remove = filter_scenes_with_llm(captions, query, groq_llm)
print(f"\n🔴 Scenes to remove: {to_remove}")
if to_remove:
remove_scenes(video, scenes, to_remove)
print("✅ Trimmed video saved as `trimmed.mp4`.")
else:
print("⚠️ No matching scenes found; no trimming done.")
return to_remove # Optional: return for external use
# ─── 8. ENTRY POINT ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    if len(sys.argv) < 3:
print("Usage: python main.py <video.mp4> \"your query here\"")
sys.exit(1)
run(sys.argv[1], sys.argv[2])