import os
import sys
import ast
import subprocess

import cv2
from tqdm import tqdm
from PIL import Image
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from transformers import pipeline
from scenedetect import SceneManager, open_video, ContentDetector
from sentence_transformers import SentenceTransformer, util
# ─── 1. AUTH & MODELS ────────────────────────────────────────────────────────────
# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not HF_TOKEN:
    print("❌ Error: HF_TOKEN not found in .env file")
    sys.exit(1)
# Initialize models with proper configurations
captioner = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device="cpu",
)
vl_pipeline = pipeline(
    "visual-question-answering",
    model="Salesforce/blip-vqa-base",
    device="cpu",
)
elaborator = pipeline(  # NOTE: loaded but not used by the pipeline below
    "text-generation",
    model="gpt2-medium",
    device="cpu",
    max_new_tokens=500,  # use max_new_tokens instead of max_length
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
)
embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")
# ─── 2. HELPERS ──────────────────────────────────────────────────────────────────
def run_ffmpeg(cmd):
    """Run ffmpeg quietly; abort the program on any ffmpeg error."""
    full = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"] + cmd
    p = subprocess.Popen(full, stderr=subprocess.PIPE)
    _, err = p.communicate()
    if p.returncode != 0:
        print("❌ FFmpeg error:\n", err.decode())
        sys.exit(1)
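# Illustrative usage sketch (filenames are placeholders, not part of this script):
#   run_ffmpeg(["-ss", "0", "-t", "5", "-i", "input.mp4", "-c", "copy", "clip.mp4"])
# would copy the first five seconds of input.mp4 without re-encoding.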
# ─── 3. SCENE DETECTION & KEYFRAMES ──────────────────────────────────────────────
def detect_scenes(video_path, thresh=15.0):
    v = open_video(video_path)
    mgr = SceneManager()
    mgr.add_detector(ContentDetector(threshold=thresh))
    mgr.detect_scenes(v)
    return mgr.get_scene_list()
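# Usage sketch ("input.mp4" is a placeholder): lower thresholds split more
# aggressively; PySceneDetect's ContentDetector defaults to 27.0, so the 15.0
# used here is fairly sensitive.
#   scenes = detect_scenes("input.mp4", thresh=27.0)  # fewer, longer scenes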
def get_removal_indices_groq(captions, query):
    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        temperature=0.2,
        max_tokens=500,
    )
    prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "You are a helpful assistant for video analysis. The user will give you a list of scene captions, "
            "each labeled with an index like [1], [2], ..., and a filtering instruction like 'remove food scenes'.\n\n"
            "Return ONLY the list of indexes that should be removed — e.g., [2, 5, 9]\n"
            "⚠️ Do not explain, describe, or add any commentary. Your response MUST be a valid Python list of integers."
        ),
        (
            "human",
            "Filtering instruction: {query}\n\nCaptions:\n{captions}"
        ),
    ])
    chain = prompt | llm
    captions_formatted = "\n".join(f"[{i+1}] {cap.strip()}" for i, cap in enumerate(captions))
    response = None
    try:
        response = chain.invoke({"query": query, "captions": captions_formatted})
        # ast.literal_eval is safer than eval() on untrusted LLM output
        to_remove = ast.literal_eval(response.content.strip())
        if not isinstance(to_remove, list) or not all(isinstance(i, int) for i in to_remove):
            raise ValueError("Invalid format")
    except Exception:
        content = response.content if response is not None else "<no response>"
        print(f"❌ LLM returned invalid output: {content}")
        to_remove = []
    return to_remove
def groq_llm(prompt):
    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        temperature=0.2,
        max_tokens=500,
    )
    return llm.invoke(prompt).content.strip()
def extract_keyframes(video_path, scenes):
    cap, frames = cv2.VideoCapture(video_path), []
    for s, e in scenes:
        mid = (s.get_frames() + e.get_frames()) // 2
        cap.set(cv2.CAP_PROP_POS_FRAMES, mid)
        ok, img = cap.read()
        if ok:
            frames.append((mid, img))
    cap.release()
    return frames
# ─── 4. DESCRIPTIONS & SUMMARY ───────────────────────────────────────────────────
def generate_scene_caption(frame):
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    return captioner(img)[0]["generated_text"]
def generate_video_summary_groq(captions):
"""Generate a video summary using Groq LLM."""
llm = ChatGroq(
model="llama-3.1-8b-instant",
temperature=0.2,
max_tokens=500
)
prompt = ChatPromptTemplate.from_messages([
(
"system",
"You are a helpful assistant for video analysis. The user will give you a list of scene captions from a video. "
"Your task is to write a concise, narrative summary of what happens in the video, focusing only on the events shown. "
"Make it engaging and easy to understand. Do not include any titles, links, or external references."
),
(
"human",
"Here are the scene captions from the video in order:\n{captions}\n\nPlease provide a narrative summary."
)
])
chain = prompt | llm
captions_formatted = "\n".join(f"[{i+1}] {cap.strip()}" for i, cap in enumerate(captions))
try:
response = chain.invoke({"captions": captions_formatted})
summary = response.content.strip()
# Format the final output
return f"""🎬 Video Summary:
{summary}
📊 Total Scenes: {len(captions)}
🔍 Key Moments:
{chr(10).join(f"• {cap}" for cap in captions[:5])}
..."""
except Exception as e:
print(f"❌ Error generating summary with Groq: {e}")
return "❌ Error: Failed to generate video summary"
def generate_video_summary(captions):
"""
Generate a video summary using Groq LLM.
"""
return generate_video_summary_groq(captions)
def filter_scenes_with_llm(captions, query, llm):
    """
    Uses an LLM to determine which scenes to remove based on captions and a user query.

    Args:
        captions (List[str]): List of scene/frame captions.
        query (str): User intent, e.g. "Remove scenes with Trump".
        llm (callable): Function to call your LLM, e.g. `llm(prompt)`.

    Returns:
        List[int]: List of 0-based frame indexes to remove.
    """
    formatted = "\n".join(f"{i+1}. {cap}" for i, cap in enumerate(captions))
    prompt = f"""
You're an intelligent video assistant.
The user wants to: **{query}**
Below are numbered captions for each scene in a video:
{formatted}
👉 Return a Python list of only the scene numbers that should be removed based on the user query.
👉 ONLY return the list like this: [3, 5, 11]. No explanation.
"""
    response = llm(prompt)
    try:
        # Parse the reply safely; anything that isn't a literal Python list is rejected
        result = ast.literal_eval(response.strip())
        return [i - 1 for i in result]  # convert 1-based scene numbers to 0-based indexes
    except Exception:
        print("⚠️ Failed to parse LLM output:", response)
        return []
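# Illustrative round trip (hypothetical captions): with captions
# ["a man cooking", "a dog running"] and query "remove food scenes", a
# well-behaved model replies "[1]", which ast.literal_eval parses to [1] and
# the 1-based -> 0-based shift turns into [0].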
# ─── 5. FILTERING ───────────────────────────────────────────────────────────────
def group_indices(indices):
"""Group consecutive indices together as chunks."""
if not indices:
return []
indices = sorted(indices)
groups = [[indices[0]]]
for i in indices[1:]:
if i == groups[-1][-1] + 1:
groups[-1].append(i)
else:
groups.append([i])
return groups
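# For example (indices are sorted first, so input order does not matter):
#   group_indices([3, 1, 2, 7, 8, 12]) -> [[1, 2, 3], [7, 8], [12]]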
def vqa_matches(keyframes, question):
    flags = []
    for _, frame in keyframes:
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        ans = vl_pipeline({"image": img, "question": question})
        flags.append("yes" in ans[0]["answer"].lower())
    return flags
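# Usage sketch (the question phrasing is the caller's choice):
#   flags = vqa_matches(keyframes, "Is there food in this image?")
# returns one boolean per keyframe, True where BLIP-VQA answers "yes".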
def semantic_matches(captions, prompt, thresh=0.8):
    embs = embedder.encode(captions, convert_to_tensor=True)
    q = embedder.encode(prompt, convert_to_tensor=True)
    sims = util.cos_sim(q, embs)[0]
    return [i for i, s in enumerate(sims) if s >= thresh], sims.tolist()
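# Usage sketch: returns matching indices plus all similarity scores, e.g.
#   hits, sims = semantic_matches(captions, "people eating food", thresh=0.6)
# The default 0.8 cosine threshold may be strict for bge-small embeddings;
# lowering it trades precision for recall.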
# ─── 6. TRIMMING ────────────────────────────────────────────────────────────────
def remove_scenes(video_path, scenes, to_remove, out="trimmed.mp4"):
    times = [(float(s.get_seconds()), float(e.get_seconds())) for s, e in scenes]
    # Group deletions into runs of consecutive indices
    remove_groups = group_indices(to_remove)
    # Threshold: max N consecutive scenes to allow trimming
    MAX_REMOVE_GROUP_SIZE = 4
    # Adjust `to_remove`: only allow small groups or isolated removals
    filtered_remove = []
    for group in remove_groups:
        if len(group) <= MAX_REMOVE_GROUP_SIZE:
            filtered_remove.extend(group)
    # Protect the last three scenes from removal so the ending stays intact
    if len(scenes) > 3:
        last_scene_idx = len(scenes) - 1
        for i in range(last_scene_idx - 2, last_scene_idx + 1):
            if i in filtered_remove:
                filtered_remove.remove(i)
    print(f"🧩 Filtered scenes to remove (after capping long chunks): {filtered_remove}")
    # Final list of segments to keep
    keep = [t for i, t in enumerate(times) if i not in filtered_remove]
    # Create a temporary directory for segments
    os.makedirs("temp_segments", exist_ok=True)
    parts = []
    try:
        for i, (ss, tt) in enumerate(keep):
            fn = os.path.join("temp_segments", f"segment_{i}.mp4")
            # Re-encode each segment so cuts land on exact frames
            run_ffmpeg([
                "-i", video_path,
                "-ss", str(ss),
                "-to", str(tt),
                "-c:v", "libx264",          # H.264 video codec
                "-preset", "medium",        # balance between speed and quality
                "-crf", "23",               # constant rate factor for quality
                "-c:a", "aac",              # audio codec
                "-b:a", "128k",             # audio bitrate
                "-movflags", "+faststart",  # enable fast start for web playback
                fn,
            ])
            parts.append(fn)
        # Create the concat list file
        with open("parts.txt", "w") as f:
            for p in parts:
                f.write(f"file '{p}'\n")
        # Concatenate segments with the same encoding settings
        run_ffmpeg([
            "-f", "concat",
            "-safe", "0",
            "-i", "parts.txt",
            "-c:v", "libx264",
            "-preset", "medium",
            "-crf", "23",
            "-c:a", "aac",
            "-b:a", "128k",
            "-movflags", "+faststart",
            out,
        ])
    finally:
        # Cleanup
        for p in parts:
            if os.path.exists(p):
                os.remove(p)
        if os.path.exists("parts.txt"):
            os.remove("parts.txt")
        if os.path.exists("temp_segments"):
            os.rmdir("temp_segments")
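# Note: re-encoding each segment keeps the cut points frame-accurate. A faster
# sketch (not used here) would stream-copy instead:
#   run_ffmpeg(["-i", video_path, "-ss", str(ss), "-to", str(tt), "-c", "copy", fn])
# at the cost of cuts snapping to the nearest keyframe.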
# ─── 7. MAIN PIPELINE ──────────────────────────────────────────────────────────
def run(video, query):
print(f"\n🎥 Video: {video}\n🔎 Query: '{query}'\n")
scenes = detect_scenes(video)
print(f"🔢 {len(scenes)} scenes detected.")
keyframes = extract_keyframes(video, scenes)
print(f"🖼️ {len(keyframes)} keyframes extracted.\n")
captions = [generate_scene_caption(f) for _, f in tqdm(keyframes, desc="Generating captions")]
summary = generate_video_summary(captions)
print("\n--- Video Summary ---")
print(summary)
# 🧠 Let the LLM decide which scenes to remove based on captions
to_remove = filter_scenes_with_llm(captions, query, groq_llm)
print(f"\n🔴 Scenes to remove: {to_remove}")
if to_remove:
remove_scenes(video, scenes, to_remove)
print("✅ Trimmed video saved as `trimmed.mp4`.")
else:
print("⚠️ No matching scenes found; no trimming done.")
return to_remove # Optional: return for external use
# ─── 8. ENTRY POINT ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print('Usage: python main.py <video.mp4> "your query here"')
        sys.exit(1)
    run(sys.argv[1], sys.argv[2])