Spaces:

fau
/

videoxity

Running

File size: 12,684 Bytes

import os
import sys
import cv2
import subprocess
from tqdm import tqdm  # add this at the top
from PIL import Image
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from transformers import pipeline
from scenedetect import SceneManager, open_video, ContentDetector
from sentence_transformers import SentenceTransformer, util

# ─── 1. AUTH & MODELS ────────────────────────────────────────────────────────────

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if not HF_TOKEN:
    print("❌ Error: HF_TOKEN not found in .env file")
    sys.exit(1)

# Initialize models with proper configurations
captioner = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device="cpu"
)

vl_pipeline = pipeline(
    "visual-question-answering",
    model="Salesforce/blip-vqa-base",
    device="cpu"
)

elaborator = pipeline(
    "text-generation",
    model="gpt2-medium",
    device="cpu",
    max_new_tokens=500,  # Use max_new_tokens instead of max_length
    do_sample=True,
    top_p=0.9,
    temperature=0.7
)

embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")


# ─── 2. HELPERS ──────────────────────────────────────────────────────────────────

def run_ffmpeg(cmd):
    full = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"] + cmd
    p = subprocess.Popen(full, stderr=subprocess.PIPE)
    _, err = p.communicate()
    if p.returncode != 0:
        print("❌ FFmpeg error:\n", err.decode())
        sys.exit(1)


# ─── 3. SCENE DETECTION & KEYFRAMES ──────────────────────────────────────────────

def detect_scenes(video_path, thresh=15.0):
    v = open_video(video_path)
    mgr = SceneManager()
    mgr.add_detector(ContentDetector(threshold=thresh))
    mgr.detect_scenes(v)
    return mgr.get_scene_list()



def get_removal_indices_groq(captions, query):
    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        temperature=0.2,
        max_tokens=500
    )

    prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "You are a helpful assistant for video analysis. The user will give you a list of scene captions, "
            "each labeled with an index like [1], [2], ..., and a filtering instruction like 'remove food scenes'.\n\n"
            "Return ONLY the list of indexes that should be removed — e.g., [2, 5, 9]\n"
            "⚠️ Do not explain, describe, or add any commentary. Your response MUST be a valid Python list of integers."
        ),
        (
            "human",
            "Filtering instruction: {query}\n\nCaptions:\n{captions}"
        )
    ])

    chain = prompt | llm
    captions_formatted = "\n".join(f"[{i+1}] {cap.strip()}" for i, cap in enumerate(captions))

    try:
        response = chain.invoke({"query": query, "captions": captions_formatted})
        to_remove = eval(response.content.strip())

        if not isinstance(to_remove, list) or not all(isinstance(i, int) for i in to_remove):
            raise ValueError("Invalid format")

    except Exception as e:
        print(f"❌ LLM returned invalid output: {response.content}")
        to_remove = []

    return to_remove


def groq_llm(prompt):
    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        temperature=0.2,
        max_tokens=500
    )
    return llm.invoke(prompt).content.strip()

    

def extract_keyframes(video_path, scenes):
    cap, frames = cv2.VideoCapture(video_path), []
    for s,e in scenes:
        mid = (s.get_frames() + e.get_frames()) // 2
        cap.set(cv2.CAP_PROP_POS_FRAMES, mid)
        ok, img = cap.read()
        if ok: frames.append((mid, img))
    cap.release()
    return frames


# ─── 4. DESCRIPTIONS & SUMMARY ───────────────────────────────────────────────────

def generate_scene_caption(frame):
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    return captioner(img)[0]["generated_text"]

def generate_video_summary_groq(captions):
    """Generate a video summary using Groq LLM."""
    llm = ChatGroq(
        model="llama-3.1-8b-instant",
        temperature=0.2,
        max_tokens=500
    )

    prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "You are a helpful assistant for video analysis. The user will give you a list of scene captions from a video. "
            "Your task is to write a concise, narrative summary of what happens in the video, focusing only on the events shown. "
            "Make it engaging and easy to understand. Do not include any titles, links, or external references."
        ),
        (
            "human",
            "Here are the scene captions from the video in order:\n{captions}\n\nPlease provide a narrative summary."
        )
    ])

    chain = prompt | llm
    captions_formatted = "\n".join(f"[{i+1}] {cap.strip()}" for i, cap in enumerate(captions))

    try:
        response = chain.invoke({"captions": captions_formatted})
        summary = response.content.strip()
        
        # Format the final output
        return f"""🎬 Video Summary:
{summary}

📊 Total Scenes: {len(captions)}

🔍 Key Moments:
{chr(10).join(f"• {cap}" for cap in captions[:5])}
..."""
    except Exception as e:
        print(f"❌ Error generating summary with Groq: {e}")
        return "❌ Error: Failed to generate video summary"

def generate_video_summary(captions):
    """
    Generate a video summary using Groq LLM.
    """
    return generate_video_summary_groq(captions)




import ast

def filter_scenes_with_llm(captions, query, llm):
    """
    Uses an LLM to determine which scenes to remove based on captions and a user query.

    Args:
        captions (List[str]): List of scene/frame captions.
        query (str): User intent, e.g. "Remove scenes with Trump".
        llm (callable): Function to call your LLM, e.g. `llm(prompt)`.

    Returns:
        List[int]: List of 0-based frame indexes to remove.
    """
    formatted = "\n".join([f"{i+1}. {cap}" for i, cap in enumerate(captions)])
    prompt = f"""
You're an intelligent video assistant.

The user wants to: **{query}**

Below are numbered captions for each scene in a video:
{formatted}

👉 Return a Python list of only the scene numbers that should be removed based on the user query.
👉 ONLY return the list like this: [3, 5, 11]. No explanation.
"""

    # Run LLM
    response = llm(prompt)

    try:
        result = ast.literal_eval(response.strip())
        result = [i-1 for i in result]  # convert to 0-based index
        return result
    except:
        print("⚠️ Failed to parse LLM output:", response)
        return []

# ─── 5. FILTERING ───────────────────────────────────────────────────────────────
def group_indices(indices):
    """Group consecutive indices together as chunks."""
    if not indices:
        return []
    indices = sorted(indices)
    groups = [[indices[0]]]
    for i in indices[1:]:
        if i == groups[-1][-1] + 1:
            groups[-1].append(i)
        else:
            groups.append([i])
    return groups



def vqa_matches(keyframes, question):
    flags = []
    for _,frame in keyframes:
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        ans = vl_pipeline({"image": img, "question": question})
        flags.append("yes" in ans[0]["answer"].lower())
    return flags

def semantic_matches(captions, prompt, thresh=0.8):
    embs = embedder.encode(captions, convert_to_tensor=True)
    q   = embedder.encode(prompt, convert_to_tensor=True)
    sims = util.cos_sim(q, embs)[0]
    return [i for i,s in enumerate(sims) if s>=thresh], sims.tolist()


# ─── 6. TRIMMING ────────────────────────────────────────────────────────────────

def remove_scenes(video_path, scenes, to_remove, out="trimmed.mp4"):
    times = [(float(s.get_seconds()), float(e.get_seconds())) for s,e in scenes]

    # Group deletions
    remove_groups = group_indices(to_remove)

    # Threshold: max N consecutive scenes to allow trimming
    MAX_REMOVE_GROUP_SIZE = 4

    # Adjust `to_remove`: only allow small groups or isolated removals
    filtered_remove = []
    if len(scenes) > 3:
        last_scene_idx = len(scenes) - 1
        for i in range(last_scene_idx - 2, last_scene_idx + 1):
            if i in filtered_remove:
                filtered_remove.remove(i)

    for group in remove_groups:
        if len(group) <= MAX_REMOVE_GROUP_SIZE:
            filtered_remove.extend(group)

    print(f"🧩 Filtered scenes to remove (after capping long chunks): {filtered_remove}")

    # Final list of segments to keep
    keep = [t for i,t in enumerate(times) if i not in filtered_remove]


    # Create a temporary directory for segments
    os.makedirs("temp_segments", exist_ok=True)
    
    try:
        parts = []
        for i,(ss,tt) in enumerate(keep):
            fn = os.path.join("temp_segments", f"segment_{i}.mp4")
            # Use proper encoding settings to maintain frame integrity
            run_ffmpeg([
                "-i", video_path,
                "-ss", str(ss),
                "-to", str(tt),
                "-c:v", "libx264",  # Use H.264 codec
                "-preset", "medium",  # Balance between speed and quality
                "-crf", "23",  # Constant Rate Factor for quality
                "-c:a", "aac",  # Audio codec
                "-b:a", "128k",  # Audio bitrate
                "-movflags", "+faststart",  # Enable fast start for web playback
                fn
            ])
            parts.append(fn)

        # Create concat file
        with open("parts.txt", "w") as f:
            for p in parts:
                f.write(f"file '{p}'\n")

        # Concatenate segments with proper encoding
        run_ffmpeg([
            "-f", "concat",
            "-safe", "0",
            "-i", "parts.txt",
            "-c:v", "libx264",
            "-preset", "medium",
            "-crf", "23",
            "-c:a", "aac",
            "-b:a", "128k",
            "-movflags", "+faststart",
            out
        ])

    finally:
        # Cleanup
        for p in parts:
            if os.path.exists(p):
                os.remove(p)
        if os.path.exists("parts.txt"):
            os.remove("parts.txt")
        if os.path.exists("temp_segments"):
            os.rmdir("temp_segments")


# ─── 7. MAIN PIPELINE ──────────────────────────────────────────────────────────

def run(video, query):
    print(f"\n🎥 Video: {video}\n🔎 Query: '{query}'\n")

    scenes    = detect_scenes(video)
    print(f"🔢 {len(scenes)} scenes detected.")

    keyframes = extract_keyframes(video, scenes)
    print(f"🖼️ {len(keyframes)} keyframes extracted.\n")

    captions = [generate_scene_caption(f) for _, f in tqdm(keyframes, desc="Generating captions")]
    summary  = generate_video_summary(captions)
    print("\n--- Video Summary ---")
    print(summary)

    # 🧠 Let the LLM decide which scenes to remove based on captions
    to_remove = filter_scenes_with_llm(captions, query, groq_llm)
    print(f"\n🔴 Scenes to remove: {to_remove}")

    if to_remove:
        remove_scenes(video, scenes, to_remove)
        print("✅ Trimmed video saved as `trimmed.mp4`.")
    else:
        print("⚠️ No matching scenes found; no trimming done.")

    return to_remove  # Optional: return for external use

# ─── 8. ENTRY POINT ─────────────────────────────────────────────────────────────

if __name__ == "__main__":
    if len(sys.argv)<3:
        print("Usage: python main.py <video.mp4> \"your query here\"")
        sys.exit(1)
    run(sys.argv[1], sys.argv[2])