HaithamIsmail committed · Commit e17f518
1 Parent(s): 7835c3b
add video link, fixed frames sampling
README.md CHANGED
@@ -63,6 +63,8 @@ Here is the revised README that incorporates these setup instructions into a com
 
 ---
 
+### Demo Video: https://youtu.be/S-Sbn1NTWq0
+
 # 🎬 HyperClipper: Your AI Video Librarian 🤖
 
 Tired of scrubbing through hours of video to find that *one* perfect moment? HyperClipper is your personal AI video librarian that watches, understands, and catalogs your entire video library, making every second instantly searchable.
app.py CHANGED
@@ -11,7 +11,7 @@ import shutil
 from utils import get_text_embedding, sample_from_video, convert_image_to_base64
 from config import load_config
 from lancedb_utils import retreive_clip
-import
+from gradio import ChatMessage
 
 app_config = load_config()
 langchain_message_history = []
@@ -174,9 +174,9 @@ def get_clip(clip_id: str):
        list: list of frames
    """
    print("clip id", clip_id)
-   clip = retreive_clip(clip_id)
+   clip = retreive_clip(clip_id, app_config.LANCEDB_URI.get_secret_value())
    images = sample_from_video(clip["clip_path"])
-   base64_images = [convert_image_to_base64(image) for image in images]
+   base64_images = [convert_image_to_base64(image, "png") for image in images]
    return base64_images
 
 def search_and_display_clips(query_text):
@@ -211,7 +211,7 @@ def chat_agent(message, history: list):
    # Add current message
    langchain_message_history.append({"role": "user", "content": message})
 
-   llm_with_tool = chat_model.bind_tools(tools=[get_relevant_clips
+   llm_with_tool = chat_model.bind_tools(tools=[get_relevant_clips])
    tools = {"get_relevant_clips": get_relevant_clips}
 
    # The agent loop
@@ -244,6 +244,7 @@ def chat_agent_mm(message, history):
    global latest_search_results, langchain_message_history
 
    langchain_message_history.append({"role": "user", "content": message})
+   history.append({"role": "user", "content": message})
 
    print(langchain_message_history)
    llm_with_tool = chat_model_vlm.bind_tools(tools=[get_relevant_clips, get_clip])
@@ -258,14 +259,40 @@ def chat_agent_mm(message, history):
 
    for tool_call in ai_response.tool_calls:
        print(tool_call)
+       langchain_message_history.append(
+           {
+               "role": "assistant",
+               "content": "",
+               "tool_calls": [
+                   tool_call
+               ]
+           }
+       )
+       history.append(
+           {
+               "role": "assistant",
+               "content": "",
+               "tool_calls": [
+                   tool_call
+               ]
+           }
+       )
        tool_output = tools[tool_call["name"]].invoke(tool_call)
        if tool_call["name"] == "get_clip":
-
-           "role": "tool",
-
-
-
-
+           tool_call_log = {
+               "role": "tool",
+               "tool_call_id": tool_output.tool_call_id,
+               "content": "retrieved clip will be provided by the user after this message"
+           }
+           history.append(tool_call_log)
+           langchain_message_history.extend([
+               tool_call_log,
+               {
+                   "role": "user", "content": [
+                       {"type": "text", "text": "here is the clip retreived by the tool"},
+                       *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/png;base64,{x}'}}, tool_output.content)
+                   ],
+               }])
        else:
            tool_call_log = {
                "role": "tool",
@@ -273,6 +300,7 @@ def chat_agent_mm(message, history):
                "content": tool_output.content
            }
            langchain_message_history.append(tool_call_log)
+           history.append(tool_call_log)
 
    content = ai_response.content
    if "</think>" in content:
@@ -281,7 +309,8 @@ def chat_agent_mm(message, history):
    # The global state `latest_search_results` is updated by the tool.
    # The text response is returned.
    langchain_message_history.append({"role": "assistant", "content": content})
-
+   history.append({"role": "assistant", "content": content})
+   return history
 
 def get_latest_clips_for_display():
    """Get the latest search results for display in the UI."""
@@ -629,8 +658,8 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
            original_filename = "uploaded_video.mp4"
            temp_dir = tempfile.mkdtemp()
            tmp_path = os.path.join(temp_dir, original_filename)
-
-
+           with open(tmp_path, "wb") as f:
+               f.write(file_obj)
 
            # Run the video processing pipeline
            run_pipeline(tmp_path)
@@ -644,7 +673,6 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
 
            return f"✅ Video analysis complete for '{original_filename}'. You can now search for clips from this video."
        except Exception as e:
-           traceback.print_exc()
            return f"❌ Error during video analysis: {str(e)}"
 
    analyze_btn.click(
@@ -656,10 +684,9 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
 # Launch the application
 if __name__ == "__main__":
    print("🚀 Starting Video Search Agent...")
-   print("🚀 Using CLIP model for embeddings:", app_config.CLIP_MODEL_NAME)
 
    demo.launch(
-       server_name="
+       server_name="localhost",
        server_port=7860,
-
+       share=False,
    )
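A note on the new `chat_agent_mm` flow above: the `get_clip` tool returns base64-encoded PNG frames, and the diff feeds them back to the VLM not inside the `tool` message itself but in a follow-up `user` message built of `image_url` content parts, since OpenAI-style chat payloads generally carry images only in user content. A minimal, self-contained sketch of that message shape (the `encode_frame` helper and the prompt text are illustrative stand-ins, not part of the repo):

```python
import base64
from io import BytesIO
from PIL import Image

def encode_frame(img: Image.Image) -> str:
    # Illustrative stand-in for the repo's convert_image_to_base64(image, "png"):
    # serialize a PIL image to a base64-encoded PNG string.
    buf = BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")

def frames_as_user_message(frames: list[Image.Image]) -> dict:
    # Mirror of the pattern added in chat_agent_mm: after logging the tool call,
    # the retrieved frames are re-injected as a user message with image_url parts.
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": "here is the clip retrieved by the tool"},
            *[
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{encode_frame(f)}"},
                }
                for f in frames
            ],
        ],
    }
```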
utils.py CHANGED
@@ -5,6 +5,7 @@ import io
 import base64
 from PIL import Image
 from typing import List, Union
+import uuid
 import cv2
 
 def create_directory(directory):
@@ -114,28 +115,75 @@ def sample_from_video(video_path: str, sampling_rate=0.5) -> list[Image.Image]:
    Args:
        video_path (str): path to video
        sampling_rate (float): frames per second, how many frames to take from each second
+                              e.g., 0.5 means take 1 frame every 2 seconds.
+                              e.g., 2 means take 2 frames every 1 second.
 
    Returns:
        list[Image.Image]: a list of PIL images
    """
+   print(f"Attempting to open video: {video_path}")
    video = cv2.VideoCapture(video_path)
+
+   if not video.isOpened():
+       print(f"Error: Could not open video {video_path}")
+       return []
+
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = video.get(cv2.CAP_PROP_FPS)
-
-
+
+   if fps == 0:  # Handle cases where FPS might not be readable or is zero
+       print(f"Error: Video FPS is {fps}. Cannot calculate sampling.")
+       video.release()
+       return []
+
+   if sampling_rate <= 0:
+       print(f"Error: sampling_rate ({sampling_rate}) must be positive.")
+       video.release()
+       return []
+
+   # Calculate the frame interval.
+   # If sampling_rate is 0.5 FPS (1 frame every 2s) and video is 30 FPS,
+   # interval = 30 / 0.5 = 60. So, take frame 0, 60, 120...
+   # If sampling_rate is 2 FPS (2 frames every 1s) and video is 30 FPS,
+   # interval = 30 / 2 = 15. So, take frame 0, 15, 30...
+   frame_interval = round(fps / sampling_rate)
+   # Ensure we always advance at least one frame to avoid infinite loops if fps/sampling_rate is too small
+   frame_interval = max(1, int(frame_interval))
+
+
+   print(f"Video Info - Total Frames: {total_frames}, FPS: {fps:.2f}, Desired Sample Rate: {sampling_rate} fps")
+   print(f"Calculated frame interval: Take 1 frame every {frame_interval} original frames.")
+
+   current_frame_pos = 0
    images = []
 
-   while
-       video.set(cv2.CAP_PROP_POS_FRAMES,
-       success,
+   while current_frame_pos < total_frames:
+       video.set(cv2.CAP_PROP_POS_FRAMES, current_frame_pos)
+       success, frame_bgr = video.read()  # frame_bgr is a NumPy array in BGR format
+
        if not success:
+           # This might happen if we try to seek beyond the last valid frame
+           # or if there's a read error.
+           print(f"Warning: Failed to read frame at position {current_frame_pos}. Ending capture.")
            break
-       _, buffer = cv2.imencode(".jpg", frame)
-       images.append(Image.fromarray(cv2.cvtColor(buffer, cv2.COLOR_BGR2RGB)))
-       curr_frame += frames_to_skip
 
-
+       # Convert the BGR frame to RGB for PIL
+       frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+
+       # Create a PIL Image from the RGB NumPy array
+       image = Image.fromarray(frame_rgb)
+
+       # If you want to display/save for debugging:
+       # image.show(title=f"Frame {current_frame_pos}") # Displays the image
+       # image.save(f"debug_frame_{current_frame_pos}.png") # Saves the image
 
+       images.append(image)
+       # print(f"Captured frame {current_frame_pos}")
+
+       current_frame_pos += frame_interval
+
+   video.release()
+   print(f"Successfully sampled {len(images)} images.")
    return images
 
 def convert_base64_to_image(base64_image: str) -> Image.Image:
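To sanity-check the reworked sampling math in `sample_from_video`: for a 30 fps video with the default `sampling_rate=0.5`, the interval is `round(30 / 0.5) = 60`, so frames 0, 60, 120, ... are kept (one frame every two seconds); with `sampling_rate=2` the interval is `round(30 / 2) = 15`. A small usage sketch, assuming the repo's `utils` module is importable and using a hypothetical clip path:

```python
from utils import sample_from_video, convert_image_to_base64

# Hypothetical path; any local video file readable by OpenCV will do.
frames = sample_from_video("clips/example_clip.mp4", sampling_rate=0.5)
print(f"Sampled {len(frames)} frames")

# Frames come back as PIL images, ready for the PNG/base64 conversion
# that get_clip now applies before handing them to the VLM.
if frames:
    b64_frame = convert_image_to_base64(frames[0], "png")
    print(b64_frame[:60], "...")
```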