HaithamIsmail committed · Commit e17f518
1 Parent(s): 7835c3b
add video link, fixed frames sampling
README.md CHANGED
@@ -63,6 +63,8 @@ Here is the revised README that incorporates these setup instructions into a com
 
 ---
 
+### Demo Video: https://youtu.be/S-Sbn1NTWq0
+
 # 🎬 HyperClipper: Your AI Video Librarian 🤖
 
 Tired of scrubbing through hours of video to find that *one* perfect moment? HyperClipper is your personal AI video librarian that watches, understands, and catalogs your entire video library, making every second instantly searchable.
app.py CHANGED
@@ -11,7 +11,7 @@ import shutil
 from utils import get_text_embedding, sample_from_video, convert_image_to_base64
 from config import load_config
 from lancedb_utils import retreive_clip
-import
+from gradio import ChatMessage
 
 app_config = load_config()
 langchain_message_history = []
@@ -174,9 +174,9 @@ def get_clip(clip_id: str):
        list: list of frames
    """
    print("clip id", clip_id)
-   clip = retreive_clip(clip_id)
+   clip = retreive_clip(clip_id, app_config.LANCEDB_URI.get_secret_value())
    images = sample_from_video(clip["clip_path"])
-   base64_images = [convert_image_to_base64(image) for image in images]
+   base64_images = [convert_image_to_base64(image, "png") for image in images]
    return base64_images
 
 def search_and_display_clips(query_text):
@@ -211,7 +211,7 @@ def chat_agent(message, history: list):
    # Add current message
    langchain_message_history.append({"role": "user", "content": message})
 
-   llm_with_tool = chat_model.bind_tools(tools=[get_relevant_clips
+   llm_with_tool = chat_model.bind_tools(tools=[get_relevant_clips])
    tools = {"get_relevant_clips": get_relevant_clips}
 
    # The agent loop
@@ -244,6 +244,7 @@ def chat_agent_mm(message, history):
    global latest_search_results, langchain_message_history
 
    langchain_message_history.append({"role": "user", "content": message})
+   history.append({"role": "user", "content": message})
 
    print(langchain_message_history)
    llm_with_tool = chat_model_vlm.bind_tools(tools=[get_relevant_clips, get_clip])
@@ -258,14 +259,40 @@ def chat_agent_mm(message, history):
 
    for tool_call in ai_response.tool_calls:
        print(tool_call)
+       langchain_message_history.append(
+           {
+               "role": "assistant",
+               "content": "",
+               "tool_calls": [
+                   tool_call
+               ]
+           }
+       )
+       history.append(
+           {
+               "role": "assistant",
+               "content": "",
+               "tool_calls": [
+                   tool_call
+               ]
+           }
+       )
        tool_output = tools[tool_call["name"]].invoke(tool_call)
        if tool_call["name"] == "get_clip":
-
-           "role": "tool",
-
-
-
-
+           tool_call_log = {
+               "role": "tool",
+               "tool_call_id": tool_output.tool_call_id,
+               "content": "retrieved clip will be provided by the user after this message"
+           }
+           history.append(tool_call_log)
+           langchain_message_history.extend([
+               tool_call_log,
+               {
+                   "role": "user", "content": [
+                       {"type": "text", "text": "here is the clip retreived by the tool"},
+                       *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/png;base64,{x}'}}, tool_output.content)
+                   ],
+               }])
        else:
            tool_call_log = {
                "role": "tool",
@@ -273,6 +300,7 @@ def chat_agent_mm(message, history):
                "content": tool_output.content
            }
            langchain_message_history.append(tool_call_log)
+           history.append(tool_call_log)
 
    content = ai_response.content
    if "</think>" in content:
@@ -281,7 +309,8 @@ def chat_agent_mm(message, history):
    # The global state `latest_search_results` is updated by the tool.
    # The text response is returned.
    langchain_message_history.append({"role": "assistant", "content": content})
-
+   history.append({"role": "assistant", "content": content})
+   return history
 
 def get_latest_clips_for_display():
    """Get the latest search results for display in the UI."""
@@ -629,8 +658,8 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
            original_filename = "uploaded_video.mp4"
            temp_dir = tempfile.mkdtemp()
            tmp_path = os.path.join(temp_dir, original_filename)
-
-
+           with open(tmp_path, "wb") as f:
+               f.write(file_obj)
 
            # Run the video processing pipeline
            run_pipeline(tmp_path)
@@ -644,7 +673,6 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
 
            return f"✅ Video analysis complete for '{original_filename}'. You can now search for clips from this video."
        except Exception as e:
-           traceback.print_exc()
            return f"❌ Error during video analysis: {str(e)}"
 
    analyze_btn.click(
@@ -656,10 +684,9 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
 # Launch the application
 if __name__ == "__main__":
    print("🚀 Starting Video Search Agent...")
-   print("🚀 Using CLIP model for embeddings:", app_config.CLIP_MODEL_NAME)
 
    demo.launch(
-       server_name="
+       server_name="localhost",
        server_port=7860,
-
+       share=False,
    )
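A note on the new `chat_agent_mm` flow above: the `get_clip` tool returns base64-encoded PNG frames, and the diff feeds them back to the VLM not inside the `tool` message itself but in a follow-up `user` message built of `image_url` content parts, since OpenAI-style chat payloads generally carry images only in user content. A minimal, self-contained sketch of that message shape (the `encode_frame` helper and the prompt text are illustrative stand-ins, not part of the repo):

```python
import base64
from io import BytesIO
from PIL import Image

def encode_frame(img: Image.Image) -> str:
    # Illustrative stand-in for the repo's convert_image_to_base64(image, "png"):
    # serialize a PIL image to a base64-encoded PNG string.
    buf = BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")

def frames_as_user_message(frames: list[Image.Image]) -> dict:
    # Mirror of the pattern added in chat_agent_mm: after logging the tool call,
    # the retrieved frames are re-injected as a user message with image_url parts.
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": "here is the clip retrieved by the tool"},
            *[
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{encode_frame(f)}"},
                }
                for f in frames
            ],
        ],
    }
```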
utils.py CHANGED
@@ -5,6 +5,7 @@ import io
 import base64
 from PIL import Image
 from typing import List, Union
+import uuid
 import cv2
 
 def create_directory(directory):
@@ -114,28 +115,75 @@ def sample_from_video(video_path: str, sampling_rate=0.5) -> list[Image.Image]:
    Args:
        video_path (str): path to video
        sampling_rate (float): frames per second, how many frames to take from each second
+                              e.g., 0.5 means take 1 frame every 2 seconds.
+                              e.g., 2 means take 2 frames every 1 second.
 
    Returns:
        list[Image.Image]: a list of PIL images
    """
+   print(f"Attempting to open video: {video_path}")
    video = cv2.VideoCapture(video_path)
+
+   if not video.isOpened():
+       print(f"Error: Could not open video {video_path}")
+       return []
+
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = video.get(cv2.CAP_PROP_FPS)
-
-
+
+   if fps == 0:  # Handle cases where FPS might not be readable or is zero
+       print(f"Error: Video FPS is {fps}. Cannot calculate sampling.")
+       video.release()
+       return []
+
+   if sampling_rate <= 0:
+       print(f"Error: sampling_rate ({sampling_rate}) must be positive.")
+       video.release()
+       return []
+
+   # Calculate the frame interval.
+   # If sampling_rate is 0.5 FPS (1 frame every 2s) and video is 30 FPS,
+   # interval = 30 / 0.5 = 60. So, take frame 0, 60, 120...
+   # If sampling_rate is 2 FPS (2 frames every 1s) and video is 30 FPS,
+   # interval = 30 / 2 = 15. So, take frame 0, 15, 30...
+   frame_interval = round(fps / sampling_rate)
+   # Ensure we always advance at least one frame to avoid infinite loops if fps/sampling_rate is too small
+   frame_interval = max(1, int(frame_interval))
+
+
+   print(f"Video Info - Total Frames: {total_frames}, FPS: {fps:.2f}, Desired Sample Rate: {sampling_rate} fps")
+   print(f"Calculated frame interval: Take 1 frame every {frame_interval} original frames.")
+
+   current_frame_pos = 0
    images = []
 
-   while
-       video.set(cv2.CAP_PROP_POS_FRAMES,
-       success,
+   while current_frame_pos < total_frames:
+       video.set(cv2.CAP_PROP_POS_FRAMES, current_frame_pos)
+       success, frame_bgr = video.read()  # frame_bgr is a NumPy array in BGR format
+
        if not success:
+           # This might happen if we try to seek beyond the last valid frame
+           # or if there's a read error.
+           print(f"Warning: Failed to read frame at position {current_frame_pos}. Ending capture.")
            break
-       _, buffer = cv2.imencode(".jpg", frame)
-       images.append(Image.fromarray(cv2.cvtColor(buffer, cv2.COLOR_BGR2RGB)))
-       curr_frame += frames_to_skip
 
-
+       # Convert the BGR frame to RGB for PIL
+       frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+
+       # Create a PIL Image from the RGB NumPy array
+       image = Image.fromarray(frame_rgb)
+
+       # If you want to display/save for debugging:
+       # image.show(title=f"Frame {current_frame_pos}") # Displays the image
+       # image.save(f"debug_frame_{current_frame_pos}.png") # Saves the image
 
+       images.append(image)
+       # print(f"Captured frame {current_frame_pos}")
+
+       current_frame_pos += frame_interval
+
+   video.release()
+   print(f"Successfully sampled {len(images)} images.")
    return images
 
 def convert_base64_to_image(base64_image: str) -> Image.Image:
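To sanity-check the reworked sampling math in `sample_from_video`: for a 30 fps video with the default `sampling_rate=0.5`, the interval is `round(30 / 0.5) = 60`, so frames 0, 60, 120, ... are kept (one frame every two seconds); with `sampling_rate=2` the interval is `round(30 / 2) = 15`. A small usage sketch, assuming the repo's `utils` module is importable and using a hypothetical clip path:

```python
from utils import sample_from_video, convert_image_to_base64

# Hypothetical path; any local video file readable by OpenCV will do.
frames = sample_from_video("clips/example_clip.mp4", sampling_rate=0.5)
print(f"Sampled {len(frames)} frames")

# Frames come back as PIL images, ready for the PNG/base64 conversion
# that get_clip now applies before handing them to the VLM.
if frames:
    b64_frame = convert_image_to_base64(frames[0], "png")
    print(b64_frame[:60], "...")
```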