HaithamIsmail committed
Commit e17f518 · 1 Parent(s): 7835c3b

add video link, fixed frames sampling

Files changed (3):
  1. README.md +2 -0
  2. app.py +44 -17
  3. utils.py +57 -9
README.md CHANGED
@@ -63,6 +63,8 @@ Here is the revised README that incorporates these setup instructions into a com
 
 ---
 
+### Demo Video: https://youtu.be/S-Sbn1NTWq0
+
 # 🎬 HyperClipper: Your AI Video Librarian 🤖
 
 Tired of scrubbing through hours of video to find that *one* perfect moment? HyperClipper is your personal AI video librarian that watches, understands, and catalogs your entire video library, making every second instantly searchable.
app.py CHANGED
@@ -11,7 +11,7 @@ import shutil
 from utils import get_text_embedding, sample_from_video, convert_image_to_base64
 from config import load_config
 from lancedb_utils import retreive_clip
-import traceback
+from gradio import ChatMessage
 
 app_config = load_config()
 langchain_message_history = []
@@ -174,9 +174,9 @@ def get_clip(clip_id: str):
         list: list of frames
     """
     print("clip id", clip_id)
-    clip = retreive_clip(clip_id)
+    clip = retreive_clip(clip_id, app_config.LANCEDB_URI.get_secret_value())
     images = sample_from_video(clip["clip_path"])
-    base64_images = [convert_image_to_base64(image) for image in images]
+    base64_images = [convert_image_to_base64(image, "png") for image in images]
     return base64_images
 
 def search_and_display_clips(query_text):
@@ -211,7 +211,7 @@ def chat_agent(message, history: list):
     # Add current message
     langchain_message_history.append({"role": "user", "content": message})
 
-    llm_with_tool = chat_model.bind_tools(tools=[get_relevant_clips, get_clip_base64])
+    llm_with_tool = chat_model.bind_tools(tools=[get_relevant_clips])
     tools = {"get_relevant_clips": get_relevant_clips}
 
     # The agent loop
@@ -244,6 +244,7 @@ def chat_agent_mm(message, history):
     global latest_search_results, langchain_message_history
 
     langchain_message_history.append({"role": "user", "content": message})
+    history.append({"role": "user", "content": message})
 
     print(langchain_message_history)
     llm_with_tool = chat_model_vlm.bind_tools(tools=[get_relevant_clips, get_clip])
@@ -258,14 +259,40 @@ def chat_agent_mm(message, history):
 
     for tool_call in ai_response.tool_calls:
         print(tool_call)
+        langchain_message_history.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    tool_call
+                ]
+            }
+        )
+        history.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    tool_call
+                ]
+            }
+        )
         tool_output = tools[tool_call["name"]].invoke(tool_call)
         if tool_call["name"] == "get_clip":
-            langchain_message_history.append({
-                "role": "tool", "content": [
-                    {"type": "text", "text": "here is the clip"}
-                    *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/jpeg;base64,{x}'}}, tool_output.content)
-                ]
-            })
+            tool_call_log = {
+                "role": "tool",
+                "tool_call_id": tool_output.tool_call_id,
+                "content": "retrieved clip will be provided by the user after this message"
+            }
+            history.append(tool_call_log)
+            langchain_message_history.extend([
+                tool_call_log,
+                {
+                    "role": "user", "content": [
+                        {"type": "text", "text": "here is the clip retreived by the tool"},
+                        *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/png;base64,{x}'}}, tool_output.content)
+                    ],
+                }])
         else:
             tool_call_log = {
                 "role": "tool",
@@ -273,6 +300,7 @@ def chat_agent_mm(message, history):
                 "content": tool_output.content
            }
            langchain_message_history.append(tool_call_log)
+           history.append(tool_call_log)
 
     content = ai_response.content
     if "</think>" in content:
@@ -281,7 +309,8 @@ def chat_agent_mm(message, history):
     # The global state `latest_search_results` is updated by the tool.
     # The text response is returned.
     langchain_message_history.append({"role": "assistant", "content": content})
-    return langchain_message_history
+    history.append({"role": "assistant", "content": content})
+    return history
 
 def get_latest_clips_for_display():
     """Get the latest search results for display in the UI."""
@@ -629,8 +658,8 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
             original_filename = "uploaded_video.mp4"
             temp_dir = tempfile.mkdtemp()
             tmp_path = os.path.join(temp_dir, original_filename)
-
-            shutil.copy(file_obj, tmp_path)
+            with open(tmp_path, "wb") as f:
+                f.write(file_obj)
 
             # Run the video processing pipeline
             run_pipeline(tmp_path)
@@ -644,7 +673,6 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
 
             return f"✅ Video analysis complete for '{original_filename}'. You can now search for clips from this video."
         except Exception as e:
-            traceback.print_exc()
             return f"❌ Error during video analysis: {str(e)}"
 
     analyze_btn.click(
@@ -656,10 +684,9 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
 # Launch the application
 if __name__ == "__main__":
     print("🚀 Starting Video Search Agent...")
-    print("📝 Using CLIP model for embeddings:", app_config.CLIP_MODEL_NAME)
 
     demo.launch(
-        server_name="0.0.0.0",
+        server_name="localhost",
         server_port=7860,
-        debug=True
+        share=False,
     )
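
Context for the `get_clip` branch above: many chat-completion and VLM endpoints accept image parts only in `user`-role messages, not in `tool`-role messages, so the commit logs a plain-text tool stub and then replays the sampled frames as a follow-up user message. Below is a minimal sketch of that pattern, detached from the app's globals; `frames_b64` stands for the base64 PNG strings returned by `get_clip`, and the helper name `tool_images_to_messages` is illustrative, not from the repo.

```python
# Sketch of the image-bearing-tool-result pattern used in chat_agent_mm.
# Assumption: the target chat API rejects image parts in tool-role messages,
# so an image-producing tool call becomes a text stub plus a user message.

def tool_images_to_messages(tool_call_id: str, frames_b64: list[str]) -> list[dict]:
    """Return the two messages that stand in for an image-producing tool result."""
    stub = {
        "role": "tool",
        "tool_call_id": tool_call_id,
        # The actual pixels follow in the next (user-role) message.
        "content": "retrieved clip will be provided by the user after this message",
    }
    user_with_images = {
        "role": "user",
        "content": [
            {"type": "text", "text": "here is the clip retrieved by the tool"},
            *(
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
                for b64 in frames_b64
            ),
        ],
    }
    return [stub, user_with_images]
```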
utils.py CHANGED
@@ -5,6 +5,7 @@ import io
 import base64
 from PIL import Image
 from typing import List, Union
+import uuid
 import cv2
 
 def create_directory(directory):
@@ -114,28 +115,75 @@ def sample_from_video(video_path: str, sampling_rate=0.5) -> list[Image.Image]:
     Args:
         video_path (str): path to video
         sampling_rate (float): frames per second, how many frames to take from each second
+            e.g., 0.5 means take 1 frame every 2 seconds.
+            e.g., 2 means take 2 frames every 1 second.
 
     Returns:
         list[Image.Image]: a list of PIL images
     """
+    print(f"Attempting to open video: {video_path}")
     video = cv2.VideoCapture(video_path)
+
+    if not video.isOpened():
+        print(f"Error: Could not open video {video_path}")
+        return []
+
     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = video.get(cv2.CAP_PROP_FPS)
-    frames_to_skip = int(fps / sampling_rate)
-    curr_frame = 0
+
+    if fps == 0:  # Handle cases where FPS might not be readable or is zero
+        print(f"Error: Video FPS is {fps}. Cannot calculate sampling.")
+        video.release()
+        return []
+
+    if sampling_rate <= 0:
+        print(f"Error: sampling_rate ({sampling_rate}) must be positive.")
+        video.release()
+        return []
+
+    # Calculate the frame interval.
+    # If sampling_rate is 0.5 FPS (1 frame every 2s) and video is 30 FPS,
+    # interval = 30 / 0.5 = 60. So, take frame 0, 60, 120...
+    # If sampling_rate is 2 FPS (2 frames every 1s) and video is 30 FPS,
+    # interval = 30 / 2 = 15. So, take frame 0, 15, 30...
+    frame_interval = round(fps / sampling_rate)
+    # Ensure we always advance at least one frame to avoid infinite loops if fps/sampling_rate is too small
+    frame_interval = max(1, int(frame_interval))
+
+    print(f"Video Info - Total Frames: {total_frames}, FPS: {fps:.2f}, Desired Sample Rate: {sampling_rate} fps")
+    print(f"Calculated frame interval: Take 1 frame every {frame_interval} original frames.")
+
+    current_frame_pos = 0
     images = []
 
-    while curr_frame < total_frames:
-        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
-        success, frame = video.read()
+    while current_frame_pos < total_frames:
+        video.set(cv2.CAP_PROP_POS_FRAMES, current_frame_pos)
+        success, frame_bgr = video.read()  # frame_bgr is a NumPy array in BGR format
+
         if not success:
+            # This might happen if we try to seek beyond the last valid frame
+            # or if there's a read error.
+            print(f"Warning: Failed to read frame at position {current_frame_pos}. Ending capture.")
             break
-        _, buffer = cv2.imencode(".jpg", frame)
-        images.append(Image.fromarray(cv2.cvtColor(buffer, cv2.COLOR_BGR2RGB)))
-        curr_frame += frames_to_skip
 
-    video.release()
+        # Convert the BGR frame to RGB for PIL
+        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+
+        # Create a PIL Image from the RGB NumPy array
+        image = Image.fromarray(frame_rgb)
+
+        # If you want to display/save for debugging:
+        # image.show(title=f"Frame {current_frame_pos}")  # Displays the image
+        # image.save(f"debug_frame_{current_frame_pos}.png")  # Saves the image
+
+        images.append(image)
+        # print(f"Captured frame {current_frame_pos}")
+
+        current_frame_pos += frame_interval
+
+    video.release()
+    print(f"Successfully sampled {len(images)} images.")
    return images
 
 def convert_base64_to_image(base64_image: str) -> Image.Image:
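
The rewrite fixes two bugs in the old sampler: `Image.fromarray` was fed the JPEG-encoded `buffer` from `cv2.imencode` rather than the decoded frame, and `int(fps / sampling_rate)` truncates to 0 whenever `sampling_rate > fps`, so `curr_frame += frames_to_skip` could loop forever. A hypothetical usage sketch of the fixed function follows; the file path and clip numbers are made up for illustration.

```python
# Hypothetical usage of the fixed sampler; "clips/demo.mp4" is a made-up path.
# For a 30 fps, 600-frame (20 s) clip:
#   sampling_rate=0.5 -> interval = round(30 / 0.5) = 60 -> frames 0, 60, ..., 540 (10 images)
#   sampling_rate=2.0 -> interval = round(30 / 2)   = 15 -> frames 0, 15, ..., 585 (40 images)
from utils import sample_from_video

frames = sample_from_video("clips/demo.mp4", sampling_rate=0.5)
print(f"sampled {len(frames)} PIL images")
```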