Spaces:

qubvel-hf
/

vjepa2-streaming-video-classification

Running on L4

App Files Files Community

qubvel-hf HF Staff committed on Jun 16

Commit

79f197e

1 Parent(s): bc21b9d

Add comments and docstrings

Browse files

Files changed (1) hide show

app.py +61 -4

app.py CHANGED Viewed

@@ -1,3 +1,14 @@
 import cv2
 import time
 import torch
@@ -8,13 +19,24 @@ from fastrtc import Stream, VideoStreamHandler, AdditionalOutputs
 from transformers import VJEPA2ForVideoClassification, AutoVideoProcessor
-CHECKPOINT = "qubvel-hf/vjepa2-vitl-fpc16-256-ssv2"
-TORCH_DTYPE = torch.float16
-TORCH_DEVICE = "cuda"
-UPDATE_EVERY_N_FRAMES = 64
 def add_text_on_image(image, text):
     # Add a black background to the text
     image[:70] = 0
@@ -56,6 +78,17 @@ def add_text_on_image(image, text):
 class RunningFramesCache:
     def __init__(self, save_every_k_frame: int = 1, max_frames: int = 16):
         self.save_every_k_frame = save_every_k_frame
         self.max_frames = max_frames
@@ -74,6 +107,16 @@ class RunningFramesCache:
 class RunningResult:
     def __init__(self, max_predictions: int = 4):
         self.predictions = []
         self.max_predictions = max_predictions
@@ -100,6 +143,19 @@ class RunningResult:
 class FrameProcessingCallback:
     def __init__(self):
         # Loading model and processor
         self.model = VJEPA2ForVideoClassification.from_pretrained(CHECKPOINT, torch_dtype=torch.bfloat16)
@@ -146,6 +202,7 @@ class FrameProcessingCallback:
         return image, AdditionalOutputs(formatted_predictions)
 stream = Stream(
     handler=VideoStreamHandler(FrameProcessingCallback(), skip_frames=True),
     modality="video",

+"""
+Real-time video classification using the V-JEPA 2 model with streaming capabilities.
+This module implements a real-time video classification system that:
+1. Captures video frames from a webcam
+2. Processes batches of frames using the V-JEPA 2 model
+3. Displays predictions overlaid on the video stream
+4. Maintains a history of recent predictions
+The system uses FastRTC for video streaming and Gradio for the web interface.
+"""
 import cv2
 import time
 import torch
 from transformers import VJEPA2ForVideoClassification, AutoVideoProcessor
+# Model configuration
+CHECKPOINT = "qubvel-hf/vjepa2-vitl-fpc16-256-ssv2"  # Pre-trained VJEPA2 model checkpoint
+TORCH_DTYPE = torch.float16  # Use half precision for faster inference
+TORCH_DEVICE = "cuda"  # Use GPU for inference
+UPDATE_EVERY_N_FRAMES = 64  # How often to update predictions (in frames)
 def add_text_on_image(image, text):
+    """
+    Overlays text on an image with a black background bar at the top.
+    Args:
+        image (np.ndarray): Input image to add text to
+        text (str): Text to overlay on the image
+    Returns:
+        np.ndarray: Image with text overlaid
+    """
     # Add a black background to the text
     image[:70] = 0
 class RunningFramesCache:
+    """
+    Maintains a rolling buffer of video frames for model input.
+    This class manages a fixed-size queue of frames, keeping only the most recent
+    frames needed for model inference. It supports subsampling frames to reduce
+    memory usage and processing requirements.
+    Args:
+        save_every_k_frame (int): Only save every k-th frame (for subsampling)
+        max_frames (int): Maximum number of frames to keep in cache
+    """
     def __init__(self, save_every_k_frame: int = 1, max_frames: int = 16):
         self.save_every_k_frame = save_every_k_frame
         self.max_frames = max_frames
 class RunningResult:
+    """
+    Maintains a history of recent model predictions with timestamps.
+    This class keeps track of the most recent predictions made by the model,
+    including timestamps for each prediction. It provides formatted output
+    for display in the UI.
+    Args:
+        max_predictions (int): Maximum number of predictions to keep in history
+    """
     def __init__(self, max_predictions: int = 4):
         self.predictions = []
         self.max_predictions = max_predictions
 class FrameProcessingCallback:
+    """
+    Handles real-time video frame processing and model inference.
+    This class is responsible for:
+    1. Loading and managing the V-JEPA 2 model
+    2. Processing incoming video frames
+    3. Running model inference at regular intervals
+    4. Managing frame caching and prediction history
+    5. Formatting output for display
+    The callback is called for each frame from the video stream and handles
+    the coordination between frame capture, model inference, and result display.
+    """
     def __init__(self):
         # Loading model and processor
         self.model = VJEPA2ForVideoClassification.from_pretrained(CHECKPOINT, torch_dtype=torch.bfloat16)
         return image, AdditionalOutputs(formatted_predictions)
+# Initialize the video stream with processing callback
 stream = Stream(
     handler=VideoStreamHandler(FrameProcessingCallback(), skip_frames=True),
     modality="video",