"""Quick test script for video inference with NVIDIA Nemotron Nano VL model. |
|
|
|
|
|
Note: This script requires pre-extracted video frames. Use ffmpeg or similar tools |
|
|
to extract frames from your video first: |
|
|
ffmpeg -i video.mp4 -vf fps=1 frames/frame_%04d.jpg |
|
|
""" |
|
|
|
|
|
import argparse |
|
|
|
|
|
import torch |
|
|
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer |
|
|
|
|
|
import video_io |
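
# NOTE: `video_io` is a small local helper module (not shown in this file). From
# the calls below it is expected to provide roughly:
#
#   load_frames_from_directory(frames_dir)          -> list of decoded frames
#   frames_to_data_urls_with_metadata(frames, fps)  -> (data_urls, metadata)
#
# A minimal sketch of the frame loader, assuming PIL-style frames stored as
# sorted .jpg/.png files (an assumption for illustration, not the actual
# implementation):
#
#   from pathlib import Path
#   from PIL import Image
#
#   def load_frames_from_directory(frames_dir):
#       paths = sorted(
#           p for p in Path(frames_dir).iterdir()
#           if p.suffix.lower() in {".jpg", ".jpeg", ".png"}
#       )
#       return [Image.open(p).convert("RGB") for p in paths]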


def load_model(model_path: str, device: str = "cuda:0"):
    """Load the VLM model and processor.

    Args:
        model_path: Path to the pretrained model
        device: Device to load the model on

    Returns:
        Tuple of (model, tokenizer, processor)
    """
    print(f"Loading model from {model_path}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map=device,
        torch_dtype=torch.bfloat16,
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    print("Model loaded successfully!")
    return model, tokenizer, processor


def test_video_from_frames(
    model,
    tokenizer,
    processor,
    frames_dir: str,
    video_fps: int = 1,
    prompt_text: str = "Describe what you see.",
    device: str = "cuda:0",
    max_new_tokens: int = 128,
    video_pruning_rate: float = 0.75,
):
    """Test model inference on video frames from a directory.

    Args:
        model: The VLM model
        tokenizer: The tokenizer
        processor: The processor
        frames_dir: Directory containing extracted video frames
        video_fps: FPS used when extracting frames
        prompt_text: Text prompt for the model
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate
        video_pruning_rate: Video pruning rate for efficient inference
    """
    print(f"\nProcessing video frames from: {frames_dir}")

    # Load the extracted frames and derive data URLs plus video metadata at the
    # given FPS. Only the metadata is used below; the frames themselves are
    # passed to the processor directly.
    frames = video_io.load_frames_from_directory(frames_dir)
    image_urls, metadata = video_io.frames_to_data_urls_with_metadata(frames, video_fps)

    print(f"Loaded {len(frames)} frames")
    print(f"Metadata: {metadata}")

    # Build the chat messages: a system turn that disables thinking mode
    # ("/no_think") and a user turn with a video placeholder plus the text prompt.
    messages = [
        {"role": "system", "content": "/no_think"},
        {
            "role": "user",
            "content": [
                {"type": "video", "video": ""},
                {"type": "text", "text": f"\n{prompt_text}"},
            ],
        },
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Pass the frame metadata through to the processor when it is available so the
    # model can use timing information; otherwise fall back to the frames alone.
    if metadata:
        inputs = processor(
            text=[prompt],
            videos=frames,
            videos_kwargs={'video_metadata': metadata},
            return_tensors="pt",
        )
    else:
        inputs = processor(
            text=[prompt],
            videos=frames,
            return_tensors="pt",
        )
    inputs = inputs.to(device)

    # Set the video token pruning rate used by the model for efficient video inference.
    model.video_pruning_rate = video_pruning_rate

    generated_ids = model.generate(
        pixel_values_videos=inputs.pixel_values_videos,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
    )

    # Decode the full generated sequence (special tokens included) for inspection.
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )[0]

    print(f"Output: {output_text}\n")


def main():
    parser = argparse.ArgumentParser(
        description="Test video inference with VLM model using pre-extracted frames",
        epilog="Example: Extract frames with ffmpeg first: "
               "ffmpeg -i video.mp4 -vf fps=1 frames/frame_%%04d.jpg"
    )
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Path to the pretrained model"
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda:0",
        help="Device to run inference on (e.g., cuda:0, cpu)"
    )
    parser.add_argument(
        "--frames_dir",
        type=str,
        default="images/demo_frames",
        help="Directory containing extracted video frames"
    )
    parser.add_argument(
        "--video_fps",
        type=int,
        default=1,
        help="FPS used when extracting frames (for temporal understanding)"
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="Describe what you see.",
        help="Text prompt for the model"
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=128,
        help="Maximum number of tokens to generate"
    )
    parser.add_argument(
        "--video_pruning_rate",
        type=float,
        default=0.75,
        help="Video pruning rate for efficient inference (0.0-1.0)"
    )
    args = parser.parse_args()

    model, tokenizer, processor = load_model(args.model_path, args.device)

    print("=" * 50)
    print("Testing Video Inference from Frames")
    print("=" * 50)

    test_video_from_frames(
        model, tokenizer, processor,
        frames_dir=args.frames_dir,
        video_fps=args.video_fps,
        prompt_text=args.prompt,
        device=args.device,
        max_new_tokens=args.max_new_tokens,
        video_pruning_rate=args.video_pruning_rate,
    )
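

# Example invocation (a sketch; the script filename and the model path are
# placeholders, not values defined in this file; images/demo_frames is the
# default frames directory above):
#
#   python test_video_inference.py \
#       --model_path /path/to/nemotron-nano-vl-checkpoint \
#       --frames_dir images/demo_frames \
#       --video_fps 1 \
#       --prompt "Describe what you see."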


if __name__ == "__main__":
    main()