# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Quick test script for video inference with NVIDIA Nemotron Nano VL model.
Note: This script requires pre-extracted video frames. Use ffmpeg or similar tools
to extract frames from your video first:
ffmpeg -i video.mp4 -vf fps=1 frames/frame_%04d.jpg
"""
import argparse

import torch
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer

import video_io
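

# NOTE: The frame-extraction step is normally done outside this script with the
# ffmpeg command shown in the module docstring. The helper below is an optional,
# hypothetical sketch of how that same command could be driven from Python; it is
# not called anywhere else in this script, and its name and signature are
# illustrative assumptions rather than part of the model's API.
def extract_frames_with_ffmpeg(video_path: str, output_dir: str, fps: int = 1) -> None:
    """Extract frames from ``video_path`` into ``output_dir`` via the ffmpeg CLI (sketch)."""
    import pathlib
    import subprocess

    # Make sure the output directory exists before ffmpeg writes into it.
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    subprocess.run(
        [
            "ffmpeg",
            "-i", video_path,
            "-vf", f"fps={fps}",
            f"{output_dir}/frame_%04d.jpg",
        ],
        check=True,
    )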


def load_model(model_path: str, device: str = "cuda:0"):
    """Load the VLM model and processor.

    Args:
        model_path: Path to the pretrained model
        device: Device to load the model on

    Returns:
        Tuple of (model, tokenizer, processor)
    """
    print(f"Loading model from {model_path}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map=device,
        torch_dtype=torch.bfloat16,
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    print("Model loaded successfully!")
    return model, tokenizer, processor


def test_video_from_frames(
    model,
    tokenizer,
    processor,
    frames_dir: str,
    video_fps: int = 1,
    prompt_text: str = "Describe what you see.",
    device: str = "cuda:0",
    max_new_tokens: int = 128,
    video_pruning_rate: float = 0.75,
):
    """Test model inference on video frames from a directory.

    Args:
        model: The VLM model
        tokenizer: The tokenizer
        processor: The processor
        frames_dir: Directory containing extracted video frames
        video_fps: FPS used when extracting frames
        prompt_text: Text prompt for the model
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate
        video_pruning_rate: Video pruning rate for efficient inference
    """
    print(f"\nProcessing video frames from: {frames_dir}")

    # Load frames from the directory
    frames = video_io.load_frames_from_directory(frames_dir)

    # Get data URLs and metadata
    image_urls, metadata = video_io.frames_to_data_urls_with_metadata(frames, video_fps)
    print(f"Loaded {len(frames)} frames")
    print(f"Metadata: {metadata}")

    # Prepare messages
    messages = [
        {"role": "system", "content": "/no_think"},
        {
            "role": "user",
            "content": [
                {"type": "video", "video": ""},
                {"type": "text", "text": f"\n{prompt_text}"},
            ],
        },
    ]

    # Generate the prompt from the chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Process with FPS metadata when available
    if metadata:
        inputs = processor(
            text=[prompt],
            videos=frames,
            videos_kwargs={"video_metadata": metadata},
            return_tensors="pt",
        )
    else:
        inputs = processor(
            text=[prompt],
            videos=frames,
            return_tensors="pt",
        )
    inputs = inputs.to(device)

    # Set video pruning rate for efficient inference
    model.video_pruning_rate = video_pruning_rate

    # Generate output
    generated_ids = model.generate(
        pixel_values_videos=inputs.pixel_values_videos,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
    )

    # Decode the output
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )[0]
    print(f"Output: {output_text}\n")


def main():
    parser = argparse.ArgumentParser(
        description="Test video inference with VLM model using pre-extracted frames",
        epilog="Example: Extract frames with ffmpeg first: "
               "ffmpeg -i video.mp4 -vf fps=1 frames/frame_%%04d.jpg",
    )
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Path to the pretrained model",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda:0",
        help="Device to run inference on (e.g., cuda:0, cpu)",
    )
    parser.add_argument(
        "--frames_dir",
        type=str,
        default="images/demo_frames",
        help="Directory containing extracted video frames",
    )
    parser.add_argument(
        "--video_fps",
        type=int,
        default=1,
        help="FPS used when extracting frames (for temporal understanding)",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="Describe what you see.",
        help="Text prompt for the model",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=128,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--video_pruning_rate",
        type=float,
        default=0.75,
        help="Video pruning rate for efficient inference (0.0-1.0)",
    )
    args = parser.parse_args()

    # Load the model
    model, tokenizer, processor = load_model(args.model_path, args.device)

    # Test video inference from frames
    print("=" * 50)
    print("Testing Video Inference from Frames")
    print("=" * 50)
    test_video_from_frames(
        model, tokenizer, processor,
        frames_dir=args.frames_dir,
        video_fps=args.video_fps,
        prompt_text=args.prompt,
        device=args.device,
        max_new_tokens=args.max_new_tokens,
        video_pruning_rate=args.video_pruning_rate,
    )


if __name__ == "__main__":
    main()
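

# Example invocation (the script filename and model path below are placeholders,
# not values defined by this repository; adjust them to your setup):
#
#   python test_video_inference.py \
#       --model_path /path/to/nemotron-nano-vl \
#       --frames_dir frames \
#       --video_fps 1 \
#       --prompt "Describe what you see." \
#       --max_new_tokens 128 \
#       --video_pruning_rate 0.75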