# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Quick test script for video inference with NVIDIA Nemotron Nano VL model.
Note: This script requires pre-extracted video frames. Use ffmpeg or similar tools
to extract frames from your video first:
ffmpeg -i video.mp4 -vf fps=1 frames/frame_%04d.jpg
"""
import argparse

import torch
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer

import video_io
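

# NOTE: The frame-extraction step is normally done outside this script with the
# ffmpeg command shown in the module docstring. The helper below is an optional,
# hypothetical sketch of how that same command could be driven from Python; it is
# not called anywhere else in this script, and its name and signature are
# illustrative assumptions rather than part of the model's API.
def extract_frames_with_ffmpeg(video_path: str, output_dir: str, fps: int = 1) -> None:
    """Extract frames from ``video_path`` into ``output_dir`` via the ffmpeg CLI (sketch)."""
    import pathlib
    import subprocess

    # Make sure the output directory exists before ffmpeg writes into it.
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    subprocess.run(
        [
            "ffmpeg",
            "-i", video_path,
            "-vf", f"fps={fps}",
            f"{output_dir}/frame_%04d.jpg",
        ],
        check=True,
    )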


def load_model(model_path: str, device: str = "cuda:0"):
    """Load the VLM model and processor.

    Args:
        model_path: Path to the pretrained model
        device: Device to load the model on

    Returns:
        Tuple of (model, tokenizer, processor)
    """
    print(f"Loading model from {model_path}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map=device,
        torch_dtype=torch.bfloat16,
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    print("Model loaded successfully!")
    return model, tokenizer, processor


def test_video_from_frames(
    model,
    tokenizer,
    processor,
    frames_dir: str,
    video_fps: int = 1,
    prompt_text: str = "Describe what you see.",
    device: str = "cuda:0",
    max_new_tokens: int = 128,
    video_pruning_rate: float = 0.75,
):
    """Test model inference on video frames from a directory.

    Args:
        model: The VLM model
        tokenizer: The tokenizer
        processor: The processor
        frames_dir: Directory containing extracted video frames
        video_fps: FPS used when extracting frames
        prompt_text: Text prompt for the model
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate
        video_pruning_rate: Video pruning rate for efficient inference
    """
    print(f"\nProcessing video frames from: {frames_dir}")

    # Load frames from the directory
    frames = video_io.load_frames_from_directory(frames_dir)

    # Get data URLs and metadata
    image_urls, metadata = video_io.frames_to_data_urls_with_metadata(frames, video_fps)
    print(f"Loaded {len(frames)} frames")
    print(f"Metadata: {metadata}")

    # Prepare messages
    messages = [
        {"role": "system", "content": "/no_think"},
        {
            "role": "user",
            "content": [
                {"type": "video", "video": ""},
                {"type": "text", "text": f"\n{prompt_text}"},
            ],
        },
    ]

    # Generate the prompt from the chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Process with FPS metadata when available
    if metadata:
        inputs = processor(
            text=[prompt],
            videos=frames,
            videos_kwargs={"video_metadata": metadata},
            return_tensors="pt",
        )
    else:
        inputs = processor(
            text=[prompt],
            videos=frames,
            return_tensors="pt",
        )
    inputs = inputs.to(device)

    # Set video pruning rate for efficient inference
    model.video_pruning_rate = video_pruning_rate

    # Generate output
    generated_ids = model.generate(
        pixel_values_videos=inputs.pixel_values_videos,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
    )

    # Decode the output
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )[0]
    print(f"Output: {output_text}\n")


def main():
    parser = argparse.ArgumentParser(
        description="Test video inference with VLM model using pre-extracted frames",
        epilog="Example: Extract frames with ffmpeg first: "
               "ffmpeg -i video.mp4 -vf fps=1 frames/frame_%%04d.jpg",
    )
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Path to the pretrained model",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda:0",
        help="Device to run inference on (e.g., cuda:0, cpu)",
    )
    parser.add_argument(
        "--frames_dir",
        type=str,
        default="images/demo_frames",
        help="Directory containing extracted video frames",
    )
    parser.add_argument(
        "--video_fps",
        type=int,
        default=1,
        help="FPS used when extracting frames (for temporal understanding)",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="Describe what you see.",
        help="Text prompt for the model",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=128,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--video_pruning_rate",
        type=float,
        default=0.75,
        help="Video pruning rate for efficient inference (0.0-1.0)",
    )
    args = parser.parse_args()

    # Load the model
    model, tokenizer, processor = load_model(args.model_path, args.device)

    # Test video inference from frames
    print("=" * 50)
    print("Testing Video Inference from Frames")
    print("=" * 50)
    test_video_from_frames(
        model, tokenizer, processor,
        frames_dir=args.frames_dir,
        video_fps=args.video_fps,
        prompt_text=args.prompt,
        device=args.device,
        max_new_tokens=args.max_new_tokens,
        video_pruning_rate=args.video_pruning_rate,
    )


if __name__ == "__main__":
    main()
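

# Example invocation (the script filename and model path below are placeholders,
# not values defined by this repository; adjust them to your setup):
#
#   python test_video_inference.py \
#       --model_path /path/to/nemotron-nano-vl \
#       --frames_dir frames \
#       --video_fps 1 \
#       --prompt "Describe what you see." \
#       --max_new_tokens 128 \
#       --video_pruning_rate 0.75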