from typing import Optional, Union, List

import numpy as np

from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.video_utils import VideoInput


class NemotronNanoVLV2ImagesKwargs(ImagesKwargs):
    """Image keyword arguments for the Nemotron Nano VL V2 processor.

    Extends `ImagesKwargs` with five additional integer-or-`None` fields.
    """

    # NOTE(review): the meaning of each field is defined by whichever image
    # processor consumes these kwargs; that code is not visible here, so the
    # descriptions implied by the names should be confirmed against it.
    min_pixels: Optional[int]
    max_pixels: Optional[int]
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    merge_size: Optional[int]


class NemotronNanoVLV2ProcessorKwargs(ProcessingKwargs, total=False):
    """Per-modality keyword-argument groups for the Nemotron Nano VL V2 processor.

    Image kwargs use the model-specific `NemotronNanoVLV2ImagesKwargs`; video
    kwargs use the stock `VideosKwargs`. The class is declared with
    `total=False`.
    """

    images_kwargs: NemotronNanoVLV2ImagesKwargs
    videos_kwargs: VideosKwargs
    # Default values per kwargs group; the only default set here is that text
    # padding is disabled.
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
    }


class NemotronNanoVLV2Processor(ProcessorMixin):
    r"""
    Constructs a Nemotron Nano VL V2 processor which wraps an image processor and a tokenizer into a single processor.
    [`NemotronNanoVLV2Processor`] offers all the functionalities of the image processor and tokenizer. See the
    [`~NemotronNanoVLV2Processor.__call__`] and [`~NemotronNanoVLV2Processor.decode`] for more information.
    Args:
        image_processor ([`AutoImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`AutoTokenizer`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    video_processor_class = "AutoVideoProcessor"
    tokenizer_class = ("AutoTokenizer")

    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        self.image_token = "" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.video_token = "