Can't load the model: BAAI/Video-XL-2 does not appear to have a file named multimodal_encoder.builder.py.
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map=device,quantization_config=None,attn_implementation="flash_attention_2",torch_dtype=torch.bfloat16)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 543, in from_pretrained
    config, kwargs = AutoConfig.from_pretrained(
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/transformers/models/auto/configuration_auto.py", line 1179, in from_pretrained
    config_class = get_class_from_dynamic_module(
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 569, in get_class_from_dynamic_module
    final_module = get_cached_module_file(
    ^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 432, in get_cached_module_file
    get_cached_module_file(
  File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 432, in get_cached_module_file
    get_cached_module_file(
  File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 372, in get_cached_module_file
    resolved_module_file = cached_file(
    ^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/transformers/utils/hub.py", line 312, in cached_file
    file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/venv/lib/python3.12/site-packages/transformers/utils/hub.py", line 573, in cached_files
    raise EnvironmentError(
OSError: BAAI/Video-XL-2 does not appear to have a file named multimodal_encoder.builder.py. Checkout 'https://huggingface.co/BAAI/Video-XL-2/tree/main' for available files.
Thanks for your feedback; I'm fixing it as soon as possible.
Hi Mr. Zhang, we've fixed the problem. Please try again using the following steps:
- Update the inference code:
huggingface-cli download BAAI/Video-XL-2 --include "*.py" --local-dir /root/Models/Video-XL-2
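(If you prefer to stay in Python, huggingface_hub's snapshot_download can do the same thing; the pattern and target directory below simply mirror the CLI command above.)

from huggingface_hub import snapshot_download

# fetch only the updated Python module files, mirroring the CLI command above
snapshot_download(repo_id="BAAI/Video-XL-2", allow_patterns="*.py", local_dir="/root/Models/Video-XL-2")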
- Run the updated demo code:
1. Inference w/o Efficiency Optimization
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# load model
model_path = '/root/Models/Video-XL-2'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True)

# greedy decoding: with do_sample=False, temperature and top_p have no effect
gen_kwargs = {
    "do_sample": False,
    "temperature": 0.01,
    "top_p": 0.001,
    "num_beams": 1,
    "use_cache": True,
    "max_new_tokens": 256,
}
model.config.enable_sparse = False

# input data
video_path = "/asset/demo.mp4"
question1 = "How many people are in the video? (A) 3 people (B) 6 people. Please respond with only the letter."

# sampling params
max_num_frames = 150
sample_fps = 1      # extract frames at 1 fps
max_sample_fps = 4

with torch.inference_mode():
    response = model.chat(video_path, tokenizer, question1, chat_history=None, return_history=False, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)
print(response)
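Side note: the loader above passes quantization_config=None. If GPU memory is tight, a 4-bit BitsAndBytesConfig load is the usual transformers fallback; we haven't verified it against Video-XL-2's custom code, so treat this as a sketch:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

# model_path and device as defined in the snippet above;
# 4-bit load is an untested assumption for this model's remote code
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=bnb_config, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True)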
2. Inference w/ Chunk-based Pre-filling
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

torch.cuda.reset_peak_memory_stats()

# load model
model_path = '/root/Models/Video-XL-2'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True)

gen_kwargs = {"do_sample": False, "temperature": 0.01, "top_p": 0.001, "num_beams": 1, "use_cache": True, "max_new_tokens": 128}

# chunk-based pre-filling settings
model.config.enable_chunk_prefill = True
prefill_config = {
    'chunk_prefill_mode': 'streaming',
    'chunk_size': 4,
    'step_size': 1,
    'offload': True,
    'chunk_size_for_vision_tower': 24,
}
model.config.prefill_config = prefill_config

# input data
video_path = "/asset/demo.mp4"
question1 = "How many people are in the video? (A) 3 people (B) 6 people. Please respond with only the letter."

# sampling params
max_num_frames = 1300
sample_fps = None       # None = uniform sampling
max_sample_fps = None

with torch.inference_mode():
    response = model.chat(video_path, tokenizer, question1, chat_history=None, return_history=False, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)

peak_memory_allocated = torch.cuda.max_memory_allocated()
print(f"Memory Peak: {peak_memory_allocated / (1024**3):.2f} GB")
print(response)
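If you also want to know how long the chunked pre-fill plus generation takes, the same call can be timed with plain PyTorch/CUDA synchronization (nothing model-specific here):

import time

torch.cuda.synchronize()   # finish any pending GPU work before starting the clock
start = time.perf_counter()
with torch.inference_mode():
    response = model.chat(video_path, tokenizer, question1, chat_history=None, return_history=False, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)
torch.cuda.synchronize()   # wait for the GPU to finish before stopping the clock
print(f"Wall time: {time.perf_counter() - start:.1f} s")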
🤝
It works now. Thank you very much.