Can't load model: BAAI/Video-XL-2 does not appear to have a file named multimodal_encoder.builder.py

#4 opened by zhangsongbo365
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map=device,quantization_config=None,attn_implementation="flash_attention_2",torch_dtype=torch.bfloat16)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

File "/opt/venv/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 543, in from_pretrained
config, kwargs = AutoConfig.from_pretrained(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/venv/lib/python3.12/site-packages/transformers/models/auto/configuration_auto.py", line 1179, in from_pretrained
config_class = get_class_from_dynamic_module(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 569, in get_class_from_dynamic_module
final_module = get_cached_module_file(
^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 432, in get_cached_module_file
get_cached_module_file(
File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 432, in get_cached_module_file
get_cached_module_file(
File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 372, in get_cached_module_file
resolved_module_file = cached_file(
^^^^^^^^^^^^
File "/opt/venv/lib/python3.12/site-packages/transformers/utils/hub.py", line 312, in cached_file
file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/venv/lib/python3.12/site-packages/transformers/utils/hub.py", line 573, in cached_files
raise EnvironmentError(
OSError: BAAI/Video-XL-2 does not appear to have a file named multimodal_encoder.builder.py. Checkout 'https://huggingface.co/BAAI/Video-XL-2/tree/main' for available files.

Beijing Academy of Artificial Intelligence org


Thx for your feedback; I'm fixing it as soon as possible.
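For context: when loading with trust_remote_code=True, transformers follows each relative import in the repo's Python files and tries to download the referenced module from the Hub. An import along the lines of from .multimodal_encoder.builder import ... (inferred from the error message, not confirmed against the repo) makes the loader request a file literally named multimodal_encoder.builder.py, which the repo does not contain. Reorganizing the remote code so every relative import resolves to a file the Hub can serve fixes the error.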

Beijing Academy of Artificial Intelligence org

Hi Mr. Zhang, we've fixed the problem. Please try again using the following steps:

  1. Update the inference code (a Python alternative is sketched after these steps): huggingface-cli download BAAI/Video-XL-2 --include "*.py" --local-dir /root/Models/Video-XL-2
  2. Run the updated demo code:
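If you prefer to do step 1 from Python, huggingface_hub's snapshot_download (standard Hub API) can fetch just the Python files; a minimal sketch mirroring the CLI command above:

from huggingface_hub import snapshot_download

# download only the remote-code (*.py) files into the same local directory
snapshot_download(repo_id="BAAI/Video-XL-2", allow_patterns=["*.py"], local_dir="/root/Models/Video-XL-2")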

1. Inference without Efficiency Optimization

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# load model
model_path = '/root/Models/Video-XL-2'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True)

gen_kwargs = {
    "do_sample": False,
    "temperature": 0.01,
    "top_p": 0.001,
    "num_beams": 1,
    "use_cache": True,
    "max_new_tokens": 256
}

# plain decoding path: no sparse-attention optimization
model.config.enable_sparse = False

# input data
video_path = "/asset/demo.mp4"
question1 = "How many people are in the video? (A)3 people (B)6 people. Please respond with only the letter."

# sampling params
max_num_frames = 150
sample_fps = 1  # extract frames at 1 fps
max_sample_fps = 4

with torch.inference_mode():
    response = model.chat(video_path, tokenizer, question1, chat_history=None, return_history=False, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)

print(response)
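A note on the attention backend: the failing call used attn_implementation="flash_attention_2", while this demo uses "sdpa". If flash-attn is installed, switching back should work; a hedged sketch (assuming the updated remote code still accepts the same attn_implementation values):

import importlib.util

# use FlashAttention-2 only when the flash_attn package is actually installed
attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation=attn_impl, torch_dtype=torch.float16, low_cpu_mem_usage=True)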

2. Inference with Chunk-based Pre-filling

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

torch.cuda.reset_peak_memory_stats()

# load model
model_path = '/root/Models/Video-XL-2'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True)

gen_kwargs = {"do_sample": False, "temperature": 0.01, "top_p": 0.001, "num_beams": 1, "use_cache": True, "max_new_tokens": 128}

# enable chunk-based pre-filling and configure it
model.config.enable_chunk_prefill = True
prefill_config = {
    'chunk_prefill_mode': 'streaming',
    'chunk_size': 4,
    'step_size': 1,
    'offload': True,
    'chunk_size_for_vision_tower': 24,
}
model.config.prefill_config = prefill_config

# input data
video_path = "/asset/demo.mp4"
question1 = "How many people are in the video? (A)3 people (B)6 people. Please respond with only the letter."

# sampling params
max_num_frames = 1300
sample_fps = None  # None -> uniform sampling
max_sample_fps = None

with torch.inference_mode():
    response = model.chat(video_path, tokenizer, question1, chat_history=None, return_history=False, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)

peak_memory_allocated = torch.cuda.max_memory_allocated()
print(f"Memory Peak: {peak_memory_allocated / (1024**3):.2f} GB")
print(response)
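For multi-turn use, the chat_history and return_history parameters suggest the conversation state can be threaded through successive calls. A sketch under the assumption (not verified against the remote code) that model.chat returns a (response, history) pair when return_history=True:

with torch.inference_mode():
    # first turn: no prior history
    response, history = model.chat(video_path, tokenizer, "Describe the video.", chat_history=None, return_history=True, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)
    # follow-up turn: pass the returned history back in
    response, history = model.chat(video_path, tokenizer, "How many people appear?", chat_history=history, return_history=True, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)
print(response)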
Beijing Academy of Artificial Intelligence org

🤝

It works now. Thank you very much.
