Can't load model: BAAI/Video-XL-2 does not appear to have a file named multimodal_encoder.builder.py

#4 opened by zhangsongbo365
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map=device,quantization_config=None,attn_implementation="flash_attention_2",torch_dtype=torch.bfloat16)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

File "/opt/venv/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 543, in from_pretrained
config, kwargs = AutoConfig.from_pretrained(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/venv/lib/python3.12/site-packages/transformers/models/auto/configuration_auto.py", line 1179, in from_pretrained
config_class = get_class_from_dynamic_module(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 569, in get_class_from_dynamic_module
final_module = get_cached_module_file(
^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 432, in get_cached_module_file
get_cached_module_file(
File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 432, in get_cached_module_file
get_cached_module_file(
File "/opt/venv/lib/python3.12/site-packages/transformers/dynamic_module_utils.py", line 372, in get_cached_module_file
resolved_module_file = cached_file(
^^^^^^^^^^^^
File "/opt/venv/lib/python3.12/site-packages/transformers/utils/hub.py", line 312, in cached_file
file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/venv/lib/python3.12/site-packages/transformers/utils/hub.py", line 573, in cached_files
raise EnvironmentError(
OSError: BAAI/Video-XL-2 does not appear to have a file named multimodal_encoder.builder.py. Checkout 'https://huggingface.co/BAAI/Video-XL-2/tree/main' for available files.

Beijing Academy of Artificial Intelligence org


Thx for your feedback; I'm fixing it as soon as possible.
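For context: when loading with trust_remote_code=True, transformers follows each relative import in the repo's Python files and tries to download the referenced module from the Hub. An import along the lines of from .multimodal_encoder.builder import ... (inferred from the error message, not confirmed against the repo) makes the loader request a file literally named multimodal_encoder.builder.py, which the repo does not contain. Reorganizing the remote code so every relative import resolves to a file the Hub can serve fixes the error.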

Beijing Academy of Artificial Intelligence org

Hi Mr. Zhang, we've fixed the problem. Please try again using the following steps:

  1. Update the inference code (a Python alternative is sketched after these steps): huggingface-cli download BAAI/Video-XL-2 --include "*.py" --local-dir /root/Models/Video-XL-2
  2. Run the updated demo code:
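If you prefer to do step 1 from Python, huggingface_hub's snapshot_download (standard Hub API) can fetch just the Python files; a minimal sketch mirroring the CLI command above:

from huggingface_hub import snapshot_download

# download only the remote-code (*.py) files into the same local directory
snapshot_download(repo_id="BAAI/Video-XL-2", allow_patterns=["*.py"], local_dir="/root/Models/Video-XL-2")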

1. Inference without Efficiency Optimization

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# load model
model_path = '/root/Models/Video-XL-2'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True)

gen_kwargs = {
    "do_sample": False,
    "temperature": 0.01,
    "top_p": 0.001,
    "num_beams": 1,
    "use_cache": True,
    "max_new_tokens": 256
}

# plain decoding path: no sparse-attention optimization
model.config.enable_sparse = False

# input data
video_path = "/asset/demo.mp4"
question1 = "How many people are in the video? (A)3 people (B)6 people. Please respond with only the letter."

# sampling params
max_num_frames = 150
sample_fps = 1  # extract frames at 1 fps
max_sample_fps = 4

with torch.inference_mode():
    response = model.chat(video_path, tokenizer, question1, chat_history=None, return_history=False, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)

print(response)
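A note on the attention backend: the failing call used attn_implementation="flash_attention_2", while this demo uses "sdpa". If flash-attn is installed, switching back should work; a hedged sketch (assuming the updated remote code still accepts the same attn_implementation values):

import importlib.util

# use FlashAttention-2 only when the flash_attn package is actually installed
attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation=attn_impl, torch_dtype=torch.float16, low_cpu_mem_usage=True)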

2. Inference with Chunk-based Pre-filling

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

torch.cuda.reset_peak_memory_stats()

# load model
model_path = '/root/Models/Video-XL-2'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, quantization_config=None, attn_implementation="sdpa", torch_dtype=torch.float16, low_cpu_mem_usage=True)

gen_kwargs = {"do_sample": False, "temperature": 0.01, "top_p": 0.001, "num_beams": 1, "use_cache": True, "max_new_tokens": 128}

# enable chunk-based pre-filling and configure it
model.config.enable_chunk_prefill = True
prefill_config = {
    'chunk_prefill_mode': 'streaming',
    'chunk_size': 4,
    'step_size': 1,
    'offload': True,
    'chunk_size_for_vision_tower': 24,
}
model.config.prefill_config = prefill_config

# input data
video_path = "/asset/demo.mp4"
question1 = "How many people are in the video? (A)3 people (B)6 people. Please respond with only the letter."

# sampling params
max_num_frames = 1300
sample_fps = None  # None -> uniform sampling
max_sample_fps = None

with torch.inference_mode():
    response = model.chat(video_path, tokenizer, question1, chat_history=None, return_history=False, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)

peak_memory_allocated = torch.cuda.max_memory_allocated()
print(f"Memory Peak: {peak_memory_allocated / (1024**3):.2f} GB")
print(response)
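For multi-turn use, the chat_history and return_history parameters suggest the conversation state can be threaded through successive calls. A sketch under the assumption (not verified against the remote code) that model.chat returns a (response, history) pair when return_history=True:

with torch.inference_mode():
    # first turn: no prior history
    response, history = model.chat(video_path, tokenizer, "Describe the video.", chat_history=None, return_history=True, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)
    # follow-up turn: pass the returned history back in
    response, history = model.chat(video_path, tokenizer, "How many people appear?", chat_history=history, return_history=True, max_num_frames=max_num_frames, sample_fps=sample_fps, max_sample_fps=max_sample_fps, generation_config=gen_kwargs)
print(response)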
Beijing Academy of Artificial Intelligence org

🤝

It works now. Thank you very much.
