import gradio as gr
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from threading import Thread
from qwen_vl_utils import process_vision_info
import torch

# Check if a GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

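# Model ID on the Hugging Face Hub (a local checkpoint directory also works)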
local_path = "Fancy-MLLM/R1-OneVision-7B"

# Load the model on the appropriate device (GPU if available, otherwise CPU)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    local_path, torch_dtype="auto", device_map=device
)
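# The AutoProcessor bundles the tokenizer and the image preprocessor for Qwen2.5-VL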
processor = AutoProcessor.from_pretrained(local_path)

def generate_output(image, text):
    # Prepare input data
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image, 'min_pixels': 1003520, 'max_pixels': 12845056},
                {"type": "text", "text": text},
            ],
        }
    ]
    
    # Render the chat template into a prompt string and collect the image/video inputs
    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    
    # Move inputs to the same device as the model
    inputs = inputs.to(model.device)

    # The streamer only needs the text tokenizer, not the full multimodal processor
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
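    # Near-greedy decoding: top_k=1 with near-zero top_p and temperature makes the
    # output effectively deterministic, which suits a step-by-step reasoning demo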
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=4096,
        top_p=0.001,
        top_k=1,
        temperature=0.01,
        repetition_penalty=1.0,
    )
    
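    # Run model.generate in a background thread so tokens can be yielded as they stream in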
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    generated_text = ''
    
    try:
        for new_text in streamer:
            generated_text += new_text
            yield f"β€Ž{generated_text}"
    except Exception as e:
        print(f"Error: {e}")
        yield f"Error occurred: {str(e)}"

CSS = """
#output-markdown {
    overflow-y: auto;
    white-space: pre-wrap; 
    word-wrap: break-word;
}
#output-markdown .math {
    overflow-x: auto;
    max-width: 100%;
}
.markdown-text {
    white-space: pre-wrap;
    word-wrap: break-word;
}
.markdown-output {
    min-height: 20vh;
    max-width: 100%;
    overflow-y: auto;
}
#qwen-md .katex-display { display: inline; }
#qwen-md .katex-display>.katex { display: inline; }
#qwen-md .katex-display>.katex>.katex-html { display: inline; }
"""

with gr.Blocks(css=CSS) as demo:
    gr.HTML("""<center><font size=8>πŸ¦– R1-OneVision Demo</font></center>""")

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Upload")  # handled as a PIL image
            input_text = gr.Textbox(label="Input your question")
            with gr.Row():
                clear_btn = gr.ClearButton([input_image, input_text])
                submit_btn = gr.Button("Submit", variant="primary")

        with gr.Column():
            output_text = gr.Markdown(elem_id="qwen-md", container=True, elem_classes="markdown-output")

    submit_btn.click(fn=generate_output, inputs=[input_image, input_text], outputs=output_text)

demo.launch(share=True)
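# share=True requests a temporary public gradio.live link; remove it to serve locally only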