Spaces:

LanguageBind
/

UniWorld-V1

Runtime error

File size: 33,076 Bytes

import spaces
import os
import gradio as gr
import sys
sys.path.append("..")
from transformers import AutoProcessor, SiglipImageProcessor, SiglipVisionModel, T5EncoderModel, BitsAndBytesConfig
from univa.models.qwen2p5vl.modeling_univa_qwen2p5vl import UnivaQwen2p5VLForConditionalGeneration
from univa.utils.flux_pipeline import FluxPipeline
from univa.utils.get_ocr import get_ocr_result
from univa.utils.denoiser_prompt_embedding_flux import encode_prompt
from qwen_vl_utils import process_vision_info
from univa.utils.anyres_util import dynamic_resize, concat_images_adaptive
import torch
from torch import nn
import uuid
import base64
from typing import Dict
from PIL import Image, ImageDraw, ImageFont
import argparse
import gc


def parse_args():
    """Parse the demo's command-line options from ``sys.argv``.

    Returns:
        argparse.Namespace with ``model_path``, ``flux_path``, ``siglip_path``,
        ``server_name``, ``server_port`` and the boolean switches ``share``,
        ``nf4``, ``zh`` and ``offload``.
    """
    parser = argparse.ArgumentParser(description="Model and component paths")

    # (flag, type, default, help) for every option that takes a value.
    valued_options = (
        ("--model_path", str, "LanguageBind/UniWorld-V1", "UniWorld-V1模型路径"),
        ("--flux_path", str, "black-forest-labs/FLUX.1-dev", "FLUX.1-dev模型路径"),
        ("--siglip_path", str, "google/siglip2-so400m-patch16-512", "siglip2模型路径"),
        ("--server_name", str, "127.0.0.1", "IP地址"),
        ("--server_port", int, 6812, "端口号"),
    )
    for flag, value_type, default_value, help_text in valued_options:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)

    # (flag, help) for every boolean switch; all of them default to False.
    switches = (
        ("--share", "是否公开分享"),
        ("--nf4", "是否NF4量化"),
        ("--zh", "是否使用中文"),
        ("--offload", "是否开启顺序卸载"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    return parser.parse_args()


def add_plain_text_watermark(
    img: Image.Image,
    text: str,
    margin: int = 50, 
    font_size: int = 30, 
):
    """Draw ``text`` in white near the bottom-right corner of ``img``.

    Args:
        img: Image to stamp. A non-RGB image is converted first (which yields
            a new image); an RGB image is drawn on in place.
        text: Watermark text.
        margin: Distance from the bottom edge in pixels; the right offset is
            ``3.3 * margin``.
        font_size: Point size requested for the DejaVuSans font.

    Returns:
        The RGB image carrying the watermark.
    """
    if img.mode != "RGB":
        img = img.convert("RGB")

    draw = ImageDraw.Draw(img)
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", font_size)
    except OSError:
        # The TTF file is not available on this machine: fall back to
        # Pillow's built-in font instead of failing the whole request.
        font = ImageFont.load_default()

    # Measure with the SAME font that is used for drawing; measuring with the
    # default font under-estimates the text size and misplaces the watermark.
    bbox = draw.textbbox((0, 0), text, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]

    x = img.width - text_width - int(3.3 * margin)
    y = img.height - text_height - margin

    draw.text((x, y), text, font=font, fill=(255, 255, 255))
    return img


# Custom CSS passed to gr.Blocks(css=...). Every rule targets cells/headers of
# tables inside ".table-wrap": it caps their width or height and makes long
# content wrap or scroll.
# NOTE(review): the inline CSS comment on max-height mentions 100px while the
# rule actually sets 150px.
css = """
.table-wrap table tr td:nth-child(3) > div {
    max-height: 150px;       /* 最多 100px 高度，按需修改 */
    overflow-y: auto;        /* 超出部分显示竖向滚动条 */
    white-space: pre-wrap;   /* 自动换行 */
    word-break: break-all;   /* 长单词内部分行 */
}
.table-wrap table tr td:nth-child(2) > div {
    max-width: 150px;
    white-space: pre-wrap;
    word-break: break-all;
    overflow-x: auto;
}
.table-wrap table tr th:nth-child(2) {
    max-width: 150px;
    white-space: normal;
    word-break: keep-all;
    overflow-x: auto;
}
.table-wrap table tr td:nth-last-child(-n+8) > div {
    max-width: 130px;
    white-space: pre-wrap;
    word-break: break-all;
    overflow-x: auto;
}
.table-wrap table tr th:nth-last-child(-n+8) {
    max-width: 130px;
    white-space: normal;
    word-break: keep-all;
    overflow-x: auto;
}
"""


def img2b64(image_path):
    """Read an image file and return it as a base64 ``data:`` URI.

    The MIME type is derived from the file extension (the app stores its
    generated images as ``.png``). When the extension is unknown or does not
    map to an image type, ``image/jpeg`` is used, as before.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.
    """
    import mimetypes  # local import keeps this block self-contained

    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    return f"data:{mime_type};base64,{b64}"

def initialize_models(args):
    """Load every model component used by the demo.

    Nothing is moved to an accelerator here; the function only instantiates
    the components and creates the ``tmp`` output directory.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``flux_path``, ``siglip_path``, ``nf4`` and ``offload``.

    Returns:
        dict with the keys ``model``, ``task_head``, ``processor``, ``pipe``,
        ``tokenizers``, ``text_encoders``, ``siglip_processor`` and
        ``siglip_model``.
    """
    os.makedirs("tmp", exist_ok=True)

    # The 4-bit NF4 config is only needed (and only built) with --nf4.
    quantization_config = None
    if args.nf4:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
        )

    # Main vision-language model.
    model = UnivaQwen2p5VLForConditionalGeneration.from_pretrained(
        args.model_path,
        torch_dtype=torch.float16,
        attn_implementation="sdpa",
        quantization_config=quantization_config,
    )

    # Two-way task head on top of the model's hidden states.
    # NOTE: no checkpoint is loaded for this head (the original load call was
    # commented out), so its weights are randomly initialised.
    task_head = nn.Sequential(
        nn.Linear(3584, 10240),
        nn.SiLU(),
        nn.Dropout(0.3),
        nn.Linear(10240, 2)
    )
    task_head.eval()

    processor = AutoProcessor.from_pretrained(
        args.model_path,
        min_pixels=448*448,
        max_pixels=448*448,
    )

    # Use .get(): a missing HF_TOKEN must not crash start-up with KeyError.
    # token=None lets the hub client fall back to its own credential lookup.
    pipe_kwargs = {
        "transformer": model.denoise_tower.denoiser,
        "torch_dtype": torch.float16,
        "token": os.environ.get("HF_TOKEN"),
    }
    if args.nf4:
        # With --nf4 the T5 text encoder is loaded separately so that it can
        # be quantized as well.
        pipe_kwargs["text_encoder_2"] = T5EncoderModel.from_pretrained(
            args.flux_path,
            subfolder="text_encoder_2",
            quantization_config=quantization_config,
            torch_dtype=torch.float16,
        )
    pipe = FluxPipeline.from_pretrained(args.flux_path, **pipe_kwargs)

    if args.offload:
        pipe.enable_model_cpu_offload()
        pipe.enable_vae_slicing()
    tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
    text_encoders = [pipe.text_encoder, pipe.text_encoder_2]

    # SigLIP image encoder and its preprocessor.
    siglip_processor = SiglipImageProcessor.from_pretrained(args.siglip_path)
    siglip_model = SiglipVisionModel.from_pretrained(
        args.siglip_path,
        torch_dtype=torch.float16,
    )

    return {
        'model': model,
        'task_head': task_head,
        'processor': processor,
        'pipe': pipe,
        'tokenizers': tokenizers,
        'text_encoders': text_encoders,
        'siglip_processor': siglip_processor,
        'siglip_model': siglip_model,
    }

@spaces.GPU(duration=600)
def to_device(state):
    """Move all model components in ``state`` to the compute device as bfloat16.

    CUDA is used when available, otherwise the CPU.

    Args:
        state: dict holding at least ``model``, ``task_head``, ``pipe``,
            ``text_encoders`` (a list of modules) and ``siglip_model``.

    Returns:
        The same dict, updated in place, with the chosen ``torch.device``
        stored under ``state['device']``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    state['model'] = state['model'].to(device, dtype=torch.bfloat16)
    state['task_head'] = state['task_head'].to(device, dtype=torch.bfloat16)
    state['pipe'] = state['pipe'].to(device, dtype=torch.bfloat16)
    # 'text_encoders' is a plain Python list, which has no .to(); each encoder
    # has to be moved individually.
    state['text_encoders'] = [
        encoder.to(device, dtype=torch.bfloat16) for encoder in state['text_encoders']
    ]
    state['siglip_model'] = state['siglip_model'].to(device, dtype=torch.bfloat16)
    state['device'] = device
    return state

# Module-level start-up, executed on import as well as when run as a script:
# parse the command line, load all models, then move them to the device.
# `args` and `state` are globals read by the handlers and launch code below.
args = parse_args()
state = initialize_models(args)
state = to_device(state)

@spaces.GPU
def process_large_image(raw_img):
    """Downscale an image file whose longest side exceeds 1024 pixels.

    Args:
        raw_img: Path of the image file, or ``None``.

    Returns:
        ``raw_img`` unchanged when it is ``None`` or already small enough;
        otherwise the path of a resized PNG copy written under ``tmp/``.
    """
    if raw_img is None:
        return raw_img

    picture = Image.open(raw_img).convert("RGB")
    longest_side = max(picture.width, picture.height)
    if longest_side <= 1024:
        return raw_img

    # Scale both sides by the same factor so the longest one becomes 1024.
    factor = 1024 / longest_side
    target_size = (int(picture.width * factor), int(picture.height * factor))
    print(f'resize img {picture.size} to {target_size}')
    picture = picture.resize(target_size, resample=Image.LANCZOS)

    resized_path = f"tmp/{uuid.uuid4().hex}.png"
    picture.save(resized_path)
    return resized_path

def _conversation_to_chat_pairs(convo):
    """Convert the stored conversation into ``(user, assistant)`` pairs for the chatbot.

    Text parts are used as-is and image parts are embedded as markdown images
    with base64 data URIs. An assistant message fills the second slot of the
    most recent user pair with its last content part.
    """
    chat_pairs = []
    for msg in convo:
        is_user = msg['role'] == 'user'
        alt_text = 'user image' if is_user else 'assistant image'
        parts = []
        for c in msg['content']:
            if c['type'] == 'text':
                parts.append(c['text'])
            elif c['type'] == 'image':
                parts.append(f"![{alt_text}]({img2b64(c['image'])})")
        if is_user:
            chat_pairs.append(("\n".join(parts), None))
        elif chat_pairs and parts:
            chat_pairs[-1] = (chat_pairs[-1][0], parts[-1])
    return chat_pairs


@spaces.GPU(duration=200)
def chat_step(image1, image2, text, height, width, steps, guidance,
              ocr_enhancer, joint_with_t5, enhance_generation, enhance_understanding,
              seed, num_imgs, history_state, progress=gr.Progress()):
    """Run one chat turn: build the prompt, run the model and generate an image.

    Args:
        image1, image2: Optional file paths of the input images.
        text: User instruction.
        height, width: Output resolution passed to the diffusion pipeline.
        steps: Number of inference steps.
        guidance: Guidance scale.
        ocr_enhancer: When set and at least one image is given, OCR results
            for the images are appended to the instruction text.
        joint_with_t5: When set, the prompt embeddings from ``encode_prompt``
            are concatenated to the model's denoise embeddings.
        enhance_generation, enhance_understanding: Manual overrides of the
            task-head decision (see the note on forced image generation).
        seed: Seed as text or number; ``-1`` (or blank) picks a random seed.
        num_imgs: Number of images generated per prompt.
        history_state: dict with ``conversation``, ``history_image_paths``,
            ``cur_ocr_i`` and ``cur_genimg_i``.
        progress: Gradio progress tracker.

    Returns:
        ``(chat_pairs, history_state, seed)``. On any exception a single
        error message is returned as the chat content and ``history_state``
        is returned unmodified.
    """
    try:
        # Work on copies so that a failure halfway through this turn leaves
        # the stored history exactly as it was.
        convo = list(history_state['conversation'])
        image_paths = list(history_state['history_image_paths'])
        cur_ocr_i = history_state['cur_ocr_i']
        cur_genimg_i = history_state['cur_genimg_i']

        # Build the user message content.
        content = []
        if text:
            ocr_text = ''
            # The condition must look at the supplied images: `content` is
            # still empty here, so testing it would disable OCR entirely.
            if ocr_enhancer and (image1 or image2):
                ocr_texts = []
                for img in (image1, image2):
                    if img:
                        ocr_texts.append(get_ocr_result(img, cur_ocr_i))
                        cur_ocr_i += 1
                ocr_text = '\n'.join(ocr_texts)
            content.append({'type':'text','text': text + ocr_text})
        for img in (image1, image2):
            if img:
                content.append({'type':'image','image':img,'min_pixels':448*448,'max_pixels':448*448})
                image_paths.append(img)

        convo.append({'role':'user','content':content})

        # Prepare inputs.
        chat_text = state['processor'].apply_chat_template(convo,
                        tokenize=False, add_generation_prompt=True)
        # Drop everything up to and including the first '<|im_end|>\n'
        # (presumably the default system prompt -- TODO confirm).
        chat_text = '<|im_end|>\n'.join(chat_text.split('<|im_end|>\n')[1:])
        image_inputs, video_inputs = process_vision_info(convo)
        inputs = state['processor'](
            text=[chat_text], images=image_inputs, videos=video_inputs,
            padding=True, return_tensors='pt'
        ).to(state['device'])

        # Model forward & task head.
        with torch.no_grad():
            outputs = state['model'](**inputs, return_dict=True, output_hidden_states=True)
            hidden = outputs.hidden_states[-1]
            # 77091 is presumably the id of the 'assistant' token -- TODO confirm.
            mask = inputs.input_ids == 77091
            vecs = hidden[mask][-1:]
            # Feed the head in its own parameter dtype; a float32 input into a
            # bfloat16 head raises a dtype-mismatch error.
            head_dtype = next(state['task_head'].parameters()).dtype
            task_res = state['task_head'](vecs.to(head_dtype))[0]
        print(task_res)

        # Branch decision.
        if enhance_generation:
            do_image = True
        elif enhance_understanding:
            do_image = False
        else:
            do_image = (task_res[0] < task_res[1])

        # A blank seed is treated like -1 (random) instead of failing in int().
        if seed is None or (isinstance(seed, str) and not seed.strip()):
            seed = -1
        seed = int(seed)
        if seed == -1:
            seed = torch.Generator(device="cpu").seed()
        torch.manual_seed(seed)

        # NOTE: this demo always takes the image-generation path, regardless
        # of `do_image`. Set the flag to False to let `do_image` decide.
        force_image_generation = True
        if force_image_generation or do_image:
            # Image generation pipeline.
            siglip_hs = None
            with torch.no_grad():
                if state['siglip_processor'] and image_paths:
                    vals = [state['siglip_processor'].preprocess(
                                images=Image.open(p).convert('RGB'), do_resize=True,
                                return_tensors='pt', do_convert_rgb=True
                            ).pixel_values.to(state['device'])
                            for p in image_paths]
                    siglip_hs = state['siglip_model'](torch.concat(vals)).last_hidden_state

                lvlm = state['model'](
                    inputs.input_ids, pixel_values=getattr(inputs,'pixel_values',None),
                    attention_mask=inputs.attention_mask,
                    image_grid_thw=getattr(inputs,'image_grid_thw',None),
                    siglip_hidden_states=siglip_hs,
                    output_type='denoise_embeds'
                )
                prm_embeds, pooled = encode_prompt(
                    state['text_encoders'], state['tokenizers'],
                    text if joint_with_t5 else '', 256, state['device'], 1
                )
            emb = torch.concat([lvlm, prm_embeds], dim=1) if joint_with_t5 else lvlm

            def diffusion_to_gradio_callback(_pipeline, step_idx: int, timestep: int, tensor_dict: Dict):
                # Update the Gradio progress bar after every denoising step.
                progress((step_idx + 1) / float(steps))
                return tensor_dict

            with torch.no_grad():
                img = state['pipe'](
                    prompt_embeds=emb, pooled_prompt_embeds=pooled,
                    # Slider values may arrive as floats; the pipeline needs ints.
                    height=int(height), width=int(width),
                    num_inference_steps=int(steps),
                    guidance_scale=guidance,
                    # Use the device chosen at start-up rather than a
                    # hard-coded 'cuda', so the CPU fallback also works.
                    generator=torch.Generator(device=state['device']).manual_seed(seed),
                    num_images_per_prompt=int(num_imgs),
                    callback_on_step_end=diffusion_to_gradio_callback,
                ).images
            img = concat_images_adaptive(img)
            save_path = f"tmp/{uuid.uuid4().hex}.png"
            img.save(save_path)
            convo.append({'role':'assistant','content':[{'type':'image','image':save_path}]})
            cur_genimg_i += 1
            progress(1.0)
        else:
            # Text generation.
            gen_ids = state['model'].generate(**inputs, max_new_tokens=128)
            out = state['processor'].batch_decode(
                [g[len(inputs.input_ids[0]):] for g in gen_ids], skip_special_tokens=True
            )[0]
            convo.append({'role':'assistant','content':[{'type':'text','text':out}]})

        chat_pairs = _conversation_to_chat_pairs(convo)

        # Commit the turn only now that everything succeeded.
        history_state.update({
            'conversation': convo,
            'history_image_paths': image_paths,
            'cur_ocr_i': cur_ocr_i,
            'cur_genimg_i': cur_genimg_i
        })
        return chat_pairs, history_state, seed
    except Exception as e:
        # Top-level UI boundary: log the traceback for the operator and show
        # the user an error message suggesting to clear the history and retry.
        import traceback
        traceback.print_exc()
        error_msg = f"发生错误：{e}. 请点击 \"Clear History\" 清理对话历史后再试一次。"
        chat_pairs = [(None, error_msg)]
        # history_state was never touched on this path.
        return chat_pairs, history_state, seed

def copy_seed_for_user(real_seed):
    """Return ``real_seed`` unchanged.

    Used as a pass-through handler that copies the value of the hidden
    ``seed_holder`` box into the visible seed textbox.
    """
    return real_seed

def clear_inputs():
    """Return blank values for two image inputs, the instruction box and the seed box.

    Image inputs are cleared with ``None``; the two text inputs with ``""``.
    """
    blank_image = None
    blank_text = ""
    return blank_image, blank_image, blank_text, blank_text
@spaces.GPU
def clear_history():
    """Free cached memory and return reset values for the whole UI.

    Returns:
        A 6-tuple, in this order: an update emptying the chatbot, a fresh
        history-state dict, updates clearing image 1 and image 2, an update
        restoring the default prompt, and an update restoring the seed "-1".
    """
    # Release Python garbage first, then cached CUDA memory if a GPU is present.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

    # Brand-new conversation state (a plain dict, not a gr.update).
    fresh_state = {
        'conversation': [],
        'history_image_paths': [],
        'cur_ocr_i': 0,
        'cur_genimg_i': 0,
    }

    # Defaults restored into the prompt and seed textboxes.
    default_prompt = "Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement."
    default_seed = "-1"

    empty_chat = gr.update(value=[])
    no_image_1 = gr.update(value=None)
    no_image_2 = gr.update(value=None)
    prompt_reset = gr.update(value=default_prompt)
    seed_reset = gr.update(value=default_seed)
    return (empty_chat, fresh_state, no_image_1, no_image_2, prompt_reset, seed_reset)


if __name__ == '__main__':
    # Build the Gradio UI. `demo` is created here and launched by the second
    # `__main__` guard further down in this file.
    with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:

        gr.Markdown(
            """
            <div style="text-align:center;">
            
            # 🎉 UniWorld-V1 Chat Interface 🎉

            ### Unlock Cutting‑Edge Visual Perception, Feature Extraction, Editing, Synthesis, and Understanding

            **Usage Guide:**
            - It is recommended to perform inference on four images concurrently to offer varied selections.
            - Uploaded images are automatically resized; manually specifying resolutions that differ substantially from the original is not advised.
            </div>
            """,
            elem_classes="header-text",
        )

        # Left column: chat window and instruction; right column: inputs and options.
        with gr.Row():
            with gr.Column():
                chatbot = gr.Chatbot(
                    max_height=100000, min_height=700,
                    height=None,
                    resizable=True,
                    show_copy_button=True,
                )
                text_in = gr.Textbox(label="Instruction", value="Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement.")
            with gr.Column():
                with gr.Row():
                    img1 = gr.Image(type='filepath', label="Image 1", height=256, width=256)
                    img2 = gr.Image(type='filepath', label="Image 2 (Optional reference)", height=256, width=256, visible=True)
                seed = gr.Textbox(label="Seed (-1 for random)", value="-1")
                # Hidden box that receives the seed actually used by a run.
                seed_holder = gr.Textbox(visible=False)
                with gr.Row():
                    num_imgs = gr.Slider(1, 4, 4, step=1, label="Num Images")
                with gr.Row():
                    height = gr.Slider(256, 2048, 1024, step=64, label="Height")
                    width = gr.Slider(256, 2048, 1024, step=64, label="Width")
                with gr.Row():
                    steps = gr.Slider(8, 50, 30, step=1, label="Inference steps")
                    guidance = gr.Slider(1.0, 10.0, 4.0, step=0.1, label="Guidance scale")
                with gr.Accordion("Advanced Options", open=True, visible=True):
                    with gr.Row():
                        enhance_gen_box = gr.Checkbox(value=False, label="Enhance Generation")
                        enhance_und_box = gr.Checkbox(value=False, label="Enhance Understanding")
                    with gr.Row():
                        ocr_box = gr.Checkbox(value=False, label="Enhance Text Rendering")
                        t5_box = gr.Checkbox(value=True, label="Enhance Current Turn")
        with gr.Row():
            submit = gr.Button("Send", variant="primary")
            clear = gr.Button("Clear History", variant="primary")

        # Capability overview, one markdown column per category.
        with gr.Row():
            with gr.Column(1, min_width=0):
                gr.Markdown(
                    """
                    **🖼️ Visual Perception & Feature Extraction**  
                    - Canny Edge Detection  
                    - Mini-Line Segment Detection  
                    - Normal Map Generation
                    - Sketch Generation  
                    - Holistically-Nested Edge Detection  
                    - Depth Estimation  
                    - Human Pose Estimation
                    - Object Detection (Boxes)  
                    - Semantic Segmentation (Masks)
                    """
                )
            with gr.Column(1, min_width=0):
                gr.Markdown(
                    """
                    **✂️ Image Editing & Manipulation**  
                    - Add Elements  
                    - Adjust Attributes  
                    - Change Background  
                    - Remove Objects  
                    - Replace Regions  
                    - Perform Actions  
                    - Restyle  
                    - Compose Scenes
                    """
                )
            with gr.Column(1, min_width=0):
                gr.Markdown(
                    """
                    **🔄 Cross-Modal Synthesis & Transformation**  
                    - Text→Image Synthesis  
                    - Image‑to‑Image Translation  
                    - Multi‑Image Combination 
                    - Extract IP Features  
                    - IP Feature Composition
                    """
                )
            with gr.Column(1, min_width=0):
                gr.Markdown(
                    """
                    **🤖 Visual & Textual QA**  
                    - Image‑Text QA  
                    - Text‑Text QA
                    """
                )

        anchor_pixels = 1024*1024

        # Dynamic resize callback: suggest an output size from the uploaded image(s).
        def update_size(i1, i2):
            shapes = []
            for p in (i1, i2):
                if p:
                    # Only the size is needed; the context manager closes the
                    # file handle right away instead of leaking it.
                    with Image.open(p) as im:
                        shapes.append(im.size)
            if not shapes:
                return gr.update(), gr.update()
            if len(shapes) == 1:
                w, h = shapes[0]
            else:
                # Two images: use their average width and height.
                w = sum(s[0] for s in shapes) / len(shapes)
                h = sum(s[1] for s in shapes) / len(shapes)
            new_h, new_w = dynamic_resize(int(h), int(w), 'any_11ratio', anchor_pixels=anchor_pixels)
            return gr.update(value=new_h), gr.update(value=new_w)
        img1.change(fn=update_size, inputs=[img1, img2], outputs=[height, width])
        img2.change(fn=update_size, inputs=[img1, img2], outputs=[height, width])

        # Mutual exclusivity: ticking one "enhance" box unticks the other.
        enhance_gen_box.change(
            lambda g: gr.update(value=False) if g else gr.update(),
            inputs=[enhance_gen_box], outputs=[enhance_und_box]
        )
        enhance_und_box.change(
            lambda u: gr.update(value=False) if u else gr.update(),
            inputs=[enhance_und_box], outputs=[enhance_gen_box]
        )
        state_ = gr.State({'conversation':[], 'history_image_paths':[], 'cur_ocr_i':0, 'cur_genimg_i':0})

        progress_bar = gr.Progress()
        gr.on(
            triggers=[submit.click, text_in.submit],
            fn=chat_step,
            inputs=[img1, img2, text_in, height, width, steps, guidance,
                    ocr_box, t5_box, enhance_gen_box, enhance_und_box, seed, num_imgs, state_,
                    ],
            outputs=[chatbot, state_, seed_holder],
            scroll_to_output=True
        ).then(
            fn=copy_seed_for_user,
            inputs=[seed_holder],    # the hidden seed_holder is the input
            outputs=[seed]           # copied into the visible seed textbox
        )

        clear.click(
            fn=clear_history,
            inputs=[],
            outputs=[chatbot, state_, img1, img2, text_in, seed]
        )

        # ========== Validation examples ==========
        example_height, example_width = 1024, 1024

        def _example_row(image1, image2, prompt):
            """Return one examples row: the per-example fields plus the shared settings."""
            return [image1, image2, prompt,
                    example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4]

        gr.Examples(
            examples_per_page=100,
            examples=[
                # text-to-image
                _example_row(
                    None, None,
                    "Generate an adorable golden retriever puppy playing in a sunny park, "
                    "with fluffy fur, big round eyes, and a happy expression. "
                    "The background should have green grass, some flowers, and a blue sky with white clouds."),
                # NIKE color swap
                _example_row(
                    "assets/nike_src.jpg", None,
                    "Switch the product's color from black, black to white, white, making sure the transition is crisp and clear."),
                # style transfer (Ghibli)
                _example_row(
                    "assets/gradio/origin.png", None,
                    "Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement."),
                _example_row(
                    "assets/gradio/origin.png", None,
                    "Remove the bicycle located in the lower center region of the image."),
                # blur
                _example_row(
                    "assets/gradio/blur.jpg", None,
                    "Remove blur, make it clear."),
                _example_row(
                    "assets/gradio/00004614_tgt.jpg", None,
                    "Add the ingrid fair isle cashmere turtleneck sweater to the person."),
                _example_row(
                    "assets/gradio/00006581_tgt.jpg", None,
                    "Place the belvoir broderie anglaise linen tank on the person in a way that complements their appearance and style."),
                _example_row(
                    "assets/gradio/00008153_tgt.jpg", None,
                    "Integrate may cashmere tank on body."),
                _example_row(
                    "assets/gradio/00002315_src.jpg", None,
                    "Strip away all context and distractions, leaving the pointelle-trimmed cashmere t-shirt floating on a neutral background."),
                _example_row(
                    "assets/gradio/00002985_src.jpg", None,
                    "Generate an image containing only the henry shearling jacket, free from any other visual elements."),
                _example_row(
                    "assets/gradio/origin.png", None,
                    "Add a cat in the center of image."),
                # image+image-to-image (compose)
                _example_row(
                    "assets/00182555_target.jpg",
                    "assets/00182555_InstantStyle_ref_1.jpg",
                    "Adapt Image1's content to fit the aesthetic of Image2."),
                # replace object
                _example_row(
                    "assets/replace_src.png", None,
                    "replace motorcycle located in the lower center region of the image with a black bicycle"),
                # segmentation
                _example_row(
                    "assets/seg_src.jpg", None,
                    "Segment the giraffe from the background.\n"),
                # detection
                _example_row(
                    "assets/det_src.jpg", None,
                    "Please depict the vase accurately"),
                # image-to-canny
                _example_row(
                    "assets/canny_image.jpg", None,
                    "Generate a Canny edge map for this image."),
                # image-to-mlsd
                _example_row(
                    "assets/mlsd_image.jpg", None,
                    "Render an MLSD detection overlay for this input image."),
                # image-to-normal
                _example_row(
                    "assets/normal_image.jpg", None,
                    "Convert the input texture into a tangent-space normal map."),
                # image-to-sketch
                _example_row(
                    "assets/sketch_image.jpg", None,
                    "Transform this image into a hand-drawn charcoal sketch."),
                # image-to-hed
                _example_row(
                    "assets/hed_image.jpg", None,
                    "Produce a holistically-nested boundary probability map of this image."),
                # image-to-depth
                _example_row(
                    "assets/depth_image.jpg", None,
                    "Estimate depth with a focus on background structure."),
                # image-to-image (reconstruction)
                _example_row(
                    "assets/rec.jpg", None,
                    "Simply reconstruct the original image with no enhancements."),
            ],
            inputs=[img1, img2, text_in, height, width, steps, guidance,
                    ocr_box, t5_box, enhance_gen_box, enhance_und_box, seed, num_imgs],
        )
    # ==============================================

# English -> Chinese lookup table for the UI: component labels, button texts
# and the phrases that appear inside the markdown blocks.
# "QA" in this UI means question answering (问答), not quality assurance, and
# "line segment" is 线段.
UI_TRANSLATIONS = {
    "🎉 UniWorld-V1 Chat Interface 🎉":"🎉 UniWorld-V1 聊天界面 🎉",
    "Unlock Cutting‑Edge Visual Perception, Feature Extraction, Editing, Synthesis, and Understanding":
    '解锁尖端视觉感知，特征提取，编辑，合成和理解',
    "Usage Guide:":"使用指南：",
    "It is recommended to perform inference on four images concurrently to offer varied selections.":"建议同时进行四张图像的推理，以提供多选。",
    "Uploaded images are automatically resized; manually specifying resolutions that differ substantially from the original is not advised.":"已上传的图像将自动调整大小，但手动指定与原始图像差异太大的分辨率并不建议。",
    "🖼️ Visual Perception & Feature Extraction":"🖼️ 视觉感知与特征提取", 
    "Canny Edge Detection":"Canny边缘检测 ", 
    "Mini-Line Segment Detection":"微型线段检测",
    "Normal Map Generation":"生成法线图",
    "Sketch Generation":"手绘生成",
    "Holistically-Nested Edge Detection":"整体嵌套边缘检测",
    "Depth Estimation":"深度估计",
    "Human Pose Estimation":"人体姿势估计",
    "Object Detection (Boxes)":"对象检测（框）",
    "Semantic Segmentation (Masks)":"语义分割（蒙版）",
    "✂️ Image Editing & Manipulation":"✂️ 图像编辑与操作",
    "Add Elements":"添加元素",
    "Adjust Attributes":"调整属性",
    "Change Background":"更改背景",
    "Remove Objects":"删除对象",
    "Replace Regions":"替换区域",
    "Perform Actions":"执行操作",
    "Restyle":"重绘风格",
    "Compose Scenes":"组合场景",
    "🔄 Cross-Modal Synthesis & Transformation":"🔄 跨模态综合与转换",
    "Text→Image Synthesis":"文本→图像综合",
    "Image‑to‑Image Translation":"图像-图像转换",
    "Multi‑Image Combination":"多图像组合",
    "Extract IP Features":"提取IP特征",
    "IP Feature Composition":"IP特征组合",
    "🤖 Visual & Textual QA":"🤖 视觉与文本问答",
    "Image‑Text QA":"图像-文本问答",
    "Text‑Text QA":"文本-文本问答",
    "Image 1":"图像 1",
    "Image 2 (Optional reference)":"图像 2 (可选参考)",
    "Instruction":"指令",
    "Seed (-1 for random)":"种子 (-1为随机)",
    "Num Images":"图像数量",
    "Height":"高度",
    "Width":"宽度",
    "Inference steps":"推理步数",
    "Guidance scale":"引导缩放",
    "Advanced Options":"高级选项",
    "Enhance Generation":"增强生成",
    "Enhance Understanding":"增强理解",
    "Enhance Text Rendering":"增强文本渲染",
    "Enhance Current Turn":"增强当前轮次",
    "Send":"发送",
    "Clear History":"清除历史记录",
}


def apply_localization(block):
    """Translate the UI texts of ``block`` and all of its descendants in place.

    For every component the ``label``, ``info`` and ``placeholder``
    attributes, as well as a string ``value``, are replaced when they match a
    key of ``UI_TRANSLATIONS`` exactly. For ``gr.Markdown`` components every
    occurrence of a key inside the markdown text is replaced as well.
    Non-string attribute values are left untouched.

    Args:
        block: Root component (e.g. the ``gr.Blocks`` app).

    Returns:
        The same ``block`` object.
    """
    def process_component(component):
        if not component:
            return

        for attr in ('label', 'info', 'placeholder'):
            text = getattr(component, attr, None)
            # Only strings can be translated; the isinstance check also keeps
            # unhashable values (lists, dicts) away from the dict lookup.
            if isinstance(text, str) and text in UI_TRANSLATIONS:
                setattr(component, attr, UI_TRANSLATIONS[text])

        value = getattr(component, 'value', None)
        if isinstance(value, str):
            translated = UI_TRANSLATIONS.get(value, value)
            if isinstance(component, gr.Markdown):
                # Markdown holds free text: translate every known phrase in it.
                for en, zh in UI_TRANSLATIONS.items():
                    translated = translated.replace(en, zh)
            if translated != value:
                component.value = translated

        # Containers expose their content through `children`; tolerate a
        # missing or None attribute.
        for child in getattr(component, 'children', None) or ():
            process_component(child)

    process_component(block)
    return block


if __name__ == "__main__":
    # Optionally switch the UI texts to Chinese, then start the web server.
    if args.zh:
        demo = apply_localization(demo)
    demo.title = "UniWorld-V1"
    demo.launch(
        # SECURITY: allowed_paths=["/"] would let any visitor download every
        # file readable by this process (tokens, keys, ...), which is
        # especially dangerous together with --share. Only the directory for
        # generated images and the example assets are exposed.
        allowed_paths=[os.path.abspath("tmp"), os.path.abspath("assets")],
        server_name=args.server_name,
        server_port=args.server_port,
        share=args.share,
        inbrowser=True,
    )


'''
MODEL_PATH="/mnt/data/lb/Remake/FlowWorld/checkpoints/flux_qwen2p5vl_7b_vlm_mlp_siglip_stage2_ts_1024_bs42x8x1_fa_any_11ratio_ema999_ocr_adamw_t5_0p4_lr1e-5_mask_refstyle_extract_resume_run3/checkpoint-12000/model_ema"
FLUX_PATH="/mnt/data/checkpoints/black-forest-labs/FLUX.1-dev"
SIGLIP_PATH="/mnt/data/checkpoints/google/siglip2-so400m-patch16-512"
CUDA_VISIBLE_DEVICES=2 python app.py \
    --model_path ${MODEL_PATH} \
    --flux_path ${FLUX_PATH} \
    --siglip_path ${SIGLIP_PATH}
'''