import spaces import os import gradio as gr import sys sys.path.append("..") from transformers import AutoProcessor, SiglipImageProcessor, SiglipVisionModel, T5EncoderModel, BitsAndBytesConfig from univa.models.qwen2p5vl.modeling_univa_qwen2p5vl import UnivaQwen2p5VLForConditionalGeneration from univa.utils.flux_pipeline import FluxPipeline from univa.utils.get_ocr import get_ocr_result from univa.utils.denoiser_prompt_embedding_flux import encode_prompt from qwen_vl_utils import process_vision_info from univa.utils.anyres_util import dynamic_resize, concat_images_adaptive import torch from torch import nn import uuid import base64 from typing import Dict from PIL import Image, ImageDraw, ImageFont import argparse import gc def parse_args(): parser = argparse.ArgumentParser(description="Model and component paths") parser.add_argument("--model_path", type=str, default="LanguageBind/UniWorld-V1", help="UniWorld-V1模型路径") parser.add_argument("--flux_path", type=str, default="black-forest-labs/FLUX.1-dev", help="FLUX.1-dev模型路径") parser.add_argument("--siglip_path", type=str, default="google/siglip2-so400m-patch16-512", help="siglip2模型路径") parser.add_argument("--server_name", type=str, default="127.0.0.1", help="IP地址") parser.add_argument("--server_port", type=int, default=6812, help="端口号") parser.add_argument("--share", action="store_true", help="是否公开分享") parser.add_argument("--nf4", action="store_true", help="是否NF4量化") parser.add_argument("--zh", action="store_true", help="是否使用中文") parser.add_argument("--offload", action="store_true", help="是否开启顺序卸载") return parser.parse_args() def add_plain_text_watermark( img: Image.Image, text: str, margin: int = 50, font_size: int = 30, ): if img.mode != "RGB": img = img.convert("RGB") draw = ImageDraw.Draw(img) font = ImageFont.truetype("DejaVuSans.ttf", font_size) bbox = draw.textbbox((0, 0), text) text_width = bbox[2] - bbox[0] text_height = bbox[3] - bbox[1] x = img.width - text_width - int(3.3 * margin) y = img.height - text_height - margin draw.text((x, y), text, font=font, fill=(255, 255, 255)) return img css = """ .table-wrap table tr td:nth-child(3) > div { max-height: 150px; /* 最多 100px 高度,按需修改 */ overflow-y: auto; /* 超出部分显示竖向滚动条 */ white-space: pre-wrap; /* 自动换行 */ word-break: break-all; /* 长单词内部分行 */ } .table-wrap table tr td:nth-child(2) > div { max-width: 150px; white-space: pre-wrap; word-break: break-all; overflow-x: auto; } .table-wrap table tr th:nth-child(2) { max-width: 150px; white-space: normal; word-break: keep-all; overflow-x: auto; } .table-wrap table tr td:nth-last-child(-n+8) > div { max-width: 130px; white-space: pre-wrap; word-break: break-all; overflow-x: auto; } .table-wrap table tr th:nth-last-child(-n+8) { max-width: 130px; white-space: normal; word-break: keep-all; overflow-x: auto; } """ def img2b64(image_path): with open(image_path, "rb") as f: b64 = base64.b64encode(f.read()).decode() data_uri = f"data:image/jpeg;base64,{b64}" return data_uri def initialize_models(args): os.makedirs("tmp", exist_ok=True) # Paths quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4", ) # Load main model and task head model = UnivaQwen2p5VLForConditionalGeneration.from_pretrained( args.model_path, torch_dtype=torch.float16, attn_implementation="sdpa", quantization_config=quantization_config if args.nf4 else None, ) task_head = nn.Sequential( nn.Linear(3584, 10240), nn.SiLU(), nn.Dropout(0.3), nn.Linear(10240, 2) ) # task_head.load_state_dict(torch.load(os.path.join(args.model_path, 'task_head_final.pt'))) task_head.eval() processor = AutoProcessor.from_pretrained( args.model_path, min_pixels=448*448, max_pixels=448*448, ) if args.nf4: text_encoder_2 = T5EncoderModel.from_pretrained( args.flux_path, subfolder="text_encoder_2", quantization_config=quantization_config, torch_dtype=torch.float16, ) pipe = FluxPipeline.from_pretrained( args.flux_path, transformer=model.denoise_tower.denoiser, text_encoder_2=text_encoder_2, torch_dtype=torch.float16, token=os.environ["HF_TOKEN"], ) else: pipe = FluxPipeline.from_pretrained( args.flux_path, transformer=model.denoise_tower.denoiser, torch_dtype=torch.float16, token=os.environ["HF_TOKEN"], ) if args.offload: pipe.enable_model_cpu_offload() pipe.enable_vae_slicing() tokenizers = [pipe.tokenizer, pipe.tokenizer_2] text_encoders = [pipe.text_encoder, pipe.text_encoder_2] # Optional SigLIP siglip_processor, siglip_model = None, None siglip_processor = SiglipImageProcessor.from_pretrained(args.siglip_path) siglip_model = SiglipVisionModel.from_pretrained( args.siglip_path, torch_dtype=torch.float16, ) return { 'model': model, 'task_head': task_head, 'processor': processor, 'pipe': pipe, 'tokenizers': tokenizers, 'text_encoders': text_encoders, 'siglip_processor': siglip_processor, 'siglip_model': siglip_model, } @spaces.GPU(duration=600) def to_device(state): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") state['model'] = state['model'].to(device, dtype=torch.bfloat16) state['task_head'] = state['task_head'].to(device, dtype=torch.bfloat16) state['pipe'] = state['pipe'].to(device, dtype=torch.bfloat16) state['text_encoders'] = state['text_encoders'].to(device, dtype=torch.bfloat16) state['siglip_model'] = state['siglip_model'].to(device, dtype=torch.bfloat16) state['device'] = device return state args = parse_args() state = initialize_models(args) state = to_device(state) @spaces.GPU def process_large_image(raw_img): if raw_img is None: return raw_img img = Image.open(raw_img).convert("RGB") max_side = max(img.width, img.height) if max_side > 1024: scale = 1024 / max_side new_w = int(img.width * scale) new_h = int(img.height * scale) print(f'resize img {img.size} to {(new_w, new_h)}') img = img.resize((new_w, new_h), resample=Image.LANCZOS) save_path = f"tmp/{uuid.uuid4().hex}.png" img.save(save_path) return save_path else: return raw_img @spaces.GPU(duration=200) def chat_step(image1, image2, text, height, width, steps, guidance, ocr_enhancer, joint_with_t5, enhance_generation, enhance_understanding, seed, num_imgs, history_state, progress=gr.Progress()): try: convo = history_state['conversation'] image_paths = history_state['history_image_paths'] cur_ocr_i = history_state['cur_ocr_i'] cur_genimg_i = history_state['cur_genimg_i'] # image1 = process_large_image(image1) # image2 = process_large_image(image2) # Build content content = [] if text: ocr_text = '' if ocr_enhancer and content: ocr_texts = [] for img in (image1, image2): if img: ocr_texts.append(get_ocr_result(img, cur_ocr_i)) cur_ocr_i += 1 ocr_text = '\n'.join(ocr_texts) content.append({'type':'text','text': text + ocr_text}) for img in (image1, image2): if img: content.append({'type':'image','image':img,'min_pixels':448*448,'max_pixels':448*448}) image_paths.append(img) convo.append({'role':'user','content':content}) # Prepare inputs chat_text = state['processor'].apply_chat_template(convo, tokenize=False, add_generation_prompt=True) chat_text = '<|im_end|>\n'.join(chat_text.split('<|im_end|>\n')[1:]) image_inputs, video_inputs = process_vision_info(convo) inputs = state['processor']( text=[chat_text], images=image_inputs, videos=video_inputs, padding=True, return_tensors='pt' ).to(state['device']) # Model forward & task head with torch.no_grad(): outputs = state['model'](**inputs, return_dict=True, output_hidden_states=True) hidden = outputs.hidden_states[-1] mask = inputs.input_ids == 77091 vecs = hidden[mask][-1:] task_res = state['task_head'](vecs.float())[0] print(task_res) # Branch decision if enhance_generation: do_image = True elif enhance_understanding: do_image = False else: do_image = (task_res[0] < task_res[1]) seed = int(seed) if seed == -1: seed = torch.Generator(device="cpu").seed() torch.manual_seed(seed) # Generate if True: # image generation pipeline siglip_hs = None if state['siglip_processor'] and image_paths: vals = [state['siglip_processor'].preprocess( images=Image.open(p).convert('RGB'), do_resize=True, return_tensors='pt', do_convert_rgb=True ).pixel_values.to(state['device']) for p in image_paths] siglip_hs = state['siglip_model'](torch.concat(vals)).last_hidden_state with torch.no_grad(): lvlm = state['model']( inputs.input_ids, pixel_values=getattr(inputs,'pixel_values',None), attention_mask=inputs.attention_mask, image_grid_thw=getattr(inputs,'image_grid_thw',None), siglip_hidden_states=siglip_hs, output_type='denoise_embeds' ) prm_embeds, pooled = encode_prompt( state['text_encoders'], state['tokenizers'], text if joint_with_t5 else '', 256, state['device'], 1 ) emb = torch.concat([lvlm, prm_embeds], dim=1) if joint_with_t5 else lvlm def diffusion_to_gradio_callback(_pipeline, step_idx: int, timestep: int, tensor_dict: Dict): # 1)更新 Gradio 进度条 frac = (step_idx + 1) / float(steps) progress(frac) return tensor_dict with torch.no_grad(): img = state['pipe']( prompt_embeds=emb, pooled_prompt_embeds=pooled, height=height, width=width, num_inference_steps=steps, guidance_scale=guidance, generator=torch.Generator(device='cuda').manual_seed(seed), num_images_per_prompt=num_imgs, callback_on_step_end=diffusion_to_gradio_callback, # callback_on_step_end_tensor_inputs=["latents", "prompt_embeds"], ).images # img = [add_plain_text_watermark(im, 'Open-Sora Plan 2.0 Generated') for im in img] img = concat_images_adaptive(img) save_path = f"tmp/{uuid.uuid4().hex}.png" img.save(save_path) convo.append({'role':'assistant','content':[{'type':'image','image':save_path}]}) cur_genimg_i += 1 progress(1.0) bot_msg = (None, save_path) else: # text generation gen_ids = state['model'].generate(**inputs, max_new_tokens=128) out = state['processor'].batch_decode( [g[len(inputs.input_ids[0]):] for g in gen_ids], skip_special_tokens=True )[0] convo.append({'role':'assistant','content':[{'type':'text','text':out}]}) bot_msg = (None, out) chat_pairs = [] # print(convo) # print() # print() for msg in convo: # print(msg) if msg['role']=='user': parts = [] for c in msg['content']: if c['type']=='text': parts.append(c['text']) if c['type']=='image': parts.append(f"![user image]({img2b64(c['image'])})") chat_pairs.append(("\n".join(parts), None)) else: parts = [] for c in msg['content']: if c['type']=='text': parts.append(c['text']) if c['type']=='image': parts.append(f"![assistant image]({img2b64(c['image'])})") if msg['content'][-1]['type']=='text': chat_pairs[-1] = (chat_pairs[-1][0], parts[-1]) else: chat_pairs[-1] = (chat_pairs[-1][0], parts[-1]) # print() # print(chat_pairs) # Update state history_state.update({ 'conversation': convo, 'history_image_paths': image_paths, 'cur_ocr_i': cur_ocr_i, 'cur_genimg_i': cur_genimg_i }) return chat_pairs, history_state, seed except Exception as e: # 捕捉所有异常,返回错误提示,建议用户清理历史后重试 error_msg = f"发生错误:{e}. 请点击 \"Clear History\" 清理对话历史后再试一次。" chat_pairs = [(None, error_msg)] # 不修改 history_state,让用户自行清理 return chat_pairs, history_state, seed def copy_seed_for_user(real_seed): # 这个函数会把隐藏的 seed_holder 值,传给真正要显示的 seed Textbox return real_seed def clear_inputs(): # img1 和 img2 用 None 来清空;text_in 用空字符串清空;seed 同理清空 return None, None, "", "" @spaces.GPU def clear_history(): gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.ipc_collect() # 默认 prompt 和 seed default_prompt = "Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement." default_seed = "-1" # 1. chatbot 要用 gr.update(value=[]) 清空 # 2. state 直接给回初始 dict # 3. prompt 和 seed 同样用 gr.update() return ( gr.update(value=[]), # 清空聊天框 {'conversation':[], # 重置 state 'history_image_paths':[], 'cur_ocr_i':0, 'cur_genimg_i':0}, gr.update(value=None), # 重置 image1 gr.update(value=None), # 重置 image2 gr.update(value=default_prompt), # 重置 prompt 文本框 gr.update(value=default_seed), # 重置 seed 文本框 ) if __name__ == '__main__': # Gradio UI with gr.Blocks( theme=gr.themes.Soft(), css=css ) as demo: gr.Markdown( """
# 🎉 UniWorld-V1 Chat Interface 🎉 ### Unlock Cutting‑Edge Visual Perception, Feature Extraction, Editing, Synthesis, and Understanding **Usage Guide:** - It is recommended to perform inference on four images concurrently to offer varied selections. - Uploaded images are automatically resized; manually specifying resolutions that differ substantially from the original is not advised.
""", elem_classes="header-text", ) with gr.Row(): with gr.Column(): chatbot = gr.Chatbot( max_height=100000, min_height=700, height=None, resizable=True, show_copy_button=True ) text_in = gr.Textbox(label="Instruction", value="Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement.") with gr.Column(): with gr.Row(): img1 = gr.Image(type='filepath', label="Image 1", height=256, width=256) img2 = gr.Image(type='filepath', label="Image 2 (Optional reference)", height=256, width=256, visible=True) seed = gr.Textbox(label="Seed (-1 for random)", value="-1") seed_holder = gr.Textbox(visible=False) with gr.Row(): num_imgs = gr.Slider(1, 4, 4, step=1, label="Num Images") with gr.Row(): height = gr.Slider(256, 2048, 1024, step=64, label="Height") width = gr.Slider(256, 2048, 1024, step=64, label="Width") with gr.Row(): steps = gr.Slider(8, 50, 30, step=1, label="Inference steps") guidance = gr.Slider(1.0, 10.0, 4.0, step=0.1, label="Guidance scale") with gr.Accordion("Advanced Options", open=True, visible=True): with gr.Row(): enhance_gen_box = gr.Checkbox(value=False, label="Enhance Generation") enhance_und_box = gr.Checkbox(value=False, label="Enhance Understanding") with gr.Row(): ocr_box = gr.Checkbox(value=False, label="Enhance Text Rendering") t5_box = gr.Checkbox(value=True, label="Enhance Current Turn") with gr.Row(): submit = gr.Button("Send", variant="primary") clear = gr.Button("Clear History", variant="primary") with gr.Row(): with gr.Column(1, min_width=0): gr.Markdown( """ **🖼️ Visual Perception & Feature Extraction** - Canny Edge Detection - Mini-Line Segment Detection - Normal Map Generation - Sketch Generation - Holistically-Nested Edge Detection - Depth Estimation - Human Pose Estimation - Object Detection (Boxes) - Semantic Segmentation (Masks) """ ) with gr.Column(1, min_width=0): gr.Markdown( """ **✂️ Image Editing & Manipulation** - Add Elements - Adjust Attributes - Change Background - Remove Objects - Replace Regions - Perform Actions - Restyle - Compose Scenes """ ) with gr.Column(1, min_width=0): gr.Markdown( """ **🔄 Cross-Modal Synthesis & Transformation** - Text→Image Synthesis - Image‑to‑Image Translation - Multi‑Image Combination - Extract IP Features - IP Feature Composition """ ) with gr.Column(1, min_width=0): gr.Markdown( """ **🤖 Visual & Textual QA** - Image‑Text QA - Text‑Text QA """ ) anchor_pixels = 1024*1024 # Dynamic resize callback def update_size(i1, i2): shapes = [] for p in (i1, i2): if p: im = Image.open(p) w, h = im.size shapes.append((w, h)) if not shapes: return gr.update(), gr.update() if len(shapes) == 1: w, h = shapes[0] else: w = sum(s[0] for s in shapes) / len(shapes) h = sum(s[1] for s in shapes) / len(shapes) new_h, new_w = dynamic_resize(int(h), int(w), 'any_11ratio', anchor_pixels=anchor_pixels) return gr.update(value=new_h), gr.update(value=new_w) img1.change(fn=update_size, inputs=[img1, img2], outputs=[height, width]) img2.change(fn=update_size, inputs=[img1, img2], outputs=[height, width]) # Mutual exclusivity enhance_gen_box.change( lambda g: gr.update(value=False) if g else gr.update(), inputs=[enhance_gen_box], outputs=[enhance_und_box] ) enhance_und_box.change( lambda u: gr.update(value=False) if u else gr.update(), inputs=[enhance_und_box], outputs=[enhance_gen_box] ) state_ = gr.State({'conversation':[], 'history_image_paths':[], 'cur_ocr_i':0, 'cur_genimg_i':0}) progress_bar = gr.Progress() gr.on( triggers=[submit.click, text_in.submit], fn=chat_step, inputs=[img1, img2, text_in, height, width, steps, guidance, ocr_box, t5_box, enhance_gen_box, enhance_und_box, seed, num_imgs, state_, ], outputs=[chatbot, state_, seed_holder], scroll_to_output=True ).then( fn=copy_seed_for_user, inputs=[seed_holder], # 输入是隐藏的 seed_holder outputs=[seed] # 输出到真正要显示的 seed Textbox ) clear.click( fn=clear_history, inputs=[], outputs=[chatbot, state_, img1, img2, text_in, seed] ) # ========== 添加 Validation Examples ========== example_height, example_width = 1024, 1024 gr.Examples( examples_per_page=100, examples=[ # text-to-image [None, None, "Generate an adorable golden retriever puppy playing in a sunny park, " "with fluffy fur, big round eyes, and a happy expression. " "The background should have green grass, some flowers, and a blue sky with white clouds.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # NIKE color swap ["assets/nike_src.jpg", None, "Switch the product's color from black, black to white, white, making sure the transition is crisp and clear.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # style transfer (Ghibli) ["assets/gradio/origin.png", None, "Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], ["assets/gradio/origin.png", None, "Remove the bicycle located in the lower center region of the image.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # blur ["assets/gradio/blur.jpg", None, "Remove blur, make it clear.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # ["assets/gradio/00004614_tgt.jpg", None, "Add the ingrid fair isle cashmere turtleneck sweater to the person.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # ["assets/gradio/00006581_tgt.jpg", None, "Place the belvoir broderie anglaise linen tank on the person in a way that complements their appearance and style.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # ["assets/gradio/00008153_tgt.jpg", None, "Integrate may cashmere tank on body.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # ["assets/gradio/00002315_src.jpg", None, "Strip away all context and distractions, leaving the pointelle-trimmed cashmere t-shirt floating on a neutral background.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # ["assets/gradio/00002985_src.jpg", None, "Generate an image containing only the henry shearling jacket, free from any other visual elements.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], ["assets/gradio/origin.png", None, "Add a cat in the center of image.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # image+image-to-image (compose) ["assets/00182555_target.jpg", "assets/00182555_InstantStyle_ref_1.jpg", "Adapt Image1's content to fit the aesthetic of Image2.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # replace object ["assets/replace_src.png", None, "replace motorcycle located in the lower center region of the image with a black bicycle", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # segmentation ["assets/seg_src.jpg", None, "Segment the giraffe from the background.\n", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # detection ["assets/det_src.jpg", None, "Please depict the vase accurately", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # image-to-canny ["assets/canny_image.jpg", None, "Generate a Canny edge map for this image.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # image-to-mlsd ["assets/mlsd_image.jpg", None, "Render an MLSD detection overlay for this input image.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # image-to-normal ["assets/normal_image.jpg", None, "Convert the input texture into a tangent-space normal map.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # image-to-sketch ["assets/sketch_image.jpg", None, "Transform this image into a hand-drawn charcoal sketch.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # image-to-hed ["assets/hed_image.jpg", None, "Produce a holistically-nested boundary probability map of this image.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # image-to-depth ["assets/depth_image.jpg", None, "Estimate depth with a focus on background structure.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], # image-to-image (reconstruction) ["assets/rec.jpg", None, "Simply reconstruct the original image with no enhancements.", example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4], ], inputs=[img1, img2, text_in, height, width, steps, guidance, ocr_box, t5_box, enhance_gen_box, enhance_und_box, seed, num_imgs], ) # ============================================== UI_TRANSLATIONS = { "🎉 UniWorld-V1 Chat Interface 🎉":"🎉 UniWorld-V1 聊天界面 🎉", "Unlock Cutting‑Edge Visual Perception, Feature Extraction, Editing, Synthesis, and Understanding": '解锁尖端视觉感知,特征提取,编辑,合成和理解', "Usage Guide:":"使用指南:", "It is recommended to perform inference on four images concurrently to offer varied selections.":"建议同时进行四张图像的推理,以提供多选。", "Uploaded images are automatically resized; manually specifying resolutions that differ substantially from the original is not advised.":"已上传的图像将自动调整大小,但手动指定与原始图像差异太大的分辨率并不建议。", "🖼️ Visual Perception & Feature Extraction":"🖼️ 视觉感知与特征提取", "Canny Edge Detection":"Canny边缘检测 ", "Mini-Line Segment Detection":"微型行段检测", "Normal Map Generation":"生成法线图", "Sketch Generation":"手绘生成", "Holistically-Nested Edge Detection":"整体嵌套边缘检测", "Depth Estimation":"深度估计", "Human Pose Estimation":"人体姿势估计", "Object Detection (Boxes)":"对象检测(框)", "Semantic Segmentation (Masks)":"语义分割(蒙版)", "✂️ Image Editing & Manipulation":"✂️ 图像编辑与操作", "Add Elements":"添加元素", "Adjust Attributes":"调整属性", "Change Background":"更改背景", "Remove Objects":"删除对象", "Replace Regions":"替换区域", "Perform Actions":"执行操作", "Restyle":"重绘风格", "Compose Scenes":"组合场景", "🔄 Cross-Modal Synthesis & Transformation":"🔄 跨模态综合与转换", "Text→Image Synthesis":"文本→图像综合", "Image‑to‑Image Translation":"图像-图像转换", "Multi‑Image Combination":"多图像组合", "Extract IP Features":"提取IP特征", "IP Feature Composition":"IP特征组合", "🤖 Visual & Textual QA":"🤖 视觉和文字质量检查", "Image‑Text QA":"图像-文本质量检查", "Text‑Text QA":"文本-文本质量检查", "Image 1":"图像 1", "Image 2 (Optional reference)":"图像 2 (可选参考)", "Instruction":"指令", "Seed (-1 for random)":"种子 (-1为随机)", "Num Images":"图像数量", "Height":"高度", "Width":"宽度", "Inference steps":"推理步数", "Guidance scale":"引导缩放", "Advanced Options":"高级选项", "Enhance Generation":"增强生成", "Enhance Understanding":"增强理解", "Enhance Text Rendering":"增强文本渲染", "Enhance Current Turn":"增强当前轮次", "Send":"发送", "Clear History":"清除历史记录", } def apply_localization(block): def process_component(component): if not component: return for attr in ['label', 'info', 'placeholder']: if hasattr(component, attr): text = getattr(component, attr) if text in UI_TRANSLATIONS: setattr(component, attr, UI_TRANSLATIONS[text]) if hasattr(component, 'value'): value = component.value if isinstance(value, str) and value in UI_TRANSLATIONS: component.value = UI_TRANSLATIONS[value] if isinstance(component, gr.Markdown): for en, zh in UI_TRANSLATIONS.items(): component.value = component.value.replace(en, zh) if hasattr(component, 'children'): for child in component.children: process_component(child) process_component(block) return block if __name__ == "__main__": if args.zh: demo = apply_localization(demo) demo.title = "UniWorld-V1" demo.launch( allowed_paths=["/"], server_name=args.server_name, server_port=args.server_port, share=args.share, inbrowser=True, ) ''' MODEL_PATH="/mnt/data/lb/Remake/FlowWorld/checkpoints/flux_qwen2p5vl_7b_vlm_mlp_siglip_stage2_ts_1024_bs42x8x1_fa_any_11ratio_ema999_ocr_adamw_t5_0p4_lr1e-5_mask_refstyle_extract_resume_run3/checkpoint-12000/model_ema" FLUX_PATH="/mnt/data/checkpoints/black-forest-labs/FLUX.1-dev" SIGLIP_PATH="/mnt/data/checkpoints/google/siglip2-so400m-patch16-512" CUDA_VISIBLE_DEVICES=2 python app.py \ --model_path ${MODEL_PATH} \ --flux_path ${FLUX_PATH} \ --siglip_path ${SIGLIP_PATH} '''