# UniWorld-V1 / app.py
import spaces
import os
import gradio as gr
import sys
sys.path.append("..")
from transformers import AutoProcessor, SiglipImageProcessor, SiglipVisionModel, T5EncoderModel, BitsAndBytesConfig
from univa.models.qwen2p5vl.modeling_univa_qwen2p5vl import UnivaQwen2p5VLForConditionalGeneration
from univa.utils.flux_pipeline import FluxPipeline
from univa.utils.get_ocr import get_ocr_result
from univa.utils.denoiser_prompt_embedding_flux import encode_prompt
from qwen_vl_utils import process_vision_info
from univa.utils.anyres_util import dynamic_resize, concat_images_adaptive
import torch
from torch import nn
import uuid
import base64
from typing import Dict
from PIL import Image, ImageDraw, ImageFont
import argparse
import gc
def parse_args():
parser = argparse.ArgumentParser(description="Model and component paths")
parser.add_argument("--model_path", type=str, default="LanguageBind/UniWorld-V1", help="UniWorld-V1模型路径")
parser.add_argument("--flux_path", type=str, default="black-forest-labs/FLUX.1-dev", help="FLUX.1-dev模型路径")
parser.add_argument("--siglip_path", type=str, default="google/siglip2-so400m-patch16-512", help="siglip2模型路径")
parser.add_argument("--server_name", type=str, default="127.0.0.1", help="IP地址")
parser.add_argument("--server_port", type=int, default=6812, help="端口号")
parser.add_argument("--share", action="store_true", help="是否公开分享")
parser.add_argument("--nf4", action="store_true", help="是否NF4量化")
parser.add_argument("--zh", action="store_true", help="是否使用中文")
parser.add_argument("--offload", action="store_true", help="是否开启顺序卸载")
return parser.parse_args()
def add_plain_text_watermark(
img: Image.Image,
text: str,
margin: int = 50,
font_size: int = 30,
):
if img.mode != "RGB":
img = img.convert("RGB")
draw = ImageDraw.Draw(img)
font = ImageFont.truetype("DejaVuSans.ttf", font_size)
    bbox = draw.textbbox((0, 0), text, font=font)
text_width = bbox[2] - bbox[0]
text_height = bbox[3] - bbox[1]
x = img.width - text_width - int(3.3 * margin)
y = img.height - text_height - margin
draw.text((x, y), text, font=font, fill=(255, 255, 255))
return img
css = """
.table-wrap table tr td:nth-child(3) > div {
    max-height: 150px;      /* cap the cell height; adjust as needed */
    overflow-y: auto;       /* show a vertical scrollbar when content overflows */
    white-space: pre-wrap;  /* wrap text automatically */
    word-break: break-all;  /* allow breaks inside long words */
}
.table-wrap table tr td:nth-child(2) > div {
max-width: 150px;
white-space: pre-wrap;
word-break: break-all;
overflow-x: auto;
}
.table-wrap table tr th:nth-child(2) {
max-width: 150px;
white-space: normal;
word-break: keep-all;
overflow-x: auto;
}
.table-wrap table tr td:nth-last-child(-n+8) > div {
max-width: 130px;
white-space: pre-wrap;
word-break: break-all;
overflow-x: auto;
}
.table-wrap table tr th:nth-last-child(-n+8) {
max-width: 130px;
white-space: normal;
word-break: keep-all;
overflow-x: auto;
}
"""
def img2b64(image_path):
    # Inline a local image as a data URI for markdown rendering in the chatbot.
    ext = os.path.splitext(image_path)[1].lstrip(".").lower() or "png"
    mime = "jpeg" if ext in ("jpg", "jpeg") else ext
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    data_uri = f"data:image/{mime};base64,{b64}"
    return data_uri
def initialize_models(args):
os.makedirs("tmp", exist_ok=True)
# Paths
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_quant_type="nf4",
)
# Load main model and task head
model = UnivaQwen2p5VLForConditionalGeneration.from_pretrained(
args.model_path,
torch_dtype=torch.float16,
attn_implementation="sdpa",
quantization_config=quantization_config if args.nf4 else None,
)
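    # Two-way task head: from the last assistant-token hidden state, logit 0 scores text
    # answering and logit 1 scores image generation. Note that the weight loading below is
    # commented out, so the head runs with random initialization here.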
task_head = nn.Sequential(
nn.Linear(3584, 10240),
nn.SiLU(),
nn.Dropout(0.3),
nn.Linear(10240, 2)
)
# task_head.load_state_dict(torch.load(os.path.join(args.model_path, 'task_head_final.pt')))
task_head.eval()
processor = AutoProcessor.from_pretrained(
args.model_path,
min_pixels=448*448,
max_pixels=448*448,
)
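    # With --nf4, the FLUX T5 text encoder is also loaded in 4-bit. In both branches the
    # diffusion transformer comes from the UniWorld model's denoise tower, not the FLUX checkpoint.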
if args.nf4:
text_encoder_2 = T5EncoderModel.from_pretrained(
args.flux_path,
subfolder="text_encoder_2",
quantization_config=quantization_config,
torch_dtype=torch.float16,
)
pipe = FluxPipeline.from_pretrained(
args.flux_path,
transformer=model.denoise_tower.denoiser,
text_encoder_2=text_encoder_2,
torch_dtype=torch.float16,
token=os.environ["HF_TOKEN"],
)
else:
pipe = FluxPipeline.from_pretrained(
args.flux_path,
transformer=model.denoise_tower.denoiser,
torch_dtype=torch.float16,
token=os.environ["HF_TOKEN"],
)
if args.offload:
pipe.enable_model_cpu_offload()
pipe.enable_vae_slicing()
tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
text_encoders = [pipe.text_encoder, pipe.text_encoder_2]
    # SigLIP vision tower used to encode reference images for conditioning
    siglip_processor, siglip_model = None, None
siglip_processor = SiglipImageProcessor.from_pretrained(args.siglip_path)
siglip_model = SiglipVisionModel.from_pretrained(
args.siglip_path,
torch_dtype=torch.float16,
)
return {
'model': model,
'task_head': task_head,
'processor': processor,
'pipe': pipe,
'tokenizers': tokenizers,
'text_encoders': text_encoders,
'siglip_processor': siglip_processor,
'siglip_model': siglip_model,
}
@spaces.GPU(duration=600)
def to_device(state):
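    # Move every component to the GPU (when available) and cast to bfloat16;
    # on Hugging Face Spaces the @spaces.GPU decorator supplies the CUDA context.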
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state['model'] = state['model'].to(device, dtype=torch.bfloat16)
state['task_head'] = state['task_head'].to(device, dtype=torch.bfloat16)
state['pipe'] = state['pipe'].to(device, dtype=torch.bfloat16)
    state['text_encoders'] = [
        enc.to(device, dtype=torch.bfloat16) if enc is not None else None
        for enc in state['text_encoders']
    ]
state['siglip_model'] = state['siglip_model'].to(device, dtype=torch.bfloat16)
state['device'] = device
return state
args = parse_args()
state = initialize_models(args)
state = to_device(state)
@spaces.GPU
def process_large_image(raw_img):
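    # Downscale uploads whose longer side exceeds 1024 px and re-save them to tmp/.
    # (Currently unused: the calls in chat_step below are commented out.)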
if raw_img is None:
return raw_img
img = Image.open(raw_img).convert("RGB")
max_side = max(img.width, img.height)
if max_side > 1024:
scale = 1024 / max_side
new_w = int(img.width * scale)
new_h = int(img.height * scale)
print(f'resize img {img.size} to {(new_w, new_h)}')
img = img.resize((new_w, new_h), resample=Image.LANCZOS)
save_path = f"tmp/{uuid.uuid4().hex}.png"
img.save(save_path)
return save_path
else:
return raw_img
@spaces.GPU(duration=200)
def chat_step(image1, image2, text, height, width, steps, guidance,
ocr_enhancer, joint_with_t5, enhance_generation, enhance_understanding,
seed, num_imgs, history_state, progress=gr.Progress()):
try:
convo = history_state['conversation']
image_paths = history_state['history_image_paths']
cur_ocr_i = history_state['cur_ocr_i']
cur_genimg_i = history_state['cur_genimg_i']
# image1 = process_large_image(image1)
# image2 = process_large_image(image2)
# Build content
content = []
if text:
ocr_text = ''
            if ocr_enhancer and (image1 or image2):
ocr_texts = []
for img in (image1, image2):
if img:
ocr_texts.append(get_ocr_result(img, cur_ocr_i))
cur_ocr_i += 1
ocr_text = '\n'.join(ocr_texts)
content.append({'type':'text','text': text + ocr_text})
for img in (image1, image2):
if img:
content.append({'type':'image','image':img,'min_pixels':448*448,'max_pixels':448*448})
image_paths.append(img)
convo.append({'role':'user','content':content})
# Prepare inputs
chat_text = state['processor'].apply_chat_template(convo,
tokenize=False, add_generation_prompt=True)
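        # Drop the leading system turn from the templated text (everything before the first <|im_end|>).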
chat_text = '<|im_end|>\n'.join(chat_text.split('<|im_end|>\n')[1:])
image_inputs, video_inputs = process_vision_info(convo)
inputs = state['processor'](
text=[chat_text], images=image_inputs, videos=video_inputs,
padding=True, return_tensors='pt'
).to(state['device'])
# Model forward & task head
with torch.no_grad():
outputs = state['model'](**inputs, return_dict=True, output_hidden_states=True)
hidden = outputs.hidden_states[-1]
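            # 77091 is presumably the Qwen2.5-VL tokenizer id for the "assistant" role token;
            # the hidden state at the last such position feeds the task head.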
mask = inputs.input_ids == 77091
vecs = hidden[mask][-1:]
            # Cast to the task head's dtype (bfloat16 after to_device) to avoid a dtype mismatch.
            task_res = state['task_head'](vecs.to(next(state['task_head'].parameters()).dtype))[0]
print(task_res)
# Branch decision
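        # Checkbox overrides take priority; otherwise the task head's two logits decide
        # (logit 1 > logit 0 means "generate an image").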
if enhance_generation:
do_image = True
elif enhance_understanding:
do_image = False
else:
do_image = (task_res[0] < task_res[1])
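        # Resolve the seed: -1 draws a fresh random seed so it can be echoed back to the UI.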
seed = int(seed)
if seed == -1:
seed = torch.Generator(device="cpu").seed()
torch.manual_seed(seed)
# Generate
        # NOTE: image generation is currently forced; change this to `if do_image:` to honor
        # the branch decision above (the text branch below is otherwise unreachable).
        if True:
# image generation pipeline
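            # Encode all history images with SigLIP; their hidden states condition the denoiser.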
siglip_hs = None
if state['siglip_processor'] and image_paths:
vals = [state['siglip_processor'].preprocess(
images=Image.open(p).convert('RGB'), do_resize=True,
return_tensors='pt', do_convert_rgb=True
                ).pixel_values.to(state['device'], dtype=state['siglip_model'].dtype)
for p in image_paths]
siglip_hs = state['siglip_model'](torch.concat(vals)).last_hidden_state
with torch.no_grad():
lvlm = state['model'](
inputs.input_ids, pixel_values=getattr(inputs,'pixel_values',None),
attention_mask=inputs.attention_mask,
image_grid_thw=getattr(inputs,'image_grid_thw',None),
siglip_hidden_states=siglip_hs,
output_type='denoise_embeds'
)
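            # Optionally append T5/CLIP prompt embeddings ("Enhance Current Turn" checkbox)
            # alongside the LVLM denoise embeddings.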
prm_embeds, pooled = encode_prompt(
state['text_encoders'], state['tokenizers'],
text if joint_with_t5 else '', 256, state['device'], 1
)
emb = torch.concat([lvlm, prm_embeds], dim=1) if joint_with_t5 else lvlm
            def diffusion_to_gradio_callback(_pipeline, step_idx: int, timestep: int, tensor_dict: Dict):
                # Update the Gradio progress bar after each denoising step.
                frac = (step_idx + 1) / float(steps)
                progress(frac)
                return tensor_dict
with torch.no_grad():
img = state['pipe'](
prompt_embeds=emb, pooled_prompt_embeds=pooled,
height=height, width=width,
num_inference_steps=steps,
guidance_scale=guidance,
                    generator=torch.Generator(device=state['device']).manual_seed(seed),
num_images_per_prompt=num_imgs,
callback_on_step_end=diffusion_to_gradio_callback,
# callback_on_step_end_tensor_inputs=["latents", "prompt_embeds"],
).images
# img = [add_plain_text_watermark(im, 'Open-Sora Plan 2.0 Generated') for im in img]
img = concat_images_adaptive(img)
save_path = f"tmp/{uuid.uuid4().hex}.png"
img.save(save_path)
convo.append({'role':'assistant','content':[{'type':'image','image':save_path}]})
cur_genimg_i += 1
progress(1.0)
bot_msg = (None, save_path)
else:
# text generation
gen_ids = state['model'].generate(**inputs, max_new_tokens=128)
out = state['processor'].batch_decode(
[g[len(inputs.input_ids[0]):] for g in gen_ids], skip_special_tokens=True
)[0]
convo.append({'role':'assistant','content':[{'type':'text','text':out}]})
bot_msg = (None, out)
chat_pairs = []
# print(convo)
# print()
# print()
for msg in convo:
# print(msg)
if msg['role']=='user':
parts = []
for c in msg['content']:
if c['type']=='text': parts.append(c['text'])
if c['type']=='image': parts.append(f"![user image]({img2b64(c['image'])})")
chat_pairs.append(("\n".join(parts), None))
else:
parts = []
for c in msg['content']:
if c['type']=='text': parts.append(c['text'])
if c['type']=='image': parts.append(f"![assistant image]({img2b64(c['image'])})")
                chat_pairs[-1] = (chat_pairs[-1][0], parts[-1])
# print()
# print(chat_pairs)
# Update state
history_state.update({
'conversation': convo,
'history_image_paths': image_paths,
'cur_ocr_i': cur_ocr_i,
'cur_genimg_i': cur_genimg_i
})
return chat_pairs, history_state, seed
    except Exception as e:
        # Catch all exceptions and surface an error message suggesting the user clear the history and retry.
        error_msg = f"An error occurred: {e}. Please click \"Clear History\" and try again."
        chat_pairs = [(None, error_msg)]
        # Leave history_state untouched so the user can clear it themselves.
        return chat_pairs, history_state, seed
def copy_seed_for_user(real_seed):
    # Forward the hidden seed_holder value to the visible seed Textbox.
    return real_seed
def clear_inputs():
    # Clear img1/img2 with None; clear the text and seed boxes with empty strings.
    return None, None, "", ""
@spaces.GPU
def clear_history():
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
    # Default prompt and seed
    default_prompt = "Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement."
    default_seed = "-1"
    # 1. Clear the chatbot with gr.update(value=[])
    # 2. Hand back the initial state dict
    # 3. Reset the prompt and seed textboxes with gr.update() as well
    return (
        gr.update(value=[]),               # clear the chatbot
        {'conversation':[],                # reset the state
         'history_image_paths':[],
         'cur_ocr_i':0,
         'cur_genimg_i':0},
        gr.update(value=None),             # reset image1
        gr.update(value=None),             # reset image2
        gr.update(value=default_prompt),   # reset the prompt textbox
        gr.update(value=default_seed),     # reset the seed textbox
)
if __name__ == '__main__':
# Gradio UI
with gr.Blocks(
theme=gr.themes.Soft(),
css=css
) as demo:
gr.Markdown(
"""
<div style="text-align:center;">
# 🎉 UniWorld-V1 Chat Interface 🎉
### Unlock Cutting‑Edge Visual Perception, Feature Extraction, Editing, Synthesis, and Understanding
**Usage Guide:**
- It is recommended to perform inference on four images concurrently to offer varied selections.
- Uploaded images are automatically resized; manually specifying resolutions that differ substantially from the original is not advised.
</div>
""",
elem_classes="header-text",
)
with gr.Row():
with gr.Column():
chatbot = gr.Chatbot(
max_height=100000, min_height=700,
height=None,
resizable=True,
show_copy_button=True
)
text_in = gr.Textbox(label="Instruction", value="Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement.")
with gr.Column():
with gr.Row():
img1 = gr.Image(type='filepath', label="Image 1", height=256, width=256)
img2 = gr.Image(type='filepath', label="Image 2 (Optional reference)", height=256, width=256, visible=True)
seed = gr.Textbox(label="Seed (-1 for random)", value="-1")
seed_holder = gr.Textbox(visible=False)
with gr.Row():
num_imgs = gr.Slider(1, 4, 4, step=1, label="Num Images")
with gr.Row():
height = gr.Slider(256, 2048, 1024, step=64, label="Height")
width = gr.Slider(256, 2048, 1024, step=64, label="Width")
with gr.Row():
steps = gr.Slider(8, 50, 30, step=1, label="Inference steps")
guidance = gr.Slider(1.0, 10.0, 4.0, step=0.1, label="Guidance scale")
with gr.Accordion("Advanced Options", open=True, visible=True):
with gr.Row():
enhance_gen_box = gr.Checkbox(value=False, label="Enhance Generation")
enhance_und_box = gr.Checkbox(value=False, label="Enhance Understanding")
with gr.Row():
ocr_box = gr.Checkbox(value=False, label="Enhance Text Rendering")
t5_box = gr.Checkbox(value=True, label="Enhance Current Turn")
with gr.Row():
submit = gr.Button("Send", variant="primary")
clear = gr.Button("Clear History", variant="primary")
with gr.Row():
with gr.Column(1, min_width=0):
gr.Markdown(
"""
**🖼️ Visual Perception & Feature Extraction**
- Canny Edge Detection
- Mini-Line Segment Detection
- Normal Map Generation
- Sketch Generation
- Holistically-Nested Edge Detection
- Depth Estimation
- Human Pose Estimation
- Object Detection (Boxes)
- Semantic Segmentation (Masks)
"""
)
with gr.Column(1, min_width=0):
gr.Markdown(
"""
**✂️ Image Editing & Manipulation**
- Add Elements
- Adjust Attributes
- Change Background
- Remove Objects
- Replace Regions
- Perform Actions
- Restyle
- Compose Scenes
"""
)
with gr.Column(1, min_width=0):
gr.Markdown(
"""
**🔄 Cross-Modal Synthesis & Transformation**
- Text→Image Synthesis
- Image‑to‑Image Translation
- Multi‑Image Combination
- Extract IP Features
- IP Feature Composition
"""
)
with gr.Column(1, min_width=0):
gr.Markdown(
"""
**🤖 Visual & Textual QA**
- Image‑Text QA
- Text‑Text QA
"""
)
anchor_pixels = 1024*1024
# Dynamic resize callback
def update_size(i1, i2):
shapes = []
for p in (i1, i2):
if p:
im = Image.open(p)
w, h = im.size
shapes.append((w, h))
if not shapes:
return gr.update(), gr.update()
if len(shapes) == 1:
w, h = shapes[0]
else:
w = sum(s[0] for s in shapes) / len(shapes)
h = sum(s[1] for s in shapes) / len(shapes)
new_h, new_w = dynamic_resize(int(h), int(w), 'any_11ratio', anchor_pixels=anchor_pixels)
return gr.update(value=new_h), gr.update(value=new_w)
img1.change(fn=update_size, inputs=[img1, img2], outputs=[height, width])
img2.change(fn=update_size, inputs=[img1, img2], outputs=[height, width])
# Mutual exclusivity
enhance_gen_box.change(
lambda g: gr.update(value=False) if g else gr.update(),
inputs=[enhance_gen_box], outputs=[enhance_und_box]
)
enhance_und_box.change(
lambda u: gr.update(value=False) if u else gr.update(),
inputs=[enhance_und_box], outputs=[enhance_gen_box]
)
state_ = gr.State({'conversation':[], 'history_image_paths':[], 'cur_ocr_i':0, 'cur_genimg_i':0})
progress_bar = gr.Progress()
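        # Note: chat_step creates its own gr.Progress via its keyword default; this instance is unused.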
gr.on(
triggers=[submit.click, text_in.submit],
fn=chat_step,
inputs=[img1, img2, text_in, height, width, steps, guidance,
ocr_box, t5_box, enhance_gen_box, enhance_und_box, seed, num_imgs, state_,
],
outputs=[chatbot, state_, seed_holder],
scroll_to_output=True
).then(
fn=copy_seed_for_user,
            inputs=[seed_holder],  # input: the hidden seed_holder
            outputs=[seed]         # output: the visible seed Textbox
)
clear.click(
fn=clear_history,
inputs=[],
outputs=[chatbot, state_, img1, img2, text_in, seed]
)
        # ========== Validation examples ==========
example_height, example_width = 1024, 1024
gr.Examples(
examples_per_page=100,
examples=[
# text-to-image
[None, None,
"Generate an adorable golden retriever puppy playing in a sunny park, "
"with fluffy fur, big round eyes, and a happy expression. "
"The background should have green grass, some flowers, and a blue sky with white clouds.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# NIKE color swap
["assets/nike_src.jpg", None,
"Switch the product's color from black, black to white, white, making sure the transition is crisp and clear.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# style transfer (Ghibli)
["assets/gradio/origin.png", None,
"Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
["assets/gradio/origin.png", None,
"Remove the bicycle located in the lower center region of the image.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# blur
["assets/gradio/blur.jpg", None,
"Remove blur, make it clear.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
#
["assets/gradio/00004614_tgt.jpg", None,
"Add the ingrid fair isle cashmere turtleneck sweater to the person.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
#
["assets/gradio/00006581_tgt.jpg", None,
"Place the belvoir broderie anglaise linen tank on the person in a way that complements their appearance and style.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
#
["assets/gradio/00008153_tgt.jpg", None,
"Integrate may cashmere tank on body.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
#
["assets/gradio/00002315_src.jpg", None,
"Strip away all context and distractions, leaving the pointelle-trimmed cashmere t-shirt floating on a neutral background.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
#
["assets/gradio/00002985_src.jpg", None,
"Generate an image containing only the henry shearling jacket, free from any other visual elements.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
["assets/gradio/origin.png", None,
"Add a cat in the center of image.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# image+image-to-image (compose)
["assets/00182555_target.jpg",
"assets/00182555_InstantStyle_ref_1.jpg",
"Adapt Image1's content to fit the aesthetic of Image2.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# replace object
["assets/replace_src.png", None,
"replace motorcycle located in the lower center region of the image with a black bicycle",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# segmentation
["assets/seg_src.jpg", None,
"Segment the giraffe from the background.\n",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# detection
["assets/det_src.jpg", None,
"Please depict the vase accurately",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# image-to-canny
["assets/canny_image.jpg", None,
"Generate a Canny edge map for this image.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# image-to-mlsd
["assets/mlsd_image.jpg", None,
"Render an MLSD detection overlay for this input image.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# image-to-normal
["assets/normal_image.jpg", None,
"Convert the input texture into a tangent-space normal map.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# image-to-sketch
["assets/sketch_image.jpg", None,
"Transform this image into a hand-drawn charcoal sketch.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# image-to-hed
["assets/hed_image.jpg", None,
"Produce a holistically-nested boundary probability map of this image.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# image-to-depth
["assets/depth_image.jpg", None,
"Estimate depth with a focus on background structure.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
# image-to-image (reconstruction)
["assets/rec.jpg", None,
"Simply reconstruct the original image with no enhancements.",
example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
],
inputs=[img1, img2, text_in, height, width, steps, guidance,
ocr_box, t5_box, enhance_gen_box, enhance_und_box, seed, num_imgs],
)
# ==============================================
UI_TRANSLATIONS = {
"🎉 UniWorld-V1 Chat Interface 🎉":"🎉 UniWorld-V1 聊天界面 🎉",
"Unlock Cutting‑Edge Visual Perception, Feature Extraction, Editing, Synthesis, and Understanding":
'解锁尖端视觉感知,特征提取,编辑,合成和理解',
"Usage Guide:":"使用指南:",
"It is recommended to perform inference on four images concurrently to offer varied selections.":"建议同时进行四张图像的推理,以提供多选。",
"Uploaded images are automatically resized; manually specifying resolutions that differ substantially from the original is not advised.":"已上传的图像将自动调整大小,但手动指定与原始图像差异太大的分辨率并不建议。",
"🖼️ Visual Perception & Feature Extraction":"🖼️ 视觉感知与特征提取",
"Canny Edge Detection":"Canny边缘检测 ",
"Mini-Line Segment Detection":"微型行段检测",
"Normal Map Generation":"生成法线图",
"Sketch Generation":"手绘生成",
"Holistically-Nested Edge Detection":"整体嵌套边缘检测",
"Depth Estimation":"深度估计",
"Human Pose Estimation":"人体姿势估计",
"Object Detection (Boxes)":"对象检测(框)",
"Semantic Segmentation (Masks)":"语义分割(蒙版)",
"✂️ Image Editing & Manipulation":"✂️ 图像编辑与操作",
"Add Elements":"添加元素",
"Adjust Attributes":"调整属性",
"Change Background":"更改背景",
"Remove Objects":"删除对象",
"Replace Regions":"替换区域",
"Perform Actions":"执行操作",
"Restyle":"重绘风格",
"Compose Scenes":"组合场景",
"🔄 Cross-Modal Synthesis & Transformation":"🔄 跨模态综合与转换",
"Text→Image Synthesis":"文本→图像综合",
"Image‑to‑Image Translation":"图像-图像转换",
"Multi‑Image Combination":"多图像组合",
"Extract IP Features":"提取IP特征",
"IP Feature Composition":"IP特征组合",
"🤖 Visual & Textual QA":"🤖 视觉和文字质量检查",
"Image‑Text QA":"图像-文本质量检查",
"Text‑Text QA":"文本-文本质量检查",
"Image 1":"图像 1",
"Image 2 (Optional reference)":"图像 2 (可选参考)",
"Instruction":"指令",
"Seed (-1 for random)":"种子 (-1为随机)",
"Num Images":"图像数量",
"Height":"高度",
"Width":"宽度",
"Inference steps":"推理步数",
"Guidance scale":"引导缩放",
"Advanced Options":"高级选项",
"Enhance Generation":"增强生成",
"Enhance Understanding":"增强理解",
"Enhance Text Rendering":"增强文本渲染",
"Enhance Current Turn":"增强当前轮次",
"Send":"发送",
"Clear History":"清除历史记录",
}
def apply_localization(block):
def process_component(component):
if not component:
return
for attr in ['label', 'info', 'placeholder']:
if hasattr(component, attr):
text = getattr(component, attr)
if text in UI_TRANSLATIONS:
setattr(component, attr, UI_TRANSLATIONS[text])
if hasattr(component, 'value'):
value = component.value
if isinstance(value, str) and value in UI_TRANSLATIONS:
component.value = UI_TRANSLATIONS[value]
if isinstance(component, gr.Markdown):
for en, zh in UI_TRANSLATIONS.items():
component.value = component.value.replace(en, zh)
if hasattr(component, 'children'):
for child in component.children:
process_component(child)
process_component(block)
return block
if __name__ == "__main__":
if args.zh:
demo = apply_localization(demo)
demo.title = "UniWorld-V1"
demo.launch(
allowed_paths=["/"],
server_name=args.server_name,
server_port=args.server_port,
share=args.share,
inbrowser=True,
)
'''
MODEL_PATH="/mnt/data/lb/Remake/FlowWorld/checkpoints/flux_qwen2p5vl_7b_vlm_mlp_siglip_stage2_ts_1024_bs42x8x1_fa_any_11ratio_ema999_ocr_adamw_t5_0p4_lr1e-5_mask_refstyle_extract_resume_run3/checkpoint-12000/model_ema"
FLUX_PATH="/mnt/data/checkpoints/black-forest-labs/FLUX.1-dev"
SIGLIP_PATH="/mnt/data/checkpoints/google/siglip2-so400m-patch16-512"
CUDA_VISIBLE_DEVICES=2 python app.py \
--model_path ${MODEL_PATH} \
--flux_path ${FLUX_PATH} \
--siglip_path ${SIGLIP_PATH}
'''