Commit 1043d26
xinjie.wang committed
Parent(s): b0848ee

update
Files changed:
- app.py +17 -0
- common.py +4 -2
- embodied_gen/models/text_model.py +10 -0
- embodied_gen/scripts/imageto3d.py +1 -1
- embodied_gen/scripts/text2image.py +6 -0
- requirements.txt +5 -3
app.py
CHANGED
@@ -286,6 +286,22 @@ with gr.Blocks(
            est_mu_text = gr.Textbox(
                label="Friction coefficient", interactive=False
            )
+
+            prompt_examples = [
+                "satin gold tea cup with saucer",
+                "small brown leather bag",
+                "Miniature cup with floral design",
+                "带木质底座, 具有经纬线的地球仪",
+                "橙色电动手钻, 有磨损细节",
+                "手工制作的皮革笔记本",
+                "写实风格机甲3D全身模型, 主体色调为深灰色和荧光黄",
+            ]
+            examples = gr.Examples(
+                label="Gallery",
+                examples=prompt_examples,
+                inputs=[text_prompt],
+                examples_per_page=10,
+            )

    output_buf = gr.State()

@@ -353,6 +369,7 @@ with gr.Blocks(
                ip_adapt_scale,
                img_resolution,
                rmbg_tag,
+                seed,
            ],
            outputs=[
                image_sample1,
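The prompt gallery added above is plain gr.Examples wiring: clicking a gallery entry copies that string into the text prompt box. A minimal, self-contained sketch of the pattern, assuming only a Textbox named text_prompt (the rest of the app's layout is omitted):

import gradio as gr

with gr.Blocks() as demo:
    text_prompt = gr.Textbox(label="Text prompt")
    prompt_examples = [
        "satin gold tea cup with saucer",
        "small brown leather bag",
        "Miniature cup with floral design",
    ]
    # Clicking a gallery entry fills text_prompt with that string.
    gr.Examples(
        label="Gallery",
        examples=prompt_examples,
        inputs=[text_prompt],
        examples_per_page=10,
    )

if __name__ == "__main__":
    demo.launch()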
common.py
CHANGED
@@ -165,7 +165,7 @@ if os.getenv("GRADIO_APP") == "imageto3d":
    RBG14_REMOVER = BMGG14Remover()
    SAM_PREDICTOR = SAMPredictor(model_type="vit_h", device="cpu")
    PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
-        "
+        "microsoft/TRELLIS-image-large"
    )
    # PIPELINE.cuda()
    SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
@@ -179,7 +179,7 @@ elif os.getenv("GRADIO_APP") == "textto3d":
    RBG_REMOVER = RembgRemover()
    RBG14_REMOVER = BMGG14Remover()
    PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
-        "
+        "microsoft/TRELLIS-image-large"
    )
    # PIPELINE.cuda()
    text_model_dir = "weights/Kolors"
@@ -671,6 +671,7 @@ def text2image_fn(
    image_wh: int | tuple[int, int] = [1024, 1024],
    rmbg_tag: str = "rembg",
    n_sample: int = 3,
+    seed: int = None,
    req: gr.Request = None,
):
    if isinstance(image_wh, int):
@@ -692,6 +693,7 @@ def text2image_fn(
        ip_image=ip_image,
        image_wh=image_wh,
        infer_step=infer_step,
+        seed=seed,
    )

    for idx in range(len(images)):
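For orientation, the seed plumbing in common.py is a pass-through: the Gradio handler gains a seed argument and forwards it unchanged to the text-to-image call. A hypothetical, stripped-down sketch of that hand-off (text2img_gen_stub is a stand-in for the real text2img_gen; the real handler also deals with resolutions, background removal, and request state):

import gradio as gr


def text2img_gen_stub(prompt: str, n_sample: int = 3, seed: int = None) -> list[str]:
    # Stand-in for embodied_gen.models.text_model.text2img_gen.
    return [f"{prompt} (sample {i}, seed={seed})" for i in range(n_sample)]


def text2image_fn(
    prompt: str,
    n_sample: int = 3,
    seed: int = None,
    req: gr.Request = None,
):
    # The handler does not interpret the seed; it only forwards it downstream.
    return text2img_gen_stub(prompt, n_sample=n_sample, seed=seed)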
embodied_gen/models/text_model.py
CHANGED
@@ -18,6 +18,8 @@
import logging

import torch
+import numpy as np
+import random
from diffusers import (
    AutoencoderKL,
    EulerDiscreteScheduler,
@@ -138,11 +140,18 @@ def text2img_gen(
    image_wh: tuple[int, int] = [1024, 1024],
    infer_step: int = 50,
    ip_image_size: int = 512,
+    seed: int = None,
) -> list[Image.Image]:
    prompt = "Single " + prompt + ", in the center of the image"
    prompt += ", high quality, high resolution, best quality, white background, 3D style,"  # noqa
    logger.info(f"Processing prompt: {prompt}")

+    if seed is not None:
+        generator = torch.Generator(pipeline.device).manual_seed(seed)
+        torch.manual_seed(seed)
+        np.random.seed(seed)
+        random.seed(seed)
+
    kwargs = dict(
        prompt=prompt,
        height=image_wh[1],
@@ -150,6 +159,7 @@ def text2img_gen(
        num_inference_steps=infer_step,
        guidance_scale=guidance_scale,
        num_images_per_prompt=n_sample,
+        generator=generator,
    )
    if ip_image is not None:
        if isinstance(ip_image, str):
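Note that in the hunk above, generator is only assigned inside the `if seed is not None` branch but is always passed to the pipeline. A hedged sketch of the same seeding idea that also covers the unseeded case, assuming pipeline is any diffusers pipeline with a .device attribute (make_generator is an illustrative helper, not part of the repository):

import random

import numpy as np
import torch


def make_generator(pipeline, seed: int | None = None) -> torch.Generator | None:
    # No seed given: return None so the pipeline keeps its default randomness.
    if seed is None:
        return None
    # Seed the pipeline's generator plus the global torch/numpy/random states.
    generator = torch.Generator(pipeline.device).manual_seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    return generator


# Usage, assuming `pipeline` is a loaded diffusers pipeline:
#   images = pipeline(prompt=prompt, generator=make_generator(pipeline, seed)).images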
embodied_gen/scripts/imageto3d.py
CHANGED
@@ -70,7 +70,7 @@ IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
RBG_REMOVER = RembgRemover()
RBG14_REMOVER = BMGG14Remover()
SAM_PREDICTOR = SAMPredictor(model_type="vit_h", device="cpu")
-PIPELINE = TrellisImageTo3DPipeline.from_pretrained("
+PIPELINE = TrellisImageTo3DPipeline.from_pretrained("microsoft/TRELLIS-image-large")
PIPELINE.cuda()
SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
embodied_gen/scripts/text2image.py
CHANGED
@@ -82,6 +82,11 @@ def parse_args():
        type=int,
        default=50,
    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=0,
+    )
    args = parser.parse_args()

    return args
@@ -143,6 +148,7 @@ def entrypoint(
        ip_image=ip_img_path,
        image_wh=[args.resolution, args.resolution],
        infer_step=args.infer_step,
+        seed=args.seed,
    )

    save_paths = []
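On the CLI side the change is a single new flag, --seed, defaulting to 0 and forwarded as seed=args.seed. A standalone sketch of that argument handling (the --infer_step flag is inferred from the surrounding context lines; the script's other arguments are omitted):

import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--infer_step", type=int, default=50)
    parser.add_argument("--seed", type=int, default=0)
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    print(f"infer_step={args.infer_step}, seed={args.seed}")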
requirements.txt
CHANGED
@@ -5,6 +5,7 @@ torchvision==0.19.0
xformers==0.0.27.post2
pytorch-lightning==2.4.0
spconv-cu120==2.3.6
+numpy==1.26.4
triton
dataclasses_json
easydict
@@ -37,6 +38,7 @@ kolors@git+https://github.com/Kwai-Kolors/Kolors.git#egg=038818d
segment-anything@git+https://github.com/facebookresearch/segment-anything.git#egg=dca509f
https://github.com/nerfstudio-project/gsplat/releases/download/v1.5.0/gsplat-1.5.0+pt24cu121-cp310-cp310-linux_x86_64.whl
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-https://
-https://
-https://huggingface.co/
+kaolin@git+https://github.com/NVIDIAGameWorks/kaolin.git@v0.16.0
+# nvdiffrast@git+https://github.com/NVlabs/nvdiffrast.git#egg=729261d
+https://huggingface.co/xinjjj/RoboAssetGen/resolve/main/wheel_cu121/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl
+https://huggingface.co/xinjjj/RoboAssetGen/resolve/main/wheel_cu121/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl