xinjie.wang committed
Commit f219113 · 1 Parent(s): dd1f1fd
Files changed (2)
  1. app.py +170 -488
  2. embodied_gen/utils/gpt_clients.py +1 -0
app.py CHANGED
@@ -1,501 +1,183 @@
1
- # Project EmbodiedGen
2
- #
3
- # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
- # implied. See the License for the specific language governing
15
- # permissions and limitations under the License.
16
-
17
-
18
- import os
19
-
20
- os.environ["GRADIO_APP"] = "imageto3d"
21
- from glob import glob
22
-
23
  import gradio as gr
24
- from common import (
25
- MAX_SEED,
26
- VERSION,
27
- active_btn_by_content,
28
- custom_theme,
29
- end_session,
30
- extract_3d_representations_v2,
31
- extract_urdf,
32
- get_seed,
33
- image_css,
34
- image_to_3d,
35
- lighting_css,
36
- preprocess_image_fn,
37
- preprocess_sam_image_fn,
38
- select_point,
39
- start_session,
40
  )
41
 
42
- with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
43
- gr.Markdown(
44
- """
45
- ## ***EmbodiedGen***: Image-to-3D Asset
46
- **🔖 Version**: {VERSION}
47
- <p style="display: flex; gap: 10px; flex-wrap: nowrap;">
48
- <a href="https://horizonrobotics.github.io/robot_lab/embodied_gen/index.html">
49
- <img alt="🌐 Project Page" src="https://img.shields.io/badge/🌐-Project_Page-blue">
50
- </a>
51
- <a href="https://arxiv.org/abs/xxxx.xxxxx">
52
- <img alt="📄 arXiv" src="https://img.shields.io/badge/📄-arXiv-b31b1b">
53
- </a>
54
- <a href="https://github.com/HorizonRobotics/EmbodiedGen">
55
- <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
56
- </a>
57
- <a href="https://www.youtube.com/watch?v=SnHhzHeb_aI">
58
- <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
59
- </a>
60
- </p>
61
-
62
- 🖼️ Generate physically plausible 3D asset from single input image.
63
-
64
- """.format(
65
- VERSION=VERSION
66
- ),
67
- elem_classes=["header"],
68
- )
69
-
70
- gr.HTML(image_css)
71
- # gr.HTML(lighting_css)
72
- with gr.Row():
73
- with gr.Column(scale=2):
74
- with gr.Tabs() as input_tabs:
75
- with gr.Tab(
76
- label="Image(auto seg)", id=0
77
- ) as single_image_input_tab:
78
- raw_image_cache = gr.Image(
79
- format="png",
80
- image_mode="RGB",
81
- type="pil",
82
- visible=False,
83
- )
84
- image_prompt = gr.Image(
85
- label="Input Image",
86
- format="png",
87
- image_mode="RGBA",
88
- type="pil",
89
- height=400,
90
- elem_classes=["image_fit"],
91
- )
92
- gr.Markdown(
93
- """
94
- If you are not satisfied with the auto segmentation
95
- result, please switch to the `Image(SAM seg)` tab."""
96
- )
97
- with gr.Tab(
98
- label="Image(SAM seg)", id=1
99
- ) as samimage_input_tab:
100
- with gr.Row():
101
- with gr.Column(scale=1):
102
- image_prompt_sam = gr.Image(
103
- label="Input Image",
104
- type="numpy",
105
- height=400,
106
- elem_classes=["image_fit"],
107
- )
108
- image_seg_sam = gr.Image(
109
- label="SAM Seg Image",
110
- image_mode="RGBA",
111
- type="pil",
112
- height=400,
113
- visible=False,
114
- )
115
- with gr.Column(scale=1):
116
- image_mask_sam = gr.AnnotatedImage(
117
- elem_classes=["image_fit"]
118
- )
119
-
120
- fg_bg_radio = gr.Radio(
121
- ["foreground_point", "background_point"],
122
- label="Select foreground(green) or background(red) points, by default foreground", # noqa
123
- value="foreground_point",
124
- )
125
- gr.Markdown(
126
- """ Click the `Input Image` to select SAM points,
127
- after get the satisified segmentation, click `Generate`
128
- button to generate the 3D asset. \n
129
- Note: If the segmented foreground is too small relative
130
- to the entire image area, the generation will fail.
131
- """
132
- )
133
-
134
- with gr.Accordion(label="Generation Settings", open=False):
135
- with gr.Row():
136
- seed = gr.Slider(
137
- 0, MAX_SEED, label="Seed", value=0, step=1
138
- )
139
- texture_size = gr.Slider(
140
- 1024,
141
- 4096,
142
- label="UV texture size",
143
- value=2048,
144
- step=256,
145
- )
146
- rmbg_tag = gr.Radio(
147
- choices=["rembg", "rmbg14"],
148
- value="rembg",
149
- label="Background Removal Model",
150
- )
151
- with gr.Row():
152
- randomize_seed = gr.Checkbox(
153
- label="Randomize Seed", value=False
154
- )
155
- project_delight = gr.Checkbox(
156
- label="Backproject delighting",
157
- value=False,
158
- )
159
- gr.Markdown("Geo Structure Generation")
160
- with gr.Row():
161
- ss_guidance_strength = gr.Slider(
162
- 0.0,
163
- 10.0,
164
- label="Guidance Strength",
165
- value=7.5,
166
- step=0.1,
167
- )
168
- ss_sampling_steps = gr.Slider(
169
- 1, 50, label="Sampling Steps", value=12, step=1
170
- )
171
- gr.Markdown("Visual Appearance Generation")
172
- with gr.Row():
173
- slat_guidance_strength = gr.Slider(
174
- 0.0,
175
- 10.0,
176
- label="Guidance Strength",
177
- value=3.0,
178
- step=0.1,
179
- )
180
- slat_sampling_steps = gr.Slider(
181
- 1, 50, label="Sampling Steps", value=12, step=1
182
- )
183
-
184
- generate_btn = gr.Button(
185
- "🚀 1. Generate(~0.5 mins)",
186
- variant="primary",
187
- interactive=False,
188
  )
189
- model_output_obj = gr.Textbox(label="raw mesh .obj", visible=False)
190
- with gr.Row():
191
- extract_rep3d_btn = gr.Button(
192
- "🔍 2. Extract 3D Representation(~2 mins)",
193
- variant="primary",
194
- interactive=False,
195
- )
196
- with gr.Accordion(
197
- label="Enter Asset Attributes(optional)", open=False
198
- ):
199
- asset_cat_text = gr.Textbox(
200
- label="Enter Asset Category (e.g., chair)"
201
- )
202
- height_range_text = gr.Textbox(
203
- label="Enter **Height Range** in meter (e.g., 0.5-0.6)"
204
- )
205
- mass_range_text = gr.Textbox(
206
- label="Enter **Mass Range** in kg (e.g., 1.1-1.2)"
207
- )
208
- asset_version_text = gr.Textbox(
209
- label=f"Enter version (e.g., {VERSION})"
210
- )
211
- with gr.Row():
212
- extract_urdf_btn = gr.Button(
213
- "🧩 3. Extract URDF with physics(~1 mins)",
214
- variant="primary",
215
- interactive=False,
216
- )
217
- with gr.Row():
218
- gr.Markdown(
219
- "#### Estimated Asset 3D Attributes(No input required)"
220
- )
221
- with gr.Row():
222
- est_type_text = gr.Textbox(
223
- label="Asset category", interactive=False
224
- )
225
- est_height_text = gr.Textbox(
226
- label="Real height(.m)", interactive=False
227
- )
228
- est_mass_text = gr.Textbox(
229
- label="Mass(.kg)", interactive=False
230
- )
231
- est_mu_text = gr.Textbox(
232
- label="Friction coefficient", interactive=False
233
- )
234
- with gr.Row():
235
- download_urdf = gr.DownloadButton(
236
- label="⬇️ 4. Download URDF",
237
- variant="primary",
238
- interactive=False,
239
- )
240
-
241
- gr.Markdown(
242
- """ NOTE: If `Asset Attributes` are provided, the provided
243
- properties will be used; otherwise, the GPT-preset properties
244
- will be applied. \n
245
- The `Download URDF` file is restored to the real scale and
246
- has quality inspection, open with an editor to view details.
247
- """
248
  )
249
 
250
- with gr.Row() as single_image_example:
251
- examples = gr.Examples(
252
- label="Image Gallery",
253
- examples=[
254
- [image_path]
255
- for image_path in sorted(
256
- glob("assets/example_image/*")
257
- )
258
- ],
259
- inputs=[image_prompt, rmbg_tag],
260
- fn=preprocess_image_fn,
261
- outputs=[image_prompt, raw_image_cache],
262
- run_on_click=True,
263
- examples_per_page=10,
264
- )
265
 
266
- with gr.Row(visible=False) as single_sam_image_example:
267
- examples = gr.Examples(
268
- label="Image Gallery",
269
- examples=[
270
- [image_path]
271
- for image_path in sorted(
272
- glob("assets/example_image/*")
273
- )
274
- ],
275
- inputs=[image_prompt_sam],
276
- fn=preprocess_sam_image_fn,
277
- outputs=[image_prompt_sam, raw_image_cache],
278
- run_on_click=True,
279
- examples_per_page=10,
280
- )
281
- with gr.Column(scale=1):
282
- video_output = gr.Video(
283
- label="Generated 3D Asset",
284
- autoplay=True,
285
- loop=True,
286
- height=300,
287
- )
288
- model_output_gs = gr.Model3D(
289
- label="Gaussian Representation", height=300, interactive=False
290
- )
291
- aligned_gs = gr.Textbox(visible=False)
292
- gr.Markdown(
293
- """ The rendering of `Gaussian Representation` takes additional 10s. """ # noqa
294
  )
295
- with gr.Row():
296
- model_output_mesh = gr.Model3D(
297
- label="Mesh Representation",
298
- height=300,
299
- interactive=False,
300
- clear_color=[0.8, 0.8, 0.8, 1],
301
- elem_id="lighter_mesh",
302
  )
303
 
304
- is_samimage = gr.State(False)
305
- output_buf = gr.State()
306
- selected_points = gr.State(value=[])
307
-
308
- demo.load(start_session)
309
- demo.unload(end_session)
310
-
311
- single_image_input_tab.select(
312
- lambda: tuple(
313
- [False, gr.Row.update(visible=True), gr.Row.update(visible=False)]
314
- ),
315
- outputs=[is_samimage, single_image_example, single_sam_image_example],
316
- )
317
- samimage_input_tab.select(
318
- lambda: tuple(
319
- [True, gr.Row.update(visible=True), gr.Row.update(visible=False)]
320
- ),
321
- outputs=[is_samimage, single_sam_image_example, single_image_example],
322
- )
323
-
324
- image_prompt.upload(
325
- preprocess_image_fn,
326
- inputs=[image_prompt, rmbg_tag],
327
- outputs=[image_prompt, raw_image_cache],
328
- )
329
- image_prompt.change(
330
- lambda: tuple(
331
- [
332
- gr.Button(interactive=False),
333
- gr.Button(interactive=False),
334
- gr.Button(interactive=False),
335
- None,
336
- "",
337
- None,
338
- None,
339
- "",
340
- "",
341
- "",
342
- "",
343
- "",
344
- "",
345
- "",
346
- "",
347
- ]
348
- ),
349
- outputs=[
350
- extract_rep3d_btn,
351
- extract_urdf_btn,
352
- download_urdf,
353
- model_output_gs,
354
- aligned_gs,
355
- model_output_mesh,
356
- video_output,
357
- asset_cat_text,
358
- height_range_text,
359
- mass_range_text,
360
- asset_version_text,
361
- est_type_text,
362
- est_height_text,
363
- est_mass_text,
364
- est_mu_text,
365
- ],
366
- )
367
- image_prompt.change(
368
- active_btn_by_content,
369
- inputs=image_prompt,
370
- outputs=generate_btn,
371
- )
372
-
373
- image_prompt_sam.upload(
374
- preprocess_sam_image_fn,
375
- inputs=[image_prompt_sam],
376
- outputs=[image_prompt_sam, raw_image_cache],
377
- )
378
- image_prompt_sam.change(
379
- lambda: tuple(
380
- [
381
- gr.Button(interactive=False),
382
- gr.Button(interactive=False),
383
- gr.Button(interactive=False),
384
- None,
385
- None,
386
- None,
387
- "",
388
- "",
389
- "",
390
- "",
391
- "",
392
- "",
393
- "",
394
- "",
395
- None,
396
- [],
397
- ]
398
- ),
399
- outputs=[
400
- extract_rep3d_btn,
401
- extract_urdf_btn,
402
- download_urdf,
403
- model_output_gs,
404
- model_output_mesh,
405
- video_output,
406
- asset_cat_text,
407
- height_range_text,
408
- mass_range_text,
409
- asset_version_text,
410
- est_type_text,
411
- est_height_text,
412
- est_mass_text,
413
- est_mu_text,
414
- image_mask_sam,
415
- selected_points,
416
- ],
417
- )
418
-
419
- image_prompt_sam.select(
420
- select_point,
421
- [
422
- image_prompt_sam,
423
- selected_points,
424
- fg_bg_radio,
425
- ],
426
- [image_mask_sam, image_seg_sam],
427
- )
428
- image_seg_sam.change(
429
- active_btn_by_content,
430
- inputs=image_seg_sam,
431
- outputs=generate_btn,
432
- )
433
-
434
- generate_btn.click(
435
- get_seed,
436
- inputs=[randomize_seed, seed],
437
- outputs=[seed],
438
- ).success(
439
- image_to_3d,
440
- inputs=[
441
- image_prompt,
442
- seed,
443
- ss_guidance_strength,
444
- ss_sampling_steps,
445
- slat_guidance_strength,
446
- slat_sampling_steps,
447
- raw_image_cache,
448
- image_seg_sam,
449
- is_samimage,
450
- ],
451
- outputs=[output_buf, video_output],
452
- ).success(
453
- lambda: gr.Button(interactive=True),
454
- outputs=[extract_rep3d_btn],
455
- )
456
-
457
- extract_rep3d_btn.click(
458
- extract_3d_representations_v2,
459
- inputs=[
460
- output_buf,
461
- project_delight,
462
- texture_size,
463
- ],
464
- outputs=[
465
- model_output_mesh,
466
- model_output_gs,
467
- model_output_obj,
468
- aligned_gs,
469
- ],
470
- ).success(
471
- lambda: gr.Button(interactive=True),
472
- outputs=[extract_urdf_btn],
473
- )
474
-
475
- extract_urdf_btn.click(
476
- extract_urdf,
477
- inputs=[
478
- aligned_gs,
479
- model_output_obj,
480
- asset_cat_text,
481
- height_range_text,
482
- mass_range_text,
483
- asset_version_text,
484
- ],
485
- outputs=[
486
- download_urdf,
487
- est_type_text,
488
- est_height_text,
489
- est_mass_text,
490
- est_mu_text,
491
- ],
492
- queue=True,
493
- show_progress="full",
494
- ).success(
495
- lambda: gr.Button(interactive=True),
496
- outputs=[download_urdf],
497
- )
498
 
499
 
500
  if __name__ == "__main__":
501
- demo.launch()
1
  import gradio as gr
2
+ import os
3
+ import yaml
4
+ import base64
5
+ import logging
6
+ import os
7
+ from io import BytesIO
8
+ from typing import Optional
9
+
10
+ import yaml
11
+ from openai import AzureOpenAI, OpenAI # pip install openai
12
+ from PIL import Image
13
+ from tenacity import (
14
+ retry,
15
+ stop_after_attempt,
16
+ stop_after_delay,
17
+ wait_random_exponential,
18
  )
19
 
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ class GPTclient:
25
+ """A client to interact with the GPT model via OpenAI or Azure API."""
26
+
27
+ def __init__(
28
+ self,
29
+ endpoint: str,
30
+ api_key: str,
31
+ model_name: str = "yfb-gpt-4o",
32
+ api_version: str = None,
33
+ verbose: bool = False,
34
+ ):
35
+ if api_version is not None:
36
+ self.client = AzureOpenAI(
37
+ azure_endpoint=endpoint,
38
+ api_key=api_key,
39
+ api_version=api_version,
40
  )
41
+ else:
42
+ self.client = OpenAI(
43
+ base_url=endpoint,
44
+ api_key=api_key,
45
  )
46
 
47
+ self.endpoint = endpoint
48
+ self.model_name = model_name
49
+ self.image_formats = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif"}
50
+ self.verbose = verbose
51
+ logger.info(f"Using GPT model: {self.model_name}.")
52
 
53
+ @retry(
54
+ wait=wait_random_exponential(min=1, max=20),
55
+ stop=(stop_after_attempt(10) | stop_after_delay(30)),
56
+ )
57
+ def completion_with_backoff(self, **kwargs):
58
+ return self.client.chat.completions.create(**kwargs)
59
+
60
+ def query(
61
+ self,
62
+ text_prompt: str,
63
+ image_base64: Optional[list[str | Image.Image]] = None,
64
+ system_role: Optional[str] = None,
65
+ ) -> Optional[str]:
66
+ """Queries the GPT model with a text and optional image prompts.
67
+
68
+ Args:
69
+ text_prompt (str): The main text input that the model responds to.
70
+ image_base64 (Optional[List[str]]): A list of image base64 strings
71
+ or local image paths or PIL.Image to accompany the text prompt.
72
+ system_role (Optional[str]): Optional system-level instructions
73
+ that specify the behavior of the assistant.
74
+
75
+ Returns:
76
+ Optional[str]: The response content generated by the model based on
77
+ the prompt. Returns `None` if an error occurs.
78
+ """
79
+ if system_role is None:
80
+ system_role = "You are a highly knowledgeable assistant specializing in physics, engineering, and object properties." # noqa
81
+
82
+ content_user = [
83
+ {
84
+ "type": "text",
85
+ "text": text_prompt,
86
+ },
87
+ ]
88
+
89
+ # Process images if provided
90
+ if image_base64 is not None:
91
+ image_base64 = (
92
+ image_base64
93
+ if isinstance(image_base64, list)
94
+ else [image_base64]
95
  )
96
+ for img in image_base64:
97
+ if isinstance(img, Image.Image):
98
+ buffer = BytesIO()
99
+ img.save(buffer, format=img.format or "PNG")
100
+ buffer.seek(0)
101
+ image_binary = buffer.read()
102
+ img = base64.b64encode(image_binary).decode("utf-8")
103
+ elif (
104
+ len(os.path.splitext(img)) > 1
105
+ and os.path.splitext(img)[-1].lower() in self.image_formats
106
+ ):
107
+ if not os.path.exists(img):
108
+ raise FileNotFoundError(f"Image file not found: {img}")
109
+ with open(img, "rb") as f:
110
+ img = base64.b64encode(f.read()).decode("utf-8")
111
+
112
+ content_user.append(
113
+ {
114
+ "type": "image_url",
115
+ "image_url": {"url": f"data:image/png;base64,{img}"},
116
+ }
117
  )
118
 
119
+ payload = {
120
+ "messages": [
121
+ {"role": "system", "content": system_role},
122
+ {"role": "user", "content": content_user},
123
+ ],
124
+ "temperature": 0.1,
125
+ "max_tokens": 500,
126
+ "top_p": 0.1,
127
+ "frequency_penalty": 0,
128
+ "presence_penalty": 0,
129
+ "stop": None,
130
+ }
131
+ payload.update({"model": self.model_name})
132
+
133
+ response = None
134
+ try:
135
+ response = self.completion_with_backoff(**payload)
136
+ response = response.choices[0].message.content
137
+ except Exception as e:
138
+ logger.error(f"Error GPTclint {self.endpoint} API call: {e}")
139
+ response = None
140
+
141
+ if self.verbose:
142
+ logger.info(f"Prompt: {text_prompt}")
143
+ logger.info(f"Response: {response}")
144
+
145
+ return response
146
+
147
+ from embodied_gen.utils.gpt_clients import GPT_CLIENT
148
+
149
+ print(GPT_CLIENT.api_version, GPT_CLIENT.model_name, GPT_CLIENT.endpoint)
150
+
151
+ def debug_gptclient(text_prompt, images, system_role):
152
+ try:
153
+ # Handle image input (Gradio passes images as PIL.Image or file paths)
154
+ image_base64 = images if images else None
155
+ response = GPT_CLIENT.query(
156
+ text_prompt=text_prompt,
157
+ image_base64=image_base64,
158
+ system_role=system_role
159
+ )
160
+ return response if response else "No response received or an error occurred."
161
+ except Exception as e:
162
+ return f"Error: {str(e)}"
163
+
164
+ # Create Gradio interface
165
+ iface = gr.Interface(
166
+ fn=debug_gptclient,
167
+ inputs=[
168
+ gr.Textbox(label="Text Prompt", placeholder="Enter your text prompt here"),
169
+ gr.File(label="Images (Optional)", type="filepath", file_count="multiple"),
170
+ gr.Textbox(
171
+ label="System Role (Optional)",
172
+ placeholder="Enter system role or leave empty for default",
173
+ value="You are a highly knowledgeable assistant specializing in physics, engineering, and object properties."
174
+ )
175
+ ],
176
+ outputs=gr.Textbox(label="Response"),
177
+ title="GPTclient Debug Interface",
178
+ description="A simple interface to debug GPTclient inputs and outputs."
179
+ )
180
 
181
 
182
  if __name__ == "__main__":
183
+ iface.launch()
embodied_gen/utils/gpt_clients.py CHANGED
@@ -61,6 +61,7 @@ class GPTclient:
61
 
62
  self.endpoint = endpoint
63
  self.model_name = model_name
64
+ self.api_version = api_version
65
  self.image_formats = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif"}
66
  self.verbose = verbose
67
  logger.info(f"Using GPT model: {self.model_name}.")