Spaces:
Running
on
Zero
Running
on
Zero
# Project EmbodiedGen | |
# | |
# Copyright (c) 2025 Horizon Robotics. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |
# implied. See the License for the specific language governing | |
# permissions and limitations under the License. | |
import argparse | |
import json | |
import logging | |
import os | |
import re | |
import json_repair | |
from embodied_gen.utils.enum import ( | |
LayoutInfo, | |
RobotItemEnum, | |
Scene3DItemEnum, | |
SpatialRelationEnum, | |
) | |
from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient | |
from embodied_gen.utils.process_media import SceneTreeVisualizer | |
# Root logging is configured at import time so INFO records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-scoped logger; its name follows the import path of this module.
logger = logging.getLogger(__name__)

# Names exported by a star-import of this module.
__all__ = [
    "LayoutDesigner",
    "LAYOUT_DISASSEMBLER",
    "LAYOUT_GRAPHER",
    "LAYOUT_DESCRIBER",
]
# Upper bound on distractor objects; interpolated into the "number limit"
# constraint of the prompt below.
DISTRACTOR_NUM = 2 # Maximum number of distractor objects allowed | |
# System prompt for the "disassemble" stage (used by LAYOUT_DISASSEMBLER):
# asks the model to turn a free-text task description into a JSON object with
# task, robot, background, context, manipulated and distractor fields.
# This is an f-string: Scene3DItemEnum / RobotItemEnum members and
# DISTRACTOR_NUM are substituted once, when the module is imported, and the
# doubled braces ({{ }}) produce the literal braces of the JSON examples.
# NOTE(review): every line of the literal ends with " | |", which looks like
# an extraction artifact that would be sent to the model verbatim - confirm
# against the upstream file before relying on this text.
# NOTE(review): some examples list three distractors although the stated limit
# is DISTRACTOR_NUM, one example keeps "plastic" despite the transparent-object
# rule, and the last two examples hard-code the key names instead of
# interpolating the enum members - confirm whether this is intended.
LAYOUT_DISASSEMBLE_PROMPT = f""" | |
You are an intelligent 3D scene planner. Given a natural language | |
description of a robotic task, output a structured description of | |
an interactive 3D scene. | |
The output must include the following fields: | |
- task: A high-level task type (e.g., "single-arm pick", | |
"dual-arm grasping", "pick and place", "object sorting"). | |
- {Scene3DItemEnum.ROBOT}: The name or type of robot involved. If not mentioned, | |
use {RobotItemEnum.FRANKA} as default. | |
- {Scene3DItemEnum.BACKGROUND}: The room or indoor environment where the task happens | |
(e.g., Kitchen, Bedroom, Living Room, Workshop, Office). | |
- {Scene3DItemEnum.CONTEXT}: A indoor object involved in the manipulation | |
(e.g., Table, Shelf, Desk, Bed, Cabinet). | |
- {Scene3DItemEnum.MANIPULATED_OBJS}: The main object(s) that the robot directly interacts with. | |
- {Scene3DItemEnum.DISTRACTOR_OBJS}: Other objects that naturally belong to the scene but are not part of the main task. | |
Constraints: | |
- The {Scene3DItemEnum.BACKGROUND} must logically match the described task. | |
- The {Scene3DItemEnum.CONTEXT} must fit within the {Scene3DItemEnum.BACKGROUND}. (e.g., a bedroom may include a table or bed, but not a workbench.) | |
- The {Scene3DItemEnum.CONTEXT} must be a concrete indoor object, such as a "table", | |
"shelf", "desk", or "bed". It must not be an abstract concept (e.g., "area", "space", "zone") | |
or structural surface (e.g., "floor", "ground"). If the input describes an interaction near | |
the floor or vague space, you must infer a plausible object like a "table", "cabinet", or "storage box" instead. | |
- {Scene3DItemEnum.MANIPULATED_OBJS} and {Scene3DItemEnum.DISTRACTOR_OBJS} objects must be plausible, | |
and semantically compatible with the {Scene3DItemEnum.CONTEXT} and {Scene3DItemEnum.BACKGROUND}. | |
- {Scene3DItemEnum.DISTRACTOR_OBJS} must not confuse or overlap with the manipulated objects. | |
- {Scene3DItemEnum.DISTRACTOR_OBJS} number limit: {DISTRACTOR_NUM} distractors maximum. | |
- All {Scene3DItemEnum.BACKGROUND} are limited to indoor environments. | |
- {Scene3DItemEnum.MANIPULATED_OBJS} and {Scene3DItemEnum.DISTRACTOR_OBJS} are rigid bodies and not include flexible objects. | |
- {Scene3DItemEnum.MANIPULATED_OBJS} and {Scene3DItemEnum.DISTRACTOR_OBJS} must be common | |
household or office items or furniture, not abstract concepts, not too small like needle. | |
- If the input includes a plural or grouped object (e.g., "pens", "bottles", "plates", "fruit"), | |
you must decompose it into multiple individual instances (e.g., ["pen", "pen"], ["apple", "pear"]). | |
- Containers that hold objects (e.g., "bowl of apples", "box of tools") must | |
be separated into individual items (e.g., ["bowl", "apple", "apple"]). | |
- Do not include transparent objects such as "glass", "plastic", etc. | |
- The output must be in compact JSON format and use Markdown syntax, just like the output in the example below. | |
Examples: | |
Input: | |
"Pick up the marker from the table and put it in the bowl robot {RobotItemEnum.UR5}." | |
Output: | |
```json | |
{{ | |
"task_desc": "Pick up the marker from the table and put it in the bowl.", | |
"task": "pick and place", | |
"{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.UR5}", | |
"{Scene3DItemEnum.BACKGROUND}": "kitchen", | |
"{Scene3DItemEnum.CONTEXT}": "table", | |
"{Scene3DItemEnum.MANIPULATED_OBJS}": ["marker"], | |
"{Scene3DItemEnum.DISTRACTOR_OBJS}": ["mug", "notebook", "bowl"] | |
}} | |
``` | |
Input: | |
"Put the rubik's cube on the top of the shelf." | |
Output: | |
```json | |
{{ | |
"task_desc": "Put the rubik's cube on the top of the shelf.", | |
"task": "pick and place", | |
"{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.FRANKA}", | |
"{Scene3DItemEnum.BACKGROUND}": "bedroom", | |
"{Scene3DItemEnum.CONTEXT}": "shelf", | |
"{Scene3DItemEnum.MANIPULATED_OBJS}": ["rubik's cube"], | |
"{Scene3DItemEnum.DISTRACTOR_OBJS}": ["pen", "cup", "toy car"] | |
}} | |
``` | |
Input: | |
"Remove all the objects from the white basket and put them on the table." | |
Output: | |
```json | |
{{ | |
"task_desc": "Remove all the objects from the white basket and put them on the table, robot {RobotItemEnum.PIPER}.", | |
"task": "pick and place", | |
"{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.PIPER}", | |
"{Scene3DItemEnum.BACKGROUND}": "office", | |
"{Scene3DItemEnum.CONTEXT}": "table", | |
"{Scene3DItemEnum.MANIPULATED_OBJS}": ["banana", "mobile phone"], | |
"{Scene3DItemEnum.DISTRACTOR_OBJS}": ["plate", "white basket"] | |
}} | |
``` | |
Input: | |
"Pick up the rope on the chair and put it in the box." | |
Output: | |
```json | |
{{ | |
"task_desc": "Pick up the rope on the chair and put it in the box, robot {RobotItemEnum.FRANKA}.", | |
"task": "pick and place", | |
"{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.FRANKA}", | |
"{Scene3DItemEnum.BACKGROUND}": "living room", | |
"{Scene3DItemEnum.CONTEXT}": "chair", | |
"{Scene3DItemEnum.MANIPULATED_OBJS}": ["rope", "box"], | |
"{Scene3DItemEnum.DISTRACTOR_OBJS}": ["magazine"] | |
}} | |
``` | |
Input: | |
"Pick up the seal tape and plastic from the counter and put them in the open drawer and close it." | |
Output: | |
```json | |
{{ | |
"task_desc": "Pick up the seal tape and plastic from the counter and put them in the open drawer and close it.", | |
"task": "pick and place", | |
"robot": "franka", | |
"background": "kitchen", | |
"context": "counter", | |
"manipulated_objs": ["seal tape", "plastic", "opened drawer"], | |
"distractor_objs": ["scissors"] | |
}} | |
``` | |
Input: | |
"Put the pens in the grey bowl." | |
Output: | |
```json | |
{{ | |
"task_desc": "Put the pens in the grey bowl.", | |
"task": "pick and place", | |
"robot": "franka", | |
"background": "office", | |
"context": "table", | |
"manipulated_objs": ["pen", "pen", "grey bowl"], | |
"distractor_objs": ["notepad", "cup"] | |
}} | |
``` | |
""" | |
# System prompt for the "hierarchy" stage (used by LAYOUT_GRAPHER): given the
# disassembled scene description, asks the model for a parent -> children
# dictionary where each child is paired with one spatial relation.
# This is an f-string: Scene3DItemEnum, RobotItemEnum and SpatialRelationEnum
# members are substituted once, when the module is imported, and the doubled
# braces ({{ }}) produce the literal braces of the JSON examples.
# NOTE(review): every line of the literal ends with " | |", which looks like
# an extraction artifact that would be sent to the model verbatim - confirm
# against the upstream file before relying on this text.
# NOTE(review): the last example hard-codes the input key names ("robot",
# "background", ...) instead of interpolating the enum members like the other
# examples - confirm whether this is intended.
LAYOUT_HIERARCHY_PROMPT = f""" | |
You are a 3D scene layout reasoning expert. | |
Your task is to generate a spatial relationship dictionary in multiway tree | |
that describes how objects are arranged in a 3D environment | |
based on a given task description and object list. | |
Input in JSON format containing the task description, task type, | |
{Scene3DItemEnum.ROBOT}, {Scene3DItemEnum.BACKGROUND}, {Scene3DItemEnum.CONTEXT}, | |
and a list of objects, including {Scene3DItemEnum.MANIPULATED_OBJS} and {Scene3DItemEnum.DISTRACTOR_OBJS}. | |
### Supported Spatial Relations: | |
- "{SpatialRelationEnum.ON}": The child object bottom is directly on top of the parent object top. | |
- "{SpatialRelationEnum.INSIDE}": The child object is inside the context object. | |
- "{SpatialRelationEnum.IN}": The {Scene3DItemEnum.ROBOT} in the {Scene3DItemEnum.BACKGROUND}. | |
- "{SpatialRelationEnum.FLOOR}": The child object bottom is on the floor of the {Scene3DItemEnum.BACKGROUND}. | |
### Rules: | |
- The {Scene3DItemEnum.CONTEXT} object must be "{SpatialRelationEnum.FLOOR}" the {Scene3DItemEnum.BACKGROUND}. | |
- {Scene3DItemEnum.MANIPULATED_OBJS} and {Scene3DItemEnum.DISTRACTOR_OBJS} must be either | |
"{SpatialRelationEnum.ON}" or "{SpatialRelationEnum.INSIDE}" the {Scene3DItemEnum.CONTEXT} | |
- Or "{SpatialRelationEnum.FLOOR}" {Scene3DItemEnum.BACKGROUND}. | |
- Use "{SpatialRelationEnum.INSIDE}" only if the parent is a container-like object (e.g., shelf, rack, cabinet). | |
- Do not define relationship edges between objects, only for the child and parent nodes. | |
- {Scene3DItemEnum.ROBOT} must "{SpatialRelationEnum.IN}" the {Scene3DItemEnum.BACKGROUND}. | |
- Ensure that each object appears only once in the layout tree, and its spatial relationship is defined with only one parent. | |
- Ensure a valid multiway tree structure with a maximum depth of 2 levels suitable for a 3D scene layout representation. | |
- Only output the final output in JSON format, using Markdown syntax as in examples. | |
### Example | |
Input: | |
{{ | |
"task_desc": "Pick up the marker from the table and put it in the bowl.", | |
"task": "pick and place", | |
"{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.FRANKA}", | |
"{Scene3DItemEnum.BACKGROUND}": "kitchen", | |
"{Scene3DItemEnum.CONTEXT}": "table", | |
"{Scene3DItemEnum.MANIPULATED_OBJS}": ["marker", "bowl"], | |
"{Scene3DItemEnum.DISTRACTOR_OBJS}": ["mug", "chair"] | |
}} | |
Intermediate Think: | |
table {SpatialRelationEnum.FLOOR} kitchen | |
chair {SpatialRelationEnum.FLOOR} kitchen | |
{RobotItemEnum.FRANKA} {SpatialRelationEnum.IN} kitchen | |
marker {SpatialRelationEnum.ON} table | |
bowl {SpatialRelationEnum.ON} table | |
mug {SpatialRelationEnum.ON} table | |
Final Output: | |
```json | |
{{ | |
"kitchen": [ | |
["table", "{SpatialRelationEnum.FLOOR}"], | |
["chair", "{SpatialRelationEnum.FLOOR}"], | |
["{RobotItemEnum.FRANKA}", "{SpatialRelationEnum.IN}"] | |
], | |
"table": [ | |
["marker", "{SpatialRelationEnum.ON}"], | |
["bowl", "{SpatialRelationEnum.ON}"], | |
["mug", "{SpatialRelationEnum.ON}"] | |
] | |
}} | |
``` | |
Input: | |
{{ | |
"task_desc": "Put the marker on top of the book.", | |
"task": "pick and place", | |
"{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.UR5}", | |
"{Scene3DItemEnum.BACKGROUND}": "office", | |
"{Scene3DItemEnum.CONTEXT}": "desk", | |
"{Scene3DItemEnum.MANIPULATED_OBJS}": ["marker", "book"], | |
"{Scene3DItemEnum.DISTRACTOR_OBJS}": ["pen holder", "notepad"] | |
}} | |
Intermediate Think: | |
desk {SpatialRelationEnum.FLOOR} office | |
{RobotItemEnum.UR5} {SpatialRelationEnum.IN} office | |
marker {SpatialRelationEnum.ON} desk | |
book {SpatialRelationEnum.ON} desk | |
pen holder {SpatialRelationEnum.ON} desk | |
notepad {SpatialRelationEnum.ON} desk | |
Final Output: | |
```json | |
{{ | |
"office": [ | |
["desk", "{SpatialRelationEnum.FLOOR}"], | |
["{RobotItemEnum.UR5}", "{SpatialRelationEnum.IN}"] | |
], | |
"desk": [ | |
["marker", "{SpatialRelationEnum.ON}"], | |
["book", "{SpatialRelationEnum.ON}"], | |
["pen holder", "{SpatialRelationEnum.ON}"], | |
["notepad", "{SpatialRelationEnum.ON}"] | |
] | |
}} | |
``` | |
Input: | |
{{ | |
"task_desc": "Put the rubik's cube on the top of the shelf.", | |
"task": "pick and place", | |
"{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.UR5}", | |
"{Scene3DItemEnum.BACKGROUND}": "bedroom", | |
"{Scene3DItemEnum.CONTEXT}": "shelf", | |
"{Scene3DItemEnum.MANIPULATED_OBJS}": ["rubik's cube"], | |
"{Scene3DItemEnum.DISTRACTOR_OBJS}": ["toy car", "pen"] | |
}} | |
Intermediate Think: | |
shelf {SpatialRelationEnum.FLOOR} bedroom | |
{RobotItemEnum.UR5} {SpatialRelationEnum.IN} bedroom | |
rubik's cube {SpatialRelationEnum.INSIDE} shelf | |
toy car {SpatialRelationEnum.INSIDE} shelf | |
pen {SpatialRelationEnum.INSIDE} shelf | |
Final Output: | |
```json | |
{{ | |
"bedroom": [ | |
["shelf", "{SpatialRelationEnum.FLOOR}"], | |
["{RobotItemEnum.UR5}", "{SpatialRelationEnum.IN}"] | |
], | |
"shelf": [ | |
["rubik's cube", "{SpatialRelationEnum.INSIDE}"], | |
["toy car", "{SpatialRelationEnum.INSIDE}"], | |
["pen", "{SpatialRelationEnum.INSIDE}"] | |
] | |
}} | |
``` | |
Input: | |
{{ | |
"task_desc": "Put the marker in the cup on the counter.", | |
"task": "pick and place", | |
"robot": "franka", | |
"background": "kitchen", | |
"context": "counter", | |
"manipulated_objs": ["marker", "cup"], | |
"distractor_objs": ["plate", "spoon"] | |
}} | |
Intermediate Think: | |
counter {SpatialRelationEnum.FLOOR} kitchen | |
{RobotItemEnum.FRANKA} {SpatialRelationEnum.IN} kitchen | |
marker {SpatialRelationEnum.ON} counter | |
cup {SpatialRelationEnum.ON} counter | |
plate {SpatialRelationEnum.ON} counter | |
spoon {SpatialRelationEnum.ON} counter | |
Final Output: | |
```json | |
{{ | |
"kitchen": [ | |
["counter", "{SpatialRelationEnum.FLOOR}"], | |
["{RobotItemEnum.FRANKA}", "{SpatialRelationEnum.IN}"] | |
], | |
"counter": [ | |
["marker", "{SpatialRelationEnum.ON}"], | |
["cup", "{SpatialRelationEnum.ON}"], | |
["plate", "{SpatialRelationEnum.ON}"], | |
["spoon", "{SpatialRelationEnum.ON}"] | |
] | |
}} | |
``` | |
""" | |
# System prompt for the "describe" stage (used by LAYOUT_DESCRIBER): given a
# task description followed by an {object: role} mapping, asks the model for a
# JSON dictionary of short visual descriptions, one per object.
# Unlike the other two prompts this is a plain (non-f) string, so the single
# braces in the example are literal and nothing is interpolated.
# NOTE(review): every line of the literal ends with " | |", which looks like
# an extraction artifact that would be sent to the model verbatim - confirm
# against the upstream file before relying on this text.
LAYOUT_DESCRIBER_PROMPT = """ | |
You are a 3D asset style descriptor. | |
Given a task description and a dictionary where the key is the object content and | |
the value is the object type, output a JSON dictionary with each object paired | |
with a concise, styled visual description suitable for 3D asset generation. | |
Generation Guidelines: | |
- For each object, brainstorm multiple style candidates before selecting the final | |
description. Vary phrasing, material, texture, color, and spatial details. | |
- Each description must be a maximum of 15 words, including color, style, materials. | |
- Descriptions should be visually grounded, specific, and reflect surface texture and structure. | |
- For objects marked as "context", explicitly mention the object is standalone, has an empty top. | |
- Use rich style descriptors: e.g., "scratched brown wooden desk" etc. | |
- Ensure all object styles align with the task's overall context and environment. | |
Format your output in JSON like the example below. | |
Example Input: | |
"Pick up the rope on the chair and put it in the box. {'living room': 'background', 'chair': 'context', | |
'rope': 'manipulated_objs', 'box': 'manipulated_objs', 'magazine': 'distractor_objs'}" | |
Example Output: | |
```json | |
{ | |
"living room": "modern cozy living room with soft sunlight and light grey carpet", | |
"chair": "standalone dark oak chair with no surroundings and clean empty seat", | |
"rope": "twisted hemp rope with rough fibers and dusty beige texture", | |
"box": "slightly crumpled cardboard box with open flaps and brown textured surface", | |
"magazine": "celebrity magazine with glossy red cover and large bold title" | |
} | |
``` | |
""" | |
class LayoutDesigner(object): | |
def __init__( | |
self, | |
gpt_client: GPTclient, | |
system_prompt: str, | |
verbose: bool = False, | |
) -> None: | |
self.prompt = system_prompt.strip() | |
self.verbose = verbose | |
self.gpt_client = gpt_client | |
def query(self, prompt: str, params: dict = None) -> str: | |
full_prompt = self.prompt + f"\n\nInput:\n\"{prompt}\"" | |
response = self.gpt_client.query( | |
text_prompt=full_prompt, | |
params=params, | |
) | |
if self.verbose: | |
logger.info(f"Response: {response}") | |
return response | |
def format_response(self, response: str) -> dict: | |
cleaned = re.sub(r"^```json\s*|\s*```$", "", response.strip()) | |
try: | |
output = json.loads(cleaned) | |
except json.JSONDecodeError as e: | |
raise json.JSONDecodeError( | |
f"Error: {e}, failed to parse JSON response: {response}" | |
) | |
return output | |
def format_response_repair(self, response: str) -> dict: | |
return json_repair.loads(response) | |
def save_output(self, output: dict, save_path: str) -> None: | |
os.makedirs(os.path.dirname(save_path), exist_ok=True) | |
with open(save_path, 'w') as f: | |
json.dump(output, f, indent=4) | |
def __call__( | |
self, prompt: str, save_path: str = None, params: dict = None | |
) -> dict | str: | |
response = self.query(prompt, params=params) | |
output = self.format_response_repair(response) | |
self.save_output(output, save_path) if save_path else None | |
return output | |
# One designer per pipeline stage, created at import time. All three use the
# same GPT_CLIENT and differ only in their system prompt.

# Stage 1: free-text task description -> structured scene fields.
LAYOUT_DISASSEMBLER = LayoutDesigner(
    system_prompt=LAYOUT_DISASSEMBLE_PROMPT,
    gpt_client=GPT_CLIENT,
)

# Stage 2: structured scene fields -> parent/child spatial tree.
LAYOUT_GRAPHER = LayoutDesigner(
    system_prompt=LAYOUT_HIERARCHY_PROMPT,
    gpt_client=GPT_CLIENT,
)

# Stage 3: task text plus object/role mapping -> per-object descriptions.
LAYOUT_DESCRIBER = LayoutDesigner(
    system_prompt=LAYOUT_DESCRIBER_PROMPT,
    gpt_client=GPT_CLIENT,
)
def build_scene_layout(
    task_desc: str, output_path: str = None, gpt_params: dict = None
) -> LayoutInfo:
    """Run the three layout stages for a task and bundle the results.

    The task is disassembled into scene fields, arranged into a spatial
    tree, and each object is given a visual description.

    Args:
        task_desc: Natural-language description of the robotic task.
        output_path: If not None, the scene tree is rendered to this path.
        gpt_params: Optional parameters forwarded to every GPT query.

    Returns:
        A ``LayoutInfo`` built from the tree, the disassembled relation,
        the object descriptions and the object mapping.
    """
    relation = LAYOUT_DISASSEMBLER(task_desc, params=gpt_params)
    tree = LAYOUT_GRAPHER(relation, params=gpt_params)
    mapping = Scene3DItemEnum.object_mapping(relation)

    # The describer receives the task text followed by the object mapping.
    task_text = relation["task_desc"]
    descriptions = LAYOUT_DESCRIBER(
        f"{task_text} {mapping}", params=gpt_params
    )

    scene = LayoutInfo(tree, relation, descriptions, mapping)
    if output_path is None:
        return scene

    SceneTreeVisualizer(scene).render(save_path=output_path)
    logger.info(f"Scene hierarchy tree saved to {output_path}")
    return scene
def parse_args():
    """Parse the command-line options of the layout designer script.

    Returns:
        The parsed namespace with string attributes ``task_desc`` and
        ``save_root``.
    """
    cli = argparse.ArgumentParser(description="3D Scene Layout Designer")
    # (flag, default, help) for every string-valued option.
    option_table = (
        (
            "--task_desc",
            "Put the apples on the table on the plate",
            "Natural language description of the robotic task",
        ),
        (
            "--save_root",
            "outputs/layout_tree",
            "Path to save the layout output",
        ),
    )
    for flag, default_value, help_text in option_table:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)
    return cli.parse_args()
if __name__ == "__main__":
    # Script entry point: run the three layout stages for the task given on
    # the command line, then save the rendered tree and the layout JSON
    # under --save_root.
    args = parse_args()
    # Sampling parameters forwarded unchanged to every GPT query.
    params = {
        "temperature": 1.0,
        "top_p": 0.95,
        "frequency_penalty": 0.3,
        "presence_penalty": 0.5,
    }

    layout_relation = LAYOUT_DISASSEMBLER(args.task_desc, params=params)
    layout_tree = LAYOUT_GRAPHER(layout_relation, params=params)
    object_mapping = Scene3DItemEnum.object_mapping(layout_relation)
    obj_prompt = f'{layout_relation["task_desc"]} {object_mapping}'
    objs_desc = LAYOUT_DESCRIBER(obj_prompt, params=params)

    # Pass the object mapping as build_scene_layout does; it was previously
    # computed here but left out of the saved layout.
    layout_info = LayoutInfo(
        layout_tree, layout_relation, objs_desc, object_mapping
    )
    visualizer = SceneTreeVisualizer(layout_info)

    os.makedirs(args.save_root, exist_ok=True)
    scene_graph_path = f"{args.save_root}/scene_tree.jpg"
    visualizer.render(save_path=scene_graph_path)
    with open(
        f"{args.save_root}/layout.json", "w", encoding="utf-8"
    ) as f:
        json.dump(layout_info.to_dict(), f, indent=4)

    print(f"Scene hierarchy tree saved to {scene_graph_path}")
    print(f"Disassembled Layout: {layout_relation}")
    print(f"Layout Graph: {layout_tree}")
    print(f"Layout Descriptions: {objs_desc}")