diff --git a/.gitignore b/.gitignore index 7c7caf0b426a2a32d6790fafa248b5bb46aaffa1..af7dc1d34ed0249d1c79e0cbe1c7bcece5b3a23e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,9 @@ *.gif *.bmp *.tiff -*.ico \ No newline at end of file +*.ico +*.log +.gradio/ +__pycache__/ +.env +.venv/ \ No newline at end of file diff --git a/BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/task.py b/BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/task.py index ea227bd833e4965f2ebb07a09276eda6ff0a1bf4..e7789df5bd7335d4077138c4a06ae29c2c345e1e 100644 --- a/BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/task.py +++ b/BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/task.py @@ -108,7 +108,7 @@ class AssistantBenchTask(AbstractBrowserTask): def setup(self, page: Page) -> Tuple[str, dict]: logger.info(f"Navigating to start url: {self.start_url}") - page.goto(self.start_url, timeout=10000) + page.goto(self.start_url, timeout=50000) if self.save_predictions and self.output_file: # create an empty task entry in the output file (will raise an Exception if the entry is already there) add_prediction_to_jsonl( diff --git a/BrowserGym/browsergym/browsergym.egg-info/PKG-INFO b/BrowserGym/browsergym/browsergym.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..b5232ec10893f958362c04540dd85679441ddfa6 --- /dev/null +++ b/BrowserGym/browsergym/browsergym.egg-info/PKG-INFO @@ -0,0 +1,22 @@ +Metadata-Version: 2.4 +Name: browsergym +Version: 0.13.4 +Summary: BrowserGym: a gym environment for web task automation in the Chromium browser +Author: Rim Assouel, Léo Boisvert, Massimo Caccia, Alex Drouin, Maxime Gasse, Imene Kerboua, Alex Lacoste, Thibault Le Sellier De Chezelles, Tom Marty, Aman Jaiswal +License: Apache-2.0 +Classifier: Development Status :: 3 - Alpha +Classifier: Programming Language :: Python :: 3 +Classifier: Operating System :: OS Independent +Classifier: Intended Audience :: Science/Research +Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence +Classifier: License :: OSI Approved :: Apache Software License +Requires-Python: >3.10 +Description-Content-Type: text/markdown +Requires-Dist: browsergym-core==0.13.4 +Requires-Dist: browsergym-miniwob==0.13.4 +Requires-Dist: browsergym-webarena==0.13.4 +Requires-Dist: browsergym-visualwebarena==0.13.4 +Requires-Dist: browsergym-assistantbench==0.13.4 +Requires-Dist: browsergym-experiments==0.13.4 +Requires-Dist: browsergym-workarena>=0.4.1 +Requires-Dist: weblinx-browsergym>=0.0.2 diff --git a/BrowserGym/browsergym/browsergym.egg-info/SOURCES.txt b/BrowserGym/browsergym/browsergym.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a1959829995e91d796e4e636baa7df08324d88b --- /dev/null +++ b/BrowserGym/browsergym/browsergym.egg-info/SOURCES.txt @@ -0,0 +1,6 @@ +pyproject.toml +browsergym.egg-info/PKG-INFO +browsergym.egg-info/SOURCES.txt +browsergym.egg-info/dependency_links.txt +browsergym.egg-info/requires.txt +browsergym.egg-info/top_level.txt \ No newline at end of file diff --git a/BrowserGym/browsergym/browsergym.egg-info/dependency_links.txt b/BrowserGym/browsergym/browsergym.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/BrowserGym/browsergym/browsergym.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/BrowserGym/browsergym/browsergym.egg-info/requires.txt 
b/BrowserGym/browsergym/browsergym.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..397e6ed00eb2e3ba721add7562967fe6b9f5d6a6 --- /dev/null +++ b/BrowserGym/browsergym/browsergym.egg-info/requires.txt @@ -0,0 +1,8 @@ +browsergym-core==0.13.4 +browsergym-miniwob==0.13.4 +browsergym-webarena==0.13.4 +browsergym-visualwebarena==0.13.4 +browsergym-assistantbench==0.13.4 +browsergym-experiments==0.13.4 +browsergym-workarena>=0.4.1 +weblinx-browsergym>=0.0.2 diff --git a/BrowserGym/browsergym/browsergym.egg-info/top_level.txt b/BrowserGym/browsergym/browsergym.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/BrowserGym/browsergym/browsergym.egg-info/top_level.txt @@ -0,0 +1 @@ + diff --git a/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/__init__.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac48ca36a3657ebc6c3e1805cd75206bc5dbdaa8 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/__init__.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/chat.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/chat.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0d8ec011e76a0f151246c45d88d2e49f12e5e97 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/chat.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/constants.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/constants.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44a217d884a8d82d667b94b758e2689df5897716 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/constants.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/env.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..587aec300df481bd2d8dd4ee19092655d09cc1c4 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/env.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/observation.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/observation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bfa2812bf29c1773388f909db6a2efc6bfaa4fac Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/observation.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/registration.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/registration.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8af44fa7accbadcb5ac728946e3bad6525a498f5 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/registration.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/spaces.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/spaces.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ff8e02adb910bb7bba74e3beaedbbdaff9a657c3 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/spaces.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/task.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/task.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1132b8abf357059ff4265122e61411e4396efe5d Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/__pycache__/task.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/__init__.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85f03d3032be255684ddd060f7f3d7b331f7c016 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/__init__.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/base.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de29f75bcd8d16ff09c241f3210edcff70f1e959 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/base.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/functions.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/functions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3ea9bfd467c7deecf0db6e02e377b7979be947a Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/functions.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/highlevel.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/highlevel.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..613859a26416208e722f9cb20c508187b0de1c80 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/highlevel.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/parsers.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/parsers.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc26207c25983510e3814d7f1792543360e6b683 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/parsers.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/utils.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40582178b2aaf59e30472482bd419bde341bb681 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/utils.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/core/src/browsergym/core/env.py b/BrowserGym/browsergym/core/src/browsergym/core/env.py index 115eb8c60a3cabae33af00224e9b9a366d477779..50b949944bdddc2480c9c76de2e2cc98cd0670b6 100644 --- a/BrowserGym/browsergym/core/src/browsergym/core/env.py +++ b/BrowserGym/browsergym/core/src/browsergym/core/env.py @@ -27,6 +27,7 @@ from 
.observation import ( ) from .spaces import AnyBox, AnyDict, Float, Unicode from .task import AbstractBrowserTask +from ..utils.obs import overlay_som, flatten_axtree_to_str logger = logging.getLogger(__name__) @@ -602,6 +603,8 @@ document.addEventListener("visibilitychange", () => { _post_extract(self.page) # obs is generic to all tasks + screenshot_np_array = extract_screenshot(self.page) + som_screenshot_np_array = overlay_som(screenshot_np_array, extra_properties) obs = { "chat_messages": tuple(copy.deepcopy(self.chat.messages)), "goal": _try_to_extract_legacy_goal(self.goal_object), # legacy goal, deprecated @@ -612,7 +615,7 @@ document.addEventListener("visibilitychange", () => { "open_pages_titles": tuple(page.title() for page in self.context.pages), "active_page_index": np.asarray([self.context.pages.index(self.page)]), "url": self.page.url, # redundant with "open_pages_urls" and "active_page_index" - "screenshot": extract_screenshot(self.page), + "som_screenshot": som_screenshot_np_array, "dom_object": dom, "axtree_object": axtree, "extra_element_properties": extra_properties, diff --git a/BrowserGym/browsergym/core/src/browsergym/core/task.py b/BrowserGym/browsergym/core/src/browsergym/core/task.py index 262f829c0c0f3b2caaf1c99471ba3dd50b31ebf2..5c5b2eb3714f5d7b5dd60920a239d4de3efbc781 100644 --- a/BrowserGym/browsergym/core/src/browsergym/core/task.py +++ b/BrowserGym/browsergym/core/src/browsergym/core/task.py @@ -92,7 +92,7 @@ class OpenEndedTask(AbstractBrowserTask): self.goal = goal def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: - page.goto(self.start_url, timeout=10000) + page.goto(self.start_url, timeout=50000) return self.goal, {} def teardown(self) -> None: diff --git a/BrowserGym/browsergym/core/src/browsergym/utils/__pycache__/obs.cpython-311.pyc b/BrowserGym/browsergym/core/src/browsergym/utils/__pycache__/obs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0b3eac1d628ac41cf263772d9a0d67b08fedfd4 Binary files /dev/null and b/BrowserGym/browsergym/core/src/browsergym/utils/__pycache__/obs.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/__init__.cpython-311.pyc b/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..764bade1f557180b93b6fbf20b04435996bcd14c Binary files /dev/null and b/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/__init__.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/agent.cpython-311.pyc b/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63fd26332bfdad9451fb6dbdbe050fff88cc0df2 Binary files /dev/null and b/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/agent.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/loop.cpython-311.pyc b/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/loop.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..318c668ad354c6ba3ad5a746e65dfb8f49349965 Binary files /dev/null and b/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/loop.cpython-311.pyc differ diff --git 
a/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/utils.cpython-311.pyc b/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8c5c6b6b0af83ec3c5253338ccc800ff8618780 Binary files /dev/null and b/BrowserGym/browsergym/experiments/src/browsergym/experiments/__pycache__/utils.cpython-311.pyc differ diff --git a/BrowserGym/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py b/BrowserGym/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py index 77c0dd4028166cef389a19ea94412f1e625c2491..41d2d304f8f0b08a5e0a1477c03349fb0d6b2bd5 100644 --- a/BrowserGym/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py +++ b/BrowserGym/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py @@ -109,7 +109,7 @@ class GenericVisualWebArenaTask(AbstractBrowserTask): # task properties, will be used to set up the browsergym environment self.viewport = {"width": 1280, "height": 720} self.slow_mo = 1000 # ms - self.timeout = 10000 # ms + self.timeout = 50000 # ms self.webarena_instance = VisualWebArenaInstance() self.config_file: str = None diff --git a/BrowserGym/browsergym/webarena/src/browsergym/webarena/task.py b/BrowserGym/browsergym/webarena/src/browsergym/webarena/task.py index 3467c15209b849969541e4289583d35099abdd85..182a1137c6d608f546aeefdcbd720743ea13d21c 100644 --- a/BrowserGym/browsergym/webarena/src/browsergym/webarena/task.py +++ b/BrowserGym/browsergym/webarena/src/browsergym/webarena/task.py @@ -34,7 +34,7 @@ class GenericWebArenaTask(AbstractBrowserTask): # task properties, will be used to set up the browsergym environment self.viewport = {"width": 1280, "height": 720} self.slow_mo = 1000 # ms - self.timeout = 10000 # ms + self.timeout = 50000 # ms self.webarena_instance = WebArenaInstance() self.config_file: str = None diff --git a/Dockerfile b/Dockerfile index 8246b161f5c1d54bf8e19b0dbd8c7a0ee432818e..70362cd36f6c915ca8f0aa143962670fad4779df 100644 --- a/Dockerfile +++ b/Dockerfile @@ -56,6 +56,11 @@ RUN curl -fsSL https://dl.google.com/linux/linux_signing_key.pub | gpg --dearmor # Set up working directory WORKDIR /app +COPY templates/ templates/ +COPY browser_agent.py . +COPY process_run.py . +COPY utils.py . + # Copy requirements and install Python dependencies COPY requirements.txt . 
RUN pip install --no-cache-dir -r requirements.txt @@ -94,6 +99,8 @@ ENV RESOLUTION_HEIGHT=1080 # COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf # EXPOSE 7788 6080 5900 +EXPOSE 7860 +ENV GRADIO_SERVER_NAME="0.0.0.0" # CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] -# RUN python3 app.py \ No newline at end of file +CMD ["python", "app.py"] \ No newline at end of file diff --git a/agent/__init__.py b/agent/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/agent/checklist.py b/agent/checklist.py new file mode 100644 index 0000000000000000000000000000000000000000..b469ddafb2a0215960c9af75d865f3ee385c583e --- /dev/null +++ b/agent/checklist.py @@ -0,0 +1,18 @@ +from .mini_bench.agent import ChecklistGenerationAgent + +def generate_checklist(**data): + # data: 'intent', 'start_url', 'text_observation' + agent_config = { + 'model_name': 'WPRM/qwen-3b-ar-reward-cot-mtl-checklist-enhanced', + 'base_url': 'http://165.132.144.84:7701/v1', + 'api_key': 'empty', + 'temperature': 0.7, + 'use_log_probs': True, + 'use_checklist': True, + 'use_multimodal': False, + 'num_generate': 1, + } + checklist_generation_agent = ChecklistGenerationAgent(agent_config) + response_list, cost = checklist_generation_agent.generate_response(data, prompt_type='ours', constraint_str_list=["", "", "", ""]) + response = response_list[0] + return response.split("")[-1].split("")[0].strip() diff --git a/agent/mini_bench/__init__.py b/agent/mini_bench/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/agent/mini_bench/__pycache__/__init__.cpython-311.pyc b/agent/mini_bench/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7905063310818854817c8a42f6668ac29652395 Binary files /dev/null and b/agent/mini_bench/__pycache__/__init__.cpython-311.pyc differ diff --git a/agent/mini_bench/__pycache__/agent.cpython-311.pyc b/agent/mini_bench/__pycache__/agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06d33c0998bb81fc3f60971b223755623745ad00 Binary files /dev/null and b/agent/mini_bench/__pycache__/agent.cpython-311.pyc differ diff --git a/agent/mini_bench/__pycache__/reward_agent.cpython-311.pyc b/agent/mini_bench/__pycache__/reward_agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..122657f6884e90f99ed6a82a6a3b0eca24c76fea Binary files /dev/null and b/agent/mini_bench/__pycache__/reward_agent.cpython-311.pyc differ diff --git a/agent/mini_bench/agent.py b/agent/mini_bench/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..5354363bbf776d794febd5c5a6953ed48d7e73ea --- /dev/null +++ b/agent/mini_bench/agent.py @@ -0,0 +1,467 @@ +from abc import ABC, abstractmethod +import time +import requests +import json +import math +from langsmith import Client +from langchain_openai import ChatOpenAI + +from .prompts import get_messages +from .prompts.judge_prompt import ( + JUDGE_OURS_BT_MODELING_PROMPT_TEMPLATE, + JUDGE_OURS_BT_MODELING_WO_CHECKLIST_PROMPT_TEMPLATE, + JUDGE_OURS_BT_MODELING_MULTIMODAL_PROMPT_TEMPLATE, + JUDGE_OURS_BT_MODELING_MULTIMODAL_WO_CHECKLIST_PROMPT_TEMPLATE +) +from .prompts.image_utils import image_to_base64_url + +MAX_RETRY = 3 +RETRY_SLEEP = 5 +MODEL_COST_MAPPING = { + "gpt-4o-mini": { + "input_token_cost": 0.15, + "output_token_cost": 0.6 + }, + 
"gpt-4o": { + "input_token_cost": 2.5, + "output_token_cost": 10 + }, +} + + +class Agent(ABC): + @abstractmethod + def generate_response(self, inputs: dict) -> str: + pass + +class BaseAgent(Agent): + def __init__(self, agent_config: dict): + self.agent_config = agent_config + self._setup() + + def _setup(self): + use_log_probs = self.agent_config.get("use_log_probs", False) + if use_log_probs: + self.llm = ChatOpenAI( + model=self.agent_config["model_name"], + base_url=self.agent_config["base_url"], + api_key=self.agent_config["api_key"], + temperature=self.agent_config["temperature"], + timeout=300, + logprobs=True, + top_logprobs=10 + ) + else: + self.llm = ChatOpenAI( + model=self.agent_config["model_name"], + base_url=self.agent_config["base_url"], + api_key=self.agent_config["api_key"], + temperature=self.agent_config["temperature"], + timeout=300 + ) + self.temperature = self.agent_config["temperature"] + self.num_generate = self.agent_config["num_generate"] + self.use_checklist = self.agent_config.get("use_checklist", False) + self.use_multimodal = self.agent_config.get("use_multimodal", False) + + # setup cost + model_cost = MODEL_COST_MAPPING.get(self.agent_config["model_name"], None) + if model_cost and "api" in self.agent_config["base_url"]: + self.input_token_cost = model_cost["input_token_cost"] + self.output_token_cost = model_cost["output_token_cost"] + else: + self.input_token_cost = 0.0 + self.output_token_cost = 0.0 + + def generate_with_retry(self, model_input, constraint_str_list: list = None): + total_input_tokens = 0 + total_output_tokens = 0 + if self.temperature == 0: + response = self.llm.invoke(model_input) + total_input_tokens += response.response_metadata["token_usage"]["prompt_tokens"] + total_output_tokens += response.response_metadata["token_usage"]["completion_tokens"] + else: + for i in range(MAX_RETRY): + try: + response = self.llm.invoke(model_input) + total_input_tokens += response.response_metadata["token_usage"]["prompt_tokens"] + total_output_tokens += response.response_metadata["token_usage"]["completion_tokens"] + if constraint_str_list: + pass_constraint_num = 0 + for constraint_str in constraint_str_list: + if constraint_str in response.content: + pass_constraint_num += 1 + if pass_constraint_num == len(constraint_str_list): + break + else: + print(f"Agent has fomat issue, retry... 
{i+1}/{MAX_RETRY}") + print(response.content) + else: + break + except Exception as e: + print(f"Agent returned an Error: {e}") + response = None + time.sleep(RETRY_SLEEP) + + cost = self.input_token_cost * total_input_tokens / 1000000 + self.output_token_cost * total_output_tokens / 1000000 + + if response is None: + return "", cost + else: + return response.content, cost + + def prepare_message(self, model_input: dict, prompt_type: str): + message = [] + return message + + def generate_response(self, model_input: dict, prompt_type: str, constraint_str_list: list = None,): + total_cost = 0 + response_list = [] + # prepare message + message = self.prepare_message(model_input, prompt_type) + # print(message) + + # n sampling + for i in range(self.num_generate): + response, cost = self.generate_with_retry(message, constraint_str_list) + response_list.append(response) + total_cost += cost + + return response_list, total_cost + + +class GroundingJudgeAgent(BaseAgent): + def __init__(self, agent_config: dict): + super().__init__(agent_config) + self._setup() + + def prepare_message(self, model_input: dict, prompt_type): + message = get_messages( + input_info=model_input, + inference_mode="judge_grounding", + prompt_type=prompt_type, + use_multimodal=self.use_multimodal, + text_obs=self.agent_config["text_obs_type"], + image_obs=self.agent_config["image_obs_type"] + ) + return message + + +class ProgressJudgeAgent(BaseAgent): + def __init__(self, agent_config: dict): + super().__init__(agent_config) + self._setup() + + def prepare_message(self, model_input: dict, prompt_type): + if self.agent_config["input_type"]=="text_only": + use_multimodal = False + text_obs = self.agent_config["text_obs_type"] + image_obs = None + elif self.agent_config["input_type"]=="image_only": + use_multimodal = True + text_obs = None + image_obs = self.agent_config["image_obs_type"] + elif self.agent_config["input_type"]=="text_image": + use_multimodal = True + text_obs = self.agent_config["text_obs_type"] + image_obs = self.agent_config["image_obs_type"] + else: + raise ValueError(f"Invalid input type: {self.agent_config['input_type']}") + + if self.agent_config["use_in_progress"]: + use_in_progress = True + else: + use_in_progress = False + + message = get_messages( + input_info=model_input, + inference_mode="judge_progress", + prompt_type=prompt_type, + use_checklist=self.use_checklist, + use_multimodal=use_multimodal, + text_obs=text_obs, + image_obs=image_obs, + use_in_progress=use_in_progress + ) + return message + + def add_logprob(self, ori_logprob: float, add_logprob: float): + if ori_logprob is None: + return add_logprob + else: + ori_prob = math.exp(ori_logprob) + add_prob = math.exp(add_logprob) + return math.log(ori_prob + add_prob) + + def get_judge_probs(self, logprobs: list): + # target_judge = { + # "yes": [" Yes", "Yes"], + # "no": [" No", "No"], + # "in": [" In", "In"] + # } + target_judge = { + "yes": [ + " Yes", "ĠYes", "Yes", "ĊYes", + "Ġyes", "yes", "Ċyes", + "ĠYES", "YES", "ĊYES", + "ĠDone", "Done", "ĊDone", + "ĠCompleted", "Completed", "ĊCompleted", + "ĠCorrect", "Correct", "ĊCorrect" + ], + "no": [ + " No", "ĠNo", "No", "ĊNo", + "ĠNO", "NO", "ĊNO", + "ĠNot", "Not", "ĊNot", + "ĠNone", "None", "ĊNone", + "ĠNope", "Nope", "ĊNope", + "ĠUn", "Un", "ĊUn", + "ĠWrong", "Wrong", "ĊWrong" + ], + "in": [ + " In", "ĠIn", "In", "ĊIn", + "ĠPending", "Pending", "ĊPending", + "ĠPart", "Part", "ĊPart", + "ĠPartial", "Partial", "ĊPartial", + "ĠInProgress", "InProgress", "ĊInProgress" + ] + } + response_str = 
"" + judge_probs_list = [] + # print(logprobs) + for i, log_prob in enumerate(logprobs): + # Start to find judge string + if "" in response_str: + find_judge_str = None + for judge_type in target_judge: + if log_prob["token"] in target_judge[judge_type]: + # print(log_prob) + find_judge_str = judge_type + break + if find_judge_str: + # print("find judge str") + token_judge_dict = { + "yes": None, + "no": None, + "in": None + } + if "top_logprobs" in log_prob: + for token_info in log_prob["top_logprobs"]: + for judge_type in target_judge: + for judge_str in target_judge[judge_type]: + # if judge_str in token_info["token"] and token_info["logprob"] > token_judge_dict[judge_type]: + # token_judge_dict[judge_type] = token_info["logprob"] + if judge_str in token_info["token"]: + # print(token_info["logprob"]) + token_judge_dict[judge_type] = self.add_logprob(token_judge_dict[judge_type], token_info["logprob"]) + # for None case + for judge_type in token_judge_dict: + if token_judge_dict[judge_type] is None: + token_judge_dict[judge_type] = float("-inf") + judge_probs_list.append(token_judge_dict) + else: + # for vllm bugs : no top_logprobs + for judge_type in token_judge_dict: + if judge_type == find_judge_str: + token_judge_dict[judge_type] = log_prob["logprob"] + else: + token_judge_dict[judge_type] = float("-inf") + judge_probs_list.append(token_judge_dict) + # print(token_judge_dict) + + if "" in response_str: + break + + response_str += log_prob["token"] + # print(response_str.replace("Ġ", " ").replace("Ċ", "\n")) + # print(judge_probs_list) + if len(judge_probs_list) == 0: + return [{ + "yes": 0.0, + "no": 0.0, + "in": 0.0 + }] + else: + # convert with softmax + final_judge_probs_list = [] + for judge_probs in judge_probs_list: + exp_logprobs = [math.exp(x) for x in [judge_probs["yes"], judge_probs["no"], judge_probs["in"]]] + sum_exp_logprobs = sum(exp_logprobs) + softmax_probs = [x / sum_exp_logprobs for x in exp_logprobs] + final_judge_probs_list.append({ + "yes": softmax_probs[0], + "no": softmax_probs[1], + "in": softmax_probs[2] + }) + return final_judge_probs_list + + def generate_probs(self, model_input: dict, prompt_type: str): + total_cost = 0 + response_list = [] + # prepare message + message = self.prepare_message(model_input, prompt_type) + # print(message) + + for i in range(self.num_generate): + try: + response = self.llm.invoke(message) + total_input_tokens = response.response_metadata["token_usage"]["prompt_tokens"] + total_output_tokens = response.response_metadata["token_usage"]["completion_tokens"] + total_cost = self.input_token_cost * total_input_tokens / 1000000 + self.output_token_cost * total_output_tokens / 1000000 + logprobs = response.response_metadata["logprobs"]["content"] + response_list.append( + { + "response": response.content, + "judge_probs": self.get_judge_probs(logprobs) + } + ) + except Exception as e: + print(f"Error: {e}") + # print(response.response_metadata["logprobs"]) + response_list.append( + { + "response": response.content, + "judge_probs": [] + } + ) + return response_list, total_cost + + +class ChecklistGenerationAgent(BaseAgent): + def __init__(self, agent_config: dict): + super().__init__(agent_config) + self._setup() + + def prepare_message(self, model_input: dict, prompt_type): + message = get_messages( + input_info=model_input, + inference_mode="checklist_generation", + prompt_type=prompt_type + ) + return message + + +class ClassifierRewardAgent(Agent): + def __init__(self, url: str, use_checklist: bool = False, use_multimodal: bool 
= False): + self.url = url + self.use_checklist = use_checklist + self.use_multimodal = use_multimodal + + def _process_multimodal_message(self, prompt: str, image_list: list[str]): + multimodal_message = [] + text_prompt_prefix = prompt.split("")[0] + text_prompt_suffix = prompt.split("")[1] + multimodal_message = [ + {"type": "text", "text": text_prompt_prefix}, + # {"type": "image_url", "image_url": {"url": image_to_base64_url(image_list[0])}}, + {"type": "image", "image": image_to_base64_url(image_list[0])}, + {"type": "text", "text": text_prompt_suffix} + ] + return multimodal_message + + def _make_query(self, user_prompt_template: dict, model_input: dict | list[dict]): + if self.use_multimodal: + tmp_user_prompt = user_prompt_template["user"].format( + **model_input + ) + user_prompt = self._process_multimodal_message(tmp_user_prompt, model_input["image_list"]) + else: + user_prompt = user_prompt_template["user"].format( + **model_input + ) + assistant_prompt = user_prompt_template["assistant"].format( + **model_input + ) + query = [ + {"role": "user", "content": user_prompt}, + {"role": "assistant", "content": assistant_prompt} + ] + return query + + def prepare_message(self, model_input: dict | list[dict], batch: bool = False): + if self.use_checklist: + if self.use_multimodal: + user_prompt_template = JUDGE_OURS_BT_MODELING_MULTIMODAL_PROMPT_TEMPLATE + else: + user_prompt_template = JUDGE_OURS_BT_MODELING_PROMPT_TEMPLATE + else: + if self.use_multimodal: + user_prompt_template = JUDGE_OURS_BT_MODELING_MULTIMODAL_WO_CHECKLIST_PROMPT_TEMPLATE + else: + user_prompt_template = JUDGE_OURS_BT_MODELING_WO_CHECKLIST_PROMPT_TEMPLATE + + if self.use_multimodal: + if batch: + message = [self._make_query(user_prompt_template, input) for input in model_input] + else: + message = [self._make_query(user_prompt_template, model_input)] + else: + if batch: + message = { + "query": [self._make_query(user_prompt_template, input) for input in model_input], + "prompts": [] + } + else: + message = { + "query": self._make_query(user_prompt_template, model_input), + "prompts": [] + } + + return message + + def get_rm_scroe(self, message: dict | list): + headers = {"Content-Type": "application/json"} + + try: + if self.use_multimodal: + response = requests.post( + self.url, + json={"messages": message}, + timeout=600 + ) + else: + response = requests.post( + self.url, + headers=headers, + data=json.dumps(message), + timeout=300 + ) + response.raise_for_status() + + response_json = response.json() + + if "rewards" not in response_json: + print(f"Error: 'rewards' key not found in API response: {response_json}") + return [] + + if "get_reward" in self.url: + # use openrlhf + return response_json["rewards"] + elif "pooling" in self.url: + # use vllm server + return response_json["reward"] + else: + # error + raise ValueError(f"Invalid URL: {self.url}") + + except requests.exceptions.Timeout: + print(f"Error: Request timed out to {self.url}") + return [] + except requests.exceptions.RequestException as e: + print(f"Error during request to {self.url}: {e}") + return [] + except json.JSONDecodeError: + print(f"Error: Failed to decode JSON response from {self.url}") + return [] + except KeyError as e: + print(f"Error: Missing key {e} in response from {self.url}") + return [] + + + def generate_response(self, model_input: dict | list[dict], batch: bool = False): + if batch: + message = self.prepare_message(model_input, batch=True) + else: + message = self.prepare_message(model_input) + rewards = 
self.get_rm_scroe(message) + + return rewards, 0 \ No newline at end of file diff --git a/agent/mini_bench/checklist_eval.py b/agent/mini_bench/checklist_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..c3b5a5ddcd7eb0c8efbdc09875391dc8751c0953 --- /dev/null +++ b/agent/mini_bench/checklist_eval.py @@ -0,0 +1,95 @@ +import re + +from langchain_openai import ChatOpenAI + +from .agent import BaseAgent + +SYSTEM_PROMPT = "You are an expert evaluator. Your task is to assess how well a Web Agent’s generated checklist aligns with the reference checklist for a given user instruction." + +USER_PROMPT = """# Task Description +Use the provided task description, evaluation criteria, and both checklists to assign a score from 1 to 5. Justify your rating with a brief explanation that considers both content overlap and logical structure. + +## Score Criteria +- 5: Checklist covers all subgoals, is correct and clearly expressed +- 4: Minor omissions or phrasing issues but mostly accurate and complete +- 3: Partially matches, but with noticeable gaps or errors +- 2: Incomplete or includes incorrect steps +- 1: Mostly irrelevant, incorrect, or missing the task goal + +## User Instruction: +{intent} + +## Reference Checklist: +{gt_checklist} + +## Agent’s Generated Checklist: +{generated_checklist} + +# Output Format +Your response should be in the following format: +REASON: [Write 2–4 sentences explaining how well the generated checklist matches the reference. Mention specific matches, omissions, errors, or strengths.] +SCORE: [1–5] +""" + + +class ChecklistEvalAgent(BaseAgent): + def __init__(self, agent_config: dict): + super().__init__(agent_config) + self._setup() + + def prepare_message(self, model_input: dict, prompt_type): + message = [ + { + "role": "system", + "content": SYSTEM_PROMPT + }, + { + "role": "user", + "content": USER_PROMPT.format( + intent=model_input["intent"], + gt_checklist=model_input["gt_checklist"], + generated_checklist=model_input["generated_checklist"] + ) + } + ] + return message + + def generate_response(self, model_input: dict): + total_cost = 0 + response_list = [] + # prepare message + message = self.prepare_message(model_input) + + # n sampling + for _ in range(self.num_generate): + response, cost = self.generate_with_retry(message, ["SCORE"]) + response_list.append(response) + total_cost += cost + + return response_list, total_cost + +def parsing_score(response: str): + score = response.split("SCORE:")[-1].split("\n")[0].strip() + match = re.search(r'\d+', score) + + if match: + return int(match.group()) + else: + return None + +def average_score(scores: list[int]): + if len(scores) == 0: + return 0 + return sum(scores) / len(scores) + +def get_score(results: list[dict]): + score_list = [] + for result in results: + tmp_scores = [parsing_score(response) for response in result["response"]] + scores = [score for score in tmp_scores if score is not None] + result["score_list"] = scores + final_score = average_score(scores) + result["score"] = final_score + score_list.append(result) + + return results, score_list \ No newline at end of file diff --git a/agent/mini_bench/eval_utils.py b/agent/mini_bench/eval_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..21b211cd239a627b3f9800ab68ed6d5a6840c3e8 --- /dev/null +++ b/agent/mini_bench/eval_utils.py @@ -0,0 +1,309 @@ +import re +import random +from collections import Counter + +from .utils import load_json, save_json, create_html_report + +random.seed(42) +def 
get_score(response_list: list, indicator: str) -> int: + if len(response_list) == 0: + return [-100] + + if isinstance(response_list[0], float): + return response_list + + if indicator == "prob": + score_list = [] + for response in response_list: + total_score = 0 + for judge_probs in response: + yes_prob = judge_probs.get("yes", 0) + in_progress_prob = judge_probs.get("in", 0) + total_score += yes_prob + in_progress_prob * 0.5 + if len(response) > 0: + score_list.append(total_score / len(response)) + else: + score_list.append(0) + return score_list + else: + score_list = [] + for response in response_list: + if indicator == "SCORE": + if "SCORE" in response: + try: + score_str = response.split("SCORE:")[1].split("\n")[0].strip() + except: + score_str = response.split("SCORE:")[-1].strip() + # find first integer + try: + score = re.search(r'-?\d+', score_str).group() + score_list.append(int(score)) + except: + score_list.append(0) + else: + try: + score_str = response.split("")[1].split("")[0].strip() + except: + score_str = response.split("")[-1].split("")[0].strip() + # find "Yes" or "No" + if "Yes" in score_str: + score_list.append(1) + elif "In Progress" in score_str: + score_list.append(0.5) + elif "No" in score_str: + score_list.append(0) + else: + score_list.append(0) + elif indicator == "JUDGE": + try: + judge_str = response.split("JUDGE:")[1].split("\n")[0].strip() + except: + judge_str = response.split("JUDGE:")[-1].strip() + if "Yes" in judge_str: + score_list.append(1) + elif "No" in judge_str: + score_list.append(0) + else: + score_list.append(0) + elif indicator == "CHECKLIST EVALUATION": + if "" in response: + try: + checklist_str = response.split("")[1].split("")[0].strip() + except: + checklist_str = response.split("")[-1].split("")[0].strip() + else: + checklist_str = response.split("CHECKLIST EVALUATION:")[-1].strip() + + count_yes = checklist_str.count("Yes") + count_no = checklist_str.count("No") + count_in_progress = checklist_str.count("In Progress") + try: + total_score = (count_yes + count_in_progress*0.5) / (count_yes + count_no + count_in_progress) + except: + total_score = 0 + score_list.append(total_score) + else: + raise ValueError(f"Invalid indicator: {indicator}") + return score_list + +def get_acc_and_mrr(chosen_score, rejected_scores): + if len(rejected_scores) == 0: + return 0, False + + same_score_num = rejected_scores.count(chosen_score) + all_scores = rejected_scores + [chosen_score] + sorted_scores = sorted(all_scores, reverse=True) + rank = sorted_scores.index(chosen_score) + 1 + same_score_num # draw penalty + if all(chosen_score > r for r in rejected_scores): + accuracy = True + else: + accuracy = False + return 1 / rank, accuracy + +def average_score(score_list: list[float]): + if len(score_list) == 0: + return -100 + return sum(score_list) / len(score_list) + +def self_consistency_score(score_list: list[float]): + if len(score_list) == 0: + return -100 + counter = Counter(score_list) + return max(counter.values()) / len(score_list) + +def get_chosen_rejected_scores(data: dict, agg_func: str): + if len(data["chosen"]) == 0: + data["chosen"] = [{"score": [-100]}] + if len(data["rejected"]) == 0: + data["rejected"] = [{"score": [-100]}] + if not isinstance(data["chosen"][0], dict): + data["chosen"][0]["score"] = [-100] + if not isinstance(data["rejected"][0], dict): + data["rejected"][0]["score"] = [-100] + + if agg_func == "average": + chosen_score = average_score(data["chosen"][0]["score"]) + rejected_scores = 
[average_score(rejected_score["score"]) for rejected_score in data["rejected"]] + elif agg_func == "self_consistency": + chosen_score = self_consistency_score(data["chosen"][0]["score"]) + rejected_scores = [self_consistency_score(rejected_score["score"]) for rejected_score in data["rejected"]] + else: + raise ValueError(f"Invalid agg_func: {agg_func}") + return chosen_score, rejected_scores + +def get_score_results(results, agg_func): + score_dict = {"mrr": [], "accuracy": [], "traj_accuracy": []} + task_accuracy = {} + for result in results: + chosen_score, rejected_scores = get_chosen_rejected_scores(result, agg_func) + mrr, accuracy = get_acc_and_mrr(chosen_score, rejected_scores) + score_dict["mrr"].append(mrr) + score_dict["accuracy"].append(accuracy) + if result["task_id"] not in task_accuracy: + task_accuracy[result["task_id"]] = [] + task_accuracy[result["task_id"]].append(accuracy) + + for task_id in task_accuracy: + if sum(task_accuracy[task_id]) == len(task_accuracy[task_id]): + score_dict["traj_accuracy"].append(True) + else: + score_dict["traj_accuracy"].append(False) + + return score_dict + +def calculate_stats(results, agg_func: str="average"): + if len(results) == 0: + return { + "MRR": 0, + "Accuracy": 0, + "Traj_Accuracy": 0, + } + total_score = get_score_results(results, agg_func) + stats = { + "MRR": sum(total_score["mrr"]) / len(total_score["mrr"]), + "Accuracy": sum(total_score["accuracy"]) / len(total_score["accuracy"]), + "Traj_Accuracy": sum(total_score["traj_accuracy"]) / len(total_score["traj_accuracy"]), + } + + return stats + +def group_by_task(results, split_indicator: str): + # sort results by task_id and step_id + results.sort(key=lambda x: (x["task_id"], x["step_id"])) + # group by task_name + grouped_task_dict = {} + for result in results: + task_name = "task_" + str(result["task_id"]) + "_step_" + str(result["step_id"]) + if task_name not in grouped_task_dict: + grouped_task_dict[task_name] = { + "task_id": result["task_id"], + "step_id": result["step_id"], + "intent": result["intent"], + "start_url": result["start_url"], + "gt_checklist": result["gt_checklist"], + "generated_checklist": result.get("generated_checklist", None) , + "trajectory": result["trajectory"], + "current_url": result["current_url"], + "text_observation": result["text_observation"], + # "image_list": result["image_list"], + "chosen": [], + "rejected": [], + "source_name": result["source_name"], + } + + response = result["response"] if "response" in result else [] + type_data = { + "thought": result["thought"], + "action": result["action"], + "response": response, + "score": get_score(response, split_indicator) if split_indicator != "prob" else get_score(result["judge_probs"], split_indicator), + } + if split_indicator == "prob": + type_data["judge_probs"] = result["judge_probs"] + if result["type"] == "chosen": + grouped_task_dict[task_name]["chosen"].append(type_data) + elif result["type"] == "rejected": + grouped_task_dict[task_name]["rejected"].append(type_data) + + return list(grouped_task_dict.values()) + + +def processing_results(results, evaluation_mode: str, num_generate: int, use_batch: bool=False): + if "judge_probs" in results[0]: + split_indicator = "prob" + else: + if evaluation_mode == "judge_with_checklist_generation" or evaluation_mode == "judge_with_gt_checklist": + split_indicator = "CHECKLIST EVALUATION" + else: + split_indicator = "SCORE" + + # if use_batch is True, make it flattened + if use_batch: + tmp_results = [] + for result in results: + for d in result: + 
tmp_results.append(d) + grouped_results = group_by_task(tmp_results, split_indicator) + else: + grouped_results = group_by_task(results, split_indicator) + + mind2web_results = [] + webarena_results = [] + mind2web_task_results = [] + mind2web_website_results = [] + mind2web_domain_results = [] + + for grouped_result in grouped_results: + if "mind2web" in grouped_result["source_name"]: + mind2web_results.append(grouped_result) + if grouped_result["source_name"] == "mind2web_test_task": + mind2web_task_results.append(grouped_result) + elif grouped_result["source_name"] == "mind2web_test_website": + mind2web_website_results.append(grouped_result) + elif grouped_result["source_name"] == "mind2web_test_domain": + mind2web_domain_results.append(grouped_result) + elif "webarena" in grouped_result["source_name"]: + webarena_results.append(grouped_result) + + try: + final_stats = { + "mind2web": { + "MRR": {}, + "Accuracy": {}, + "Traj_Accuracy": {}, + }, + "webarena": { + "MRR": {}, + "Accuracy": {}, + "Traj_Accuracy": {}, + }, + "mind2web_task": { + "MRR": {}, + "Accuracy": {}, + "Traj_Accuracy": {}, + }, + "mind2web_website": { + "MRR": {}, + "Accuracy": {}, + "Traj_Accuracy": {}, + }, + "mind2web_domain": { + "MRR": {}, + "Accuracy": {}, + "Traj_Accuracy": {}, + }, + } + for source_results in [ + ("mind2web", mind2web_results), + ("webarena", webarena_results), + ("mind2web_task", mind2web_task_results), + ("mind2web_website", mind2web_website_results), + ("mind2web_domain", mind2web_domain_results) + ]: + average_stats = calculate_stats(source_results[1], "average") + self_consistency_stats = calculate_stats(source_results[1], "self_consistency") + for metric in average_stats: + final_stats[source_results[0]][metric]["Average"] = average_stats[metric] + for metric in self_consistency_stats: + final_stats[source_results[0]][metric]["Self_Consistency"] = self_consistency_stats[metric] + + if num_generate == 1: + for source_name in final_stats: + for metric in final_stats[source_name]: + print(f"{round(100 * final_stats[source_name][metric]['Average'], 2)}", end=", ") + print() + else: + for agg_func in ["Average", "Self_Consistency"]: + print(f"{agg_func}") + for source_name in final_stats: + for metric in final_stats[source_name]: + print(f"{round(100 * final_stats[source_name][metric][agg_func], 2)}", end=", ") + print() + except Exception as e: + print(e) + return grouped_results, None + + # add function to convert json format results to html format results + # TODO: implement this function + # create_html_report(results, "results.html") + return grouped_results, final_stats \ No newline at end of file diff --git a/agent/mini_bench/inference_utils.py b/agent/mini_bench/inference_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..947d35c13c2245fdbba067272a70f52b72328ec8 --- /dev/null +++ b/agent/mini_bench/inference_utils.py @@ -0,0 +1,87 @@ +import time + +from multiprocessing import Process, Manager +from tqdm import tqdm + + +def worker_main(work_queue, result_queue, process_func, config): + while True: + item = work_queue.get() + if item is None: + result_queue.put(None) + break + try: + results, cost = process_func(config, item) + result_queue.put((results, cost)) + except Exception as e: + item_info = item.get('idx', item.get('id', 'unknown item')) + print(f"Error processing item {item_info}: {e}") + result_queue.put(None) + finally: + work_queue.task_done() + +def run_parallel_evaluation(dataset, process_func, config, num_workers, description): + """ + Runs 
parallel evaluation on the given dataset and returns the results. + + Args: + dataset (list or datasets.Dataset): Data to evaluate. + process_func (callable): Function to process each data item. + config (dict): Configuration for the process_func. + num_workers (int): Number of worker processes to use. + description (str): Description to display on the tqdm progress bar. + + Returns: + tuple: (list of evaluation results, total cost) + """ + manager = Manager() + work_queue = manager.Queue() + result_queue = manager.Queue() + + # Add data to the work queue + dataset_list = list(dataset) if not isinstance(dataset, list) else dataset + for data in dataset_list: + work_queue.put(data) + + # Add termination signals for workers + for _ in range(num_workers): + work_queue.put(None) + + # Start parallel processing + processes = [] + for _ in range(num_workers): + p = Process(target=worker_main, args=(work_queue, result_queue, process_func, config)) + p.start() + processes.append(p) + + # Show progress bar and collect results + process_results = [] + process_cost = 0 + completed_workers = 0 + + with tqdm(total=len(dataset_list), desc=description) as pbar: + while completed_workers < num_workers: + result_item = result_queue.get() + if result_item is None: + completed_workers += 1 + else: + results, cost = result_item + if results is not None: + process_results.append(results) + process_cost += cost if cost is not None else 0 + pbar.update(1) + + # Wait for all processes to finish + for p in processes: + p.join() + + # Collect remaining results + while not result_queue.empty(): + result_item = result_queue.get_nowait() + if result_item is not None: + results, cost = result_item + if results is not None: + process_results.append(results) + process_cost += cost if cost is not None else 0 + + return process_results, process_cost \ No newline at end of file diff --git a/agent/mini_bench/prompts/__init__.py b/agent/mini_bench/prompts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b4cbd42a34e971cd6b890e59d67d445b3e1f8030 --- /dev/null +++ b/agent/mini_bench/prompts/__init__.py @@ -0,0 +1 @@ +from .construct_messages import get_messages \ No newline at end of file diff --git a/agent/mini_bench/prompts/__pycache__/__init__.cpython-311.pyc b/agent/mini_bench/prompts/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9be9bcaf9b2b7378f6d7e5e143c51b6dc2ae05ba Binary files /dev/null and b/agent/mini_bench/prompts/__pycache__/__init__.cpython-311.pyc differ diff --git a/agent/mini_bench/prompts/__pycache__/action.cpython-311.pyc b/agent/mini_bench/prompts/__pycache__/action.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc6e4d6f24370bdb633435da39ab875bbd88ad0d Binary files /dev/null and b/agent/mini_bench/prompts/__pycache__/action.cpython-311.pyc differ diff --git a/agent/mini_bench/prompts/__pycache__/checklist_prompt.cpython-311.pyc b/agent/mini_bench/prompts/__pycache__/checklist_prompt.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a520e7063b5b181eafad979001e8072e9a3e2cc Binary files /dev/null and b/agent/mini_bench/prompts/__pycache__/checklist_prompt.cpython-311.pyc differ diff --git a/agent/mini_bench/prompts/__pycache__/construct_messages.cpython-311.pyc b/agent/mini_bench/prompts/__pycache__/construct_messages.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3a5955988073f5bbcab7b7ec9b148db58448f01 Binary 
files /dev/null and b/agent/mini_bench/prompts/__pycache__/construct_messages.cpython-311.pyc differ diff --git a/agent/mini_bench/prompts/__pycache__/eval_type.cpython-311.pyc b/agent/mini_bench/prompts/__pycache__/eval_type.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54bab0e0898c44c89d59c5ae2c91058c8877a2ae Binary files /dev/null and b/agent/mini_bench/prompts/__pycache__/eval_type.cpython-311.pyc differ diff --git a/agent/mini_bench/prompts/__pycache__/image_utils.cpython-311.pyc b/agent/mini_bench/prompts/__pycache__/image_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e08ee689eb62a5f4a2af37e6334980b8d437e58c Binary files /dev/null and b/agent/mini_bench/prompts/__pycache__/image_utils.cpython-311.pyc differ diff --git a/agent/mini_bench/prompts/__pycache__/input_information.cpython-311.pyc b/agent/mini_bench/prompts/__pycache__/input_information.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc2ba33c854ee1f72c1b368b8d8dc4d091f8e78b Binary files /dev/null and b/agent/mini_bench/prompts/__pycache__/input_information.cpython-311.pyc differ diff --git a/agent/mini_bench/prompts/__pycache__/judge_prompt.cpython-311.pyc b/agent/mini_bench/prompts/__pycache__/judge_prompt.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dc6b3ab5f34cde0cdc1ae323c7de13237f7c2fb Binary files /dev/null and b/agent/mini_bench/prompts/__pycache__/judge_prompt.cpython-311.pyc differ diff --git a/agent/mini_bench/prompts/__pycache__/utils.cpython-311.pyc b/agent/mini_bench/prompts/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31f684751392709b04668c6c86eae85afe18d1e9 Binary files /dev/null and b/agent/mini_bench/prompts/__pycache__/utils.cpython-311.pyc differ diff --git a/agent/mini_bench/prompts/action.py b/agent/mini_bench/prompts/action.py new file mode 100644 index 0000000000000000000000000000000000000000..66ed24a7d2d36bd0cdea4088d5d60692543afc40 --- /dev/null +++ b/agent/mini_bench/prompts/action.py @@ -0,0 +1,93 @@ +ACTION_SPACE_PROMPT = """Note: This action set allows you to interact with your environment. Most of them are python function executing playwright code. The primary way of referring to elements in the page is through bid which are specified in your observations. + +15 different types of actions are available. 
+ +noop(wait_ms: float = 1000) + Examples: + noop() + + noop(500) + +scroll(delta_x: float, delta_y: float) + Examples: + scroll(0, 200) + + scroll(-50.2, -100.5) + +keyboard_press(key: str) + Examples: + keyboard_press('Backspace') + + keyboard_press('ControlOrMeta+a') + + keyboard_press('Meta+Shift+t') + +click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = []) + Examples: + click('a51') + + click('b22', button='right') + + click('48', button='middle', modifiers=['Shift']) + +fill(bid: str, value: str) + Examples: + fill('237', 'example value') + + fill('45', 'multi-line\nexample') + + fill('a12', 'example with "quotes"') + +hover(bid: str) + Examples: + hover('b8') + +tab_focus(index: int) + Examples: + tab_focus(2) + +new_tab() + Examples: + new_tab() + +go_back() + Examples: + go_back() + +go_forward() + Examples: + go_forward() + +goto(url: str) + Examples: + goto('http://www.example.com') + +tab_close() + Examples: + tab_close() + +select_option(bid: str, options: str | list[str]) + Examples: + select_option('a48', 'blue') + + select_option('c48', ['red', 'green', 'blue']) + +send_msg_to_user(text: str) + Examples: + send_msg_to_user('Based on the results of my search, the city was built in 1751.') + +report_infeasible(reason: str) + Examples: + report_infeasible('I cannot follow these instructions because there is no email field in this form.') + +Only a single action can be provided at once. Example: +fill('a12', 'example with "quotes"') + +Note: +* Some tasks may be game like and may require to interact with the mouse position in x, y coordinates. +* Some text field might have auto completion. To see it, you have to type a few characters and wait until next step. +* If you have to cut and paste, don't forget to select the text first. +* Coordinate inside an SVG are relative to it's top left corner. +* Make sure to use bid to identify elements when using commands. +* Interacting with combobox, dropdowns and auto-complete fields can be tricky, sometimes you need to use select_option, while other times you need to use fill or click and wait for the reaction of the page. +""" \ No newline at end of file diff --git a/agent/mini_bench/prompts/checklist_prompt.py b/agent/mini_bench/prompts/checklist_prompt.py new file mode 100644 index 0000000000000000000000000000000000000000..ab78eaafa2b857a00232036bcb42ed47381e2a84 --- /dev/null +++ b/agent/mini_bench/prompts/checklist_prompt.py @@ -0,0 +1,50 @@ +CHECKLIST_SYSTEM_PROMPT = "You are an AI assistant tasked with generating structured checklists that highlight key subgoals necessary to complete a task." + +CHECKLIST_USER_PROMPT = """## Task Description +User Instruction (Goal): "{intent}" +Start Website URL: {start_url} + +## Guidelines for Checklist Generation +1. Identify Essential High-Level Subgoals: +- A subgoal should represent a significant step involving user interaction that leads to noticeable page transitions or meaningful changes in system state. +- Consolidate closely related user actions (such as applying multiple filters or selecting several options) into a single subgoal, rather than separate checklist items for each action. +- Prioritize only the most critical interactions necessary for meaningful progression, avoiding the inclusion of minor or unnecessary steps (e.g., scroll, hover). +2. 
Provide a Concise Subgoal Analysis: +- Before creating the checklist, offer a brief paragraph summarizing the main subgoals, emphasizing significant transitions or page-level interactions. +3. Ensure Clear Goal: +- If multiple related interactions occur (e.g., setting filters 1, 2, and 3), combine them into one subgoal with clear criteria verifying all required conditions. +- The checklist should contain only essential steps, explicitly excluding unnecessary actions, and should not exceed five critical subgoals. It is not necessary to use all five checklist items if fewer steps adequately represent the essential subgoals. + +### Output Format +Before generating the checklist, first produce a concise subgoal analysis in a single paragraph summarizing the required interactions. Then, based on this, generate the checklist following the format below: +[SUBGOAL ANALYSIS] +[One-paragraph summary explaining the key subgoals and their logical sequence in task completion.] + +[CHECKLISTS] +Checklist X: [Short title of the action/goal] +- Goal: [Brief description of the subgoal at this stage, emphasizing the purpose of the action.] +""" + +# TODO: implement ours +CHECKLIST_OURS_SYSTEM_PROMPT = "" + +CHECKLIST_OURS_USER_PROMPT = """You are an AI assistant tasked with generating structured checklists that highlight key subgoals necessary to complete a task. + +# Task Description +Generate a checklist of key milestones for achieving the given instruction. First, provide a concise +subgoal analysis in a single paragraph summarizing the required interactions. Then, based on this, generate the checklist with a brief description. + +Note: If the target website requires login, assume the user is already logged in and starts from an authenticated session. + +# Given Information +## User Instruction +{intent} + +## Current State +### Current URL +{start_url} + +### AXTREE +Note: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions. 
+{text_observation} +""" \ No newline at end of file diff --git a/agent/mini_bench/prompts/construct_messages.py b/agent/mini_bench/prompts/construct_messages.py new file mode 100644 index 0000000000000000000000000000000000000000..b9203e2a6aa3b5db48b7ebc610c9f1123a0473c9 --- /dev/null +++ b/agent/mini_bench/prompts/construct_messages.py @@ -0,0 +1,309 @@ +from abc import ABC, abstractmethod + +from .action import ACTION_SPACE_PROMPT +from .eval_type import ( + GROUNDING, + PROGRESS_LIKERT_SCALE, + PROGRESS_THREE_CLASS, + PROGRESS_WITH_CHECKLIST, + PROGRESS_WITH_CHECKLIST_IN_PROGRESS, + PROGRESS_OURS +) +from .input_information import ( + USER_INSTRUCTION, + TRAJECTORY, + AGENT_RESPONSE, + CHECKLIST, + CURRENT_URL, + TEXT_OBSERVATION, + SOM_IMAGE_OBSERVATION, + COORD_IMAGE_OBSERVATION +) +from .judge_prompt import ( + JUDGE_GROUNDING_PROMPT_TEMPLATE, + JUDGE_LIKERT_SCALE_PROMPT_TEMPLATE, + JUDGE_THREE_CLASS_PROMPT_TEMPLATE, + JUDGE_WITH_CHECKLIST_PROMPT_TEMPLATE, + JUDGE_OURS_PROMPT_TEMPLATE, + JUDGE_OURS_WO_CHECKLIST_PROMPT_TEMPLATE +) +from .checklist_prompt import ( + CHECKLIST_SYSTEM_PROMPT, + CHECKLIST_USER_PROMPT, + CHECKLIST_OURS_SYSTEM_PROMPT, + CHECKLIST_OURS_USER_PROMPT +) +from .image_utils import image_to_base64_url + + +class Message(ABC): + @abstractmethod + def get_messages(self): + pass + +class BaseMessage(Message): + def __init__(self, input_info:dict, use_multimodal:bool=False): + self.input_info = input_info + self.use_multimodal = use_multimodal + + def _get_system_message(self): + system_message = {"role": "system", "content": "You are a helpful assistant."} + return system_message + + def _process_multimodal_message(self, prompt: str, image_list: list[str]): + multimodal_message = [] + text_prompt_prefix = prompt.split("")[0] + text_prompt_suffix = prompt.split("")[1] + multimodal_message.append({"type": "text", "text": text_prompt_prefix}) + for i, image in enumerate(image_list): + # TODO: text prompt for multiple images + # multimodal_message.append({"type": "text", "text": f"IMAGE {i+1}\n"}) + multimodal_message.append({"type": "image_url", "image_url": {"url": image_to_base64_url(image), "detail": "low"}}) + multimodal_message.append({"type": "text", "text": text_prompt_suffix}) + return {"role": "user", "content": multimodal_message} + + def _get_user_message(self): + user_prompt = "What is the capital of France?" 
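+        # NOTE: placeholder question used only by this BaseMessage fallback; the concrete subclasses (ProgressMessage, GroundingMessage, ChecklistMessage) override _get_user_message() with the actual judge / checklist prompts.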
+ if self.use_multimodal: + image_list = self.input_info.get("image_list", []) + user_message = self._process_multimodal_message(user_prompt, image_list) + else: + user_message = {"role": "user", "content": user_prompt} + return user_message + + def get_messages(self): + message = [] + system_message = self._get_system_message() + user_message = self._get_user_message() + + message.append(system_message) + # message.append({"role": "system", "content": ""}) + message.append(user_message) + return message + + +class ProgressMessage(BaseMessage): + ''' + Progress Judge Message + ''' + def __init__(self, input_info:dict, use_multimodal:bool, prompt_type:str, text_obs: str, image_obs: str, use_checklist:bool, use_in_progress:bool): + super().__init__(input_info, use_multimodal) + self.prompt_type = prompt_type + self.text_obs = text_obs + self.image_obs = image_obs + self.use_checklist = use_checklist + self.use_in_progress = use_in_progress + + def _get_system_message(self): + if self.prompt_type == "likert_scale": + system_message = {"role": "system", "content": JUDGE_LIKERT_SCALE_PROMPT_TEMPLATE["system"]} + elif self.prompt_type == "three_class": + system_message = {"role": "system", "content": JUDGE_THREE_CLASS_PROMPT_TEMPLATE["system"]} + elif self.prompt_type == "with_checklist": + system_message = {"role": "system", "content": JUDGE_WITH_CHECKLIST_PROMPT_TEMPLATE["system"]} + elif self.prompt_type == "ours": + system_message = {"role": "system", "content": JUDGE_OURS_PROMPT_TEMPLATE["system"]} + else: + raise ValueError(f"Invalid prompt type: {self.prompt_type}") + return system_message + + def _setup_input_information(self): + observation = "## Current State\n" + + observation += CURRENT_URL + + # text observation + if self.text_obs: + observation += TEXT_OBSERVATION + + # image observation (som, coord, none) + if self.image_obs == "som": + observation += SOM_IMAGE_OBSERVATION + elif self.image_obs == "coord": + observation += COORD_IMAGE_OBSERVATION + + + if self.use_checklist: + input_information = USER_INSTRUCTION + TRAJECTORY + observation + CHECKLIST + AGENT_RESPONSE + else: + input_information = USER_INSTRUCTION + TRAJECTORY + observation + AGENT_RESPONSE + + return input_information + + def _setup_task_info(self): + if self.prompt_type == "likert_scale": + task_description = PROGRESS_LIKERT_SCALE["task_description"] + output_format = PROGRESS_LIKERT_SCALE["output_format"] + elif self.prompt_type == "three_class": + task_description = PROGRESS_THREE_CLASS["task_description"] + output_format = PROGRESS_THREE_CLASS["output_format"] + elif self.prompt_type == "with_checklist": + if self.use_in_progress: + task_description = PROGRESS_WITH_CHECKLIST_IN_PROGRESS["task_description"] + output_format = PROGRESS_WITH_CHECKLIST_IN_PROGRESS["output_format"] + else: + task_description = PROGRESS_WITH_CHECKLIST["task_description"] + output_format = PROGRESS_WITH_CHECKLIST["output_format"] + else: + raise ValueError(f"Invalid prompt type: {self.prompt_type}") + return task_description, output_format + + def _get_user_prompt_template(self): + if self.prompt_type == "likert_scale": + user_prompt = JUDGE_LIKERT_SCALE_PROMPT_TEMPLATE["user"] + elif self.prompt_type == "three_class": + user_prompt = JUDGE_THREE_CLASS_PROMPT_TEMPLATE["user"] + elif self.prompt_type == "with_checklist": + user_prompt = JUDGE_WITH_CHECKLIST_PROMPT_TEMPLATE["user"] + else: + raise ValueError(f"Invalid prompt type: {self.prompt_type}") + return user_prompt + + def _get_user_message(self): + # setup input information 
(user_instruction, trajectory, current_state, agent_response, checklist) + input_information_template = self._setup_input_information() + input_information = input_information_template.format(**self.input_info) + + if self.prompt_type == "ours": + if self.use_checklist: + user_prompt = JUDGE_OURS_PROMPT_TEMPLATE["user"].format( + input_information=input_information, + ) + else: + user_prompt = JUDGE_OURS_WO_CHECKLIST_PROMPT_TEMPLATE["user"].format( + input_information=input_information, + ) + else: + task_description, output_format = self._setup_task_info() + # get user prompt template by prompt type + user_prompt_template = self._get_user_prompt_template() + user_prompt = user_prompt_template.format( + action_space=ACTION_SPACE_PROMPT, + task_description=task_description, + input_information=input_information, + output_format=output_format + ) + + # process multimodal message + if self.use_multimodal: + image_list = self.input_info.get("image_list", []) + user_message = self._process_multimodal_message(user_prompt, image_list) + else: + user_message = {"role": "user", "content": user_prompt} + + return user_message + + +class GroundingMessage(BaseMessage): + ''' + Grounding Judge Message + ''' + def __init__(self, input_info:dict, use_multimodal:bool, prompt_type:str, text_obs: str, image_obs: str): + super().__init__(input_info, use_multimodal) + self.prompt_type = prompt_type + self.text_obs = text_obs + self.image_obs = image_obs + + def _get_system_message(self): + if self.prompt_type == "ours": + # TODO: implement ours + system_message = {"role": "system", "content": "You are a helpful assistant."} + elif self.prompt_type == "default": + system_message = {"role": "system", "content": JUDGE_GROUNDING_PROMPT_TEMPLATE["system"]} + else: + raise ValueError(f"Invalid prompt type: {self.prompt_type}") + return system_message + + def _setup_input_information(self): + observation = "## Current State\n" + + observation += CURRENT_URL + + # text observation + if self.text_obs: + observation += TEXT_OBSERVATION + + # image observation (som, coord, none) + if self.image_obs == "som": + observation += SOM_IMAGE_OBSERVATION + elif self.image_obs == "coord": + observation += COORD_IMAGE_OBSERVATION + + # input_information = USER_INSTRUCTION + TRAJECTORY + observation + AGENT_RESPONSE # with trajectory + input_information = USER_INSTRUCTION + observation + AGENT_RESPONSE # without trajectory + + return input_information + + def _get_user_message(self): + if self.prompt_type == "ours": + # TODO: implement ours + user_message = {"role": "user", "content": "TODO"} + elif self.prompt_type == "default": + action_space = ACTION_SPACE_PROMPT + task_description = GROUNDING["task_description"] + output_format = GROUNDING["output_format"] + input_information_template = self._setup_input_information() + input_information = input_information_template.format(**self.input_info) + + user_prompt = JUDGE_GROUNDING_PROMPT_TEMPLATE["user"].format( + action_space=action_space, + task_description=task_description, + input_information=input_information, + output_format=output_format + ) + + # process multimodal message + if self.use_multimodal: + image_list = self.input_info.get("image_list", []) + user_message = self._process_multimodal_message(user_prompt, image_list) + else: + user_message = {"role": "user", "content": user_prompt} + else: + raise ValueError(f"Invalid prompt type: {self.prompt_type}") + return user_message + + +class ChecklistMessage(BaseMessage): + ''' + Checklist Message + ''' + def __init__(self, 
input_info:dict, use_multimodal:bool, prompt_type:str): + super().__init__(input_info, use_multimodal) + self.prompt_type = prompt_type + + def _get_system_message(self): + if self.prompt_type == "ours": + # TODO: implement ours + system_message = {"role": "system", "content": CHECKLIST_OURS_SYSTEM_PROMPT} + elif self.prompt_type == "default": + system_message = {"role": "system", "content": CHECKLIST_SYSTEM_PROMPT} + else: + raise ValueError(f"Invalid prompt type: {self.prompt_type}") + return system_message + + def _get_user_message(self): + if self.prompt_type == "ours": + user_message = {"role": "user", "content": CHECKLIST_OURS_USER_PROMPT.format(**self.input_info)} + elif self.prompt_type == "default": + user_message = {"role": "user", "content": CHECKLIST_USER_PROMPT.format(**self.input_info)} + else: + raise ValueError(f"Invalid prompt type: {self.prompt_type}") + return user_message + + +def get_messages(input_info:dict, inference_mode:str, prompt_type:str, text_obs:str=None, image_obs:str=None, use_multimodal:bool=False, use_checklist:bool=False, use_in_progress:bool=False): + message_list = [] + if inference_mode == "judge_grounding": + message = GroundingMessage(input_info, use_multimodal=use_multimodal, prompt_type=prompt_type, text_obs=text_obs, image_obs=image_obs) + elif inference_mode == "judge_progress": + message = ProgressMessage(input_info, use_multimodal=use_multimodal, prompt_type=prompt_type, text_obs=text_obs, image_obs=image_obs, use_checklist=use_checklist, use_in_progress=use_in_progress) + elif inference_mode == "checklist_generation": + message = ChecklistMessage(input_info, use_multimodal=False, prompt_type=prompt_type) + else: + raise ValueError(f"Invalid inference mode: {inference_mode}") + + system_message, user_message = message.get_messages() + + message_list.append(system_message) + message_list.append(user_message) + return message_list \ No newline at end of file diff --git a/agent/mini_bench/prompts/eval_type.py b/agent/mini_bench/prompts/eval_type.py new file mode 100644 index 0000000000000000000000000000000000000000..9d3fd156b2536b02a84bedc35212a23fdc6c2af2 --- /dev/null +++ b/agent/mini_bench/prompts/eval_type.py @@ -0,0 +1,107 @@ +# Task Description & Output Format +GROUNDING_TASK = """Your task is to evaluate whether the agent's ACTION is properly grounded in its THOUGHT, considering the current state of the webpage. +Use the user instruction, the current webpage state, and the agent's thought and action as evidence for your judgment. Your evaluation should assess whether the ACTION logically follows from the THOUGHT and is feasible and appropriate in the given environment. +Mark the action as 'Yes' only if it is clearly and fully grounded in the thought and current webpage state. If there is any inconsistency, ambiguity, irrelevance, or if the action is not supported by the current page state, mark it as 'No'.""" + +GROUNDING_FORMAT = """Please return your response in the following format: +REASON: [Your explanation for whether the action is properly grounded] +JUDGE: [Yes / No]""" + + +PROGRESS_LIKERT_SCALE_TASK = """Evaluate how helpful the given thought and action is for achieving the goal. Use the following scale: +**Scoring Criteria (1 to 5):** +- **5 (Very Helpful)**: The action directly and effectively moves toward fulfilling a key part of the goal. +- **4 (Helpful)**: The action contributes meaningfully to progress, though it may require follow-up actions. 
+- **3 (Somewhat Helpful)**: The action is partially relevant or a preparatory step, but doesn’t make immediate progress. +- **2 (Slightly Helpful)**: The action is weakly related to the goal or might only indirectly help. +- **1 (Not Helpful)**: The action is unrelated, redundant, or distracts from the goal.""" + +PROGRESS_LIKERT_SCALE_FORMAT = """Please return your response in the following format: +REASON: [Your explanation for the score] +SCORE: [1-5]""" + + +PROGRESS_THREE_CLASS_TASK = """Evaluate how helpful the given thought and action is for achieving the goal. Use the following scale: +**Scoring Criteria:** +- **1 (Helpful)**: The action clearly contributes to achieving the goal. It takes a necessary or productive step toward completing the task. +- **0 (Neutral)**: The action is neither helpful nor harmful. It may be a placeholder, irrelevant at the current step, or too ambiguous to evaluate. +- **-1 (Not Helpful)**: The action works against the goal, causes confusion, repeats a previous step unnecessarily, or leads the agent off track.""" + +PROGRESS_THREE_CLASS_FORMAT = """Please return your response in the following format: +REASON: [Your explanation for the score] +SCORE: [-1 / 0 / 1]""" + + +PROGRESS_WITH_CHECKLIST_TASK = """Your task is to evaluate how well the agent's THOUGHT and ACTION satisfy each item in the checklist. +Use the task instruction, trajectory (including previously completed steps from history), current webpage state, and the agent's current response as evidence for your evaluation. +For each checklist item: +- Mark it as 'Yes' if it is clearly and fully satisfied either in the current response or already completed in the history. +- Mark it as 'No' if there is ambiguity, insufficient evidence, or the step is incomplete or not yet started.""" + +PROGRESS_WITH_CHECKLIST_FORMAT = """Please return your response in the following format: +REASON: [Write a single, coherent paragraph explaining how well the agent's response satisfies the checklist overall. Use both the history and the agent's current thought/action as evidence. Mention specific strengths or missing elements that influence your decision.] +CHECKLIST EVALUATION: +Checklist X: [Yes / No] +""" + +PROGRESS_WITH_CHECKLIST_IN_PROGRESS_TASK = """Your task is to evaluate how well the agent's THOUGHT and ACTION satisfy each item in the checklist. +Use the task instruction, trajectory (including previously completed steps from history), current webpage state, and the agent's current response as evidence for your evaluation. Clearly consider any items already successfully completed or currently in progress according to the provided trajectory. +For each checklist item: +- Mark it as 'Yes' if it is clearly and fully satisfied either in the current response or already completed in the history. +- Mark it as 'In Progress' if the agent has made partial but meaningful progress toward completing the item. +- Mark it as 'No' if there is ambiguity, insufficient evidence, or the step is incomplete or not yet started.""" + +PROGRESS_WITH_CHECKLIST_IN_PROGRESS_FORMAT = """Please return your response in the following format: +REASON: [Write a single, coherent paragraph explaining how well the agent's response satisfies the checklist overall. Use both the history and the agent's current thought/action as evidence. Mention specific strengths or missing elements that influence your decision.] 
+CHECKLIST EVALUATION: +Checklist X: [Yes / In Progress / No] +""" + + +GROUNDING_OURS_TASK = """ +""" + +GROUNDING_OURS_FORMAT = """ +""" + +PROGRESS_OURS_TASK = """ +""" + +PROGRESS_OURS_FORMAT = """ +""" + +## EVALUATION TYPE +GROUNDING = { + "task_description": GROUNDING_TASK, + "output_format": GROUNDING_FORMAT, +} + +GROUNDING_OURS = { + "task_description": GROUNDING_OURS_TASK, + "output_format": GROUNDING_OURS_FORMAT, +} + +PROGRESS_LIKERT_SCALE = { + "task_description": PROGRESS_LIKERT_SCALE_TASK, + "output_format": PROGRESS_LIKERT_SCALE_FORMAT, +} + +PROGRESS_THREE_CLASS = { + "task_description": PROGRESS_THREE_CLASS_TASK, + "output_format": PROGRESS_THREE_CLASS_FORMAT, +} + +PROGRESS_WITH_CHECKLIST = { + "task_description": PROGRESS_WITH_CHECKLIST_TASK, + "output_format": PROGRESS_WITH_CHECKLIST_FORMAT, +} + +PROGRESS_WITH_CHECKLIST_IN_PROGRESS = { + "task_description": PROGRESS_WITH_CHECKLIST_IN_PROGRESS_TASK, + "output_format": PROGRESS_WITH_CHECKLIST_IN_PROGRESS_FORMAT, +} + +PROGRESS_OURS = { + "task_description": PROGRESS_OURS_TASK, + "output_format": PROGRESS_OURS_FORMAT, +} \ No newline at end of file diff --git a/agent/mini_bench/prompts/image_utils.py b/agent/mini_bench/prompts/image_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4e3f487bdd2831541870ce5990366ab09270dd9d --- /dev/null +++ b/agent/mini_bench/prompts/image_utils.py @@ -0,0 +1,19 @@ +import base64 +import io +from PIL import Image + + +def image_to_base64_url(image: str | Image.Image): + if isinstance(image, str): + with open(image, "rb") as f: + image = f.read() + elif isinstance(image, Image.Image): + if image.mode in ("RGBA", "LA"): + image = image.convert("RGB") + with io.BytesIO() as buffer: + image.save(buffer, format="PNG") + image = buffer.getvalue() + else: + raise ValueError(f"Invalid image type: {type(image)}") + + return "data:image/png;base64," + base64.b64encode(image).decode("utf-8") \ No newline at end of file diff --git a/agent/mini_bench/prompts/input_information.py b/agent/mini_bench/prompts/input_information.py new file mode 100644 index 0000000000000000000000000000000000000000..ce90a7d6fd976b3244beb2a13e6800a42476ef7f --- /dev/null +++ b/agent/mini_bench/prompts/input_information.py @@ -0,0 +1,36 @@ +USER_INSTRUCTION = """## User Instruction +{intent} +""" + +TRAJECTORY = """## Trajectory +{trajectory}""" + +AGENT_RESPONSE = """## Agent's Response +THOUGHT: {thought} +ACTION: {action} +""" + +CHECKLIST = """## Checklist +{checklist} +""" + + +# Observation +CURRENT_URL = """### Current URL +{current_url} +""" + +TEXT_OBSERVATION = """### AXTREE +Note: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions. +{text_observation} +""" + +SOM_IMAGE_OBSERVATION = """### SOM Image Screenshot +Here is a current image screenshot of the page, it is annotated with bounding boxes and corresponding bids: + +""" + +COORD_IMAGE_OBSERVATION = """### Raw Image Screenshot +Here is a screenshot of the page: + +""" diff --git a/agent/mini_bench/prompts/judge_prompt.py b/agent/mini_bench/prompts/judge_prompt.py new file mode 100644 index 0000000000000000000000000000000000000000..78140cfe9606c6005f949ae560bff64e59c212c4 --- /dev/null +++ b/agent/mini_bench/prompts/judge_prompt.py @@ -0,0 +1,159 @@ +# SYSTEM PROMPT +DEFAULT_SYSTEM_PROMPT_FORMAT = "You are an expert evaluator of web agent. 
{role_description}" + +PROGRESS_WITHOUT_CHECKLIST_ROLE = "Your task is to assess how helpful a given agent's THOUGHT and ACTION is in making progress toward the user's goal, based on the current state of the webpage." +PROGRESS_WITH_CHECKLIST_ROLE = "Your task is to assess how helpful a given agent's THOUGHT and ACTION is in making progress toward the user's goal, based on the current state of the webpage." + +GROUNDING_ROLE = "Your task is to assess whether the ACTION taken by the agent is properly grounded, based on agent's THOUGHT and the current state of the webpage." + +# USER PROMPT +DEFAULT_USER_PROMPT_FORMAT = """# Action space: +{action_space} + +# Task Description +{task_description} + +# Given Information +{input_information} + +# Output Format +{output_format} +""" + + +JUDGE_OURS_WO_CHECKLIST_USER_PROMPT_FORMAT = """You are an expert evaluator of web agent. Your task is to assess how helpful a given agent's THOUGHT and ACTION is in making progress toward the user's goal, based on the current state of the webpage. + +# Task Description +Evaluate how well the agent’s THOUGHT and ACTION satisfy each item in the checklist using the task instruction, trajectory (including previously completed steps), current webpage state, and the agent’s latest response. Start by writing a concise paragraph summarizing the agent’s overall performance. Refer to the reasoning provided in the trajectory, and discuss whether the THOUGHT is appropriate and the ACTION moves the task forward. + +# Given Information +{input_information} +""" + + +JUDGE_OURS_USER_PROMPT_FORMAT = """You are an expert evaluator of web agent. Your task is to assess how helpful a given agent's THOUGHT and ACTION is in making progress toward the user's goal, based on the current state of the webpage. + +# Task Description +Evaluate how well the agent’s THOUGHT and ACTION satisfy each item in the checklist using the task instruction, trajectory (including previously completed steps), current webpage state, and the agent’s latest response. Start by writing a concise paragraph summarizing the agent’s overall performance. Refer to the reasoning provided in the trajectory, and discuss whether the THOUGHT is appropriate and the ACTION moves the task forward. +Then, assess each checklist item individually using the following labels: +- Yes: The item is fully and clearly satisfied, either in the current response or previously completed. +- In Progress: There is meaningful partial progress toward completing the item. +- No: The item is not satisfied due to ambiguity, insufficient evidence, or lack of progress. + +# Given Information +{input_information} +""" + + +JUDGE_OURS_BT_MODELING_USER_PROMPT_FORMAT = """You are an expert web agent that browses internet via GUI actions. Your task is to achieve the user's goal described in the user instruction. + +# Task Description +Generate the most appropriate GUI action to achieve the user's goal. When choosing your action, consider the current webpage state and the checklist which can be interpreted as subtasks. + +# Given Information +## User Instruction +{intent} + +## Trajectory +{trajectory} + +## Current State +### Current URL +{current_url} + +### AXTREE +Note: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions. 
+{text_observation} + +## Checklist +{checklist} + +## Agent's Response +""" + +JUDGE_OURS_BT_MODELING_BASE_PROMPT = """You are an expert web agent that browses internet via GUI actions. Your task is to achieve the user's goal described in the user instruction. + +# Task Description +Generate the most appropriate GUI action to achieve the user's goal. When choosing your action, consider the current webpage state and the checklist which can be interpreted as subtasks. + +# Given Information +## User Instruction +{intent} + +## Trajectory +{trajectory} + +## Current State +### Current URL +{current_url} + +### AXTREE +Note: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions. +{text_observation} +""" + +JUDGE_OURS_IMAGE_INPUT = """ +### Image Screenshot + +""" + +JUDGE_OURS_WITH_CHECKLIST = """ +## Checklist +{checklist} +""" + +BT_MODELING_RESPONSE_FORMAT = """ +THOUGHT: {thought} +ACTION: {action} +""" + +## PROMPT TEMPLATE +JUDGE_GROUNDING_PROMPT_TEMPLATE = { + "system": DEFAULT_SYSTEM_PROMPT_FORMAT.format(role_description=GROUNDING_ROLE), + "user": DEFAULT_USER_PROMPT_FORMAT, +} + +JUDGE_LIKERT_SCALE_PROMPT_TEMPLATE = { + "system": DEFAULT_SYSTEM_PROMPT_FORMAT.format(role_description=PROGRESS_WITHOUT_CHECKLIST_ROLE), + "user": DEFAULT_USER_PROMPT_FORMAT +} + +JUDGE_THREE_CLASS_PROMPT_TEMPLATE = { + "system": DEFAULT_SYSTEM_PROMPT_FORMAT.format(role_description=PROGRESS_WITHOUT_CHECKLIST_ROLE), + "user": DEFAULT_USER_PROMPT_FORMAT +} + +JUDGE_WITH_CHECKLIST_PROMPT_TEMPLATE = { + "system": DEFAULT_SYSTEM_PROMPT_FORMAT.format(role_description=PROGRESS_WITH_CHECKLIST_ROLE), + "user": DEFAULT_USER_PROMPT_FORMAT +} + +JUDGE_OURS_PROMPT_TEMPLATE = { + "system": "", + "user": JUDGE_OURS_USER_PROMPT_FORMAT, +} + +JUDGE_OURS_WO_CHECKLIST_PROMPT_TEMPLATE = { + "system": "", + "user": JUDGE_OURS_WO_CHECKLIST_USER_PROMPT_FORMAT, +} + +JUDGE_OURS_BT_MODELING_PROMPT_TEMPLATE = { + "user": JUDGE_OURS_BT_MODELING_BASE_PROMPT+JUDGE_OURS_WITH_CHECKLIST+"\n## Agent's Response\n", + "assistant": BT_MODELING_RESPONSE_FORMAT, +} + +JUDGE_OURS_BT_MODELING_MULTIMODAL_PROMPT_TEMPLATE = { + "user": JUDGE_OURS_BT_MODELING_BASE_PROMPT+JUDGE_OURS_IMAGE_INPUT+JUDGE_OURS_WITH_CHECKLIST+"\n## Agent's Response\n", + "assistant": BT_MODELING_RESPONSE_FORMAT, +} + +JUDGE_OURS_BT_MODELING_WO_CHECKLIST_PROMPT_TEMPLATE = { + "user": JUDGE_OURS_BT_MODELING_BASE_PROMPT+"\n## Agent's Response\n", + "assistant": BT_MODELING_RESPONSE_FORMAT, +} + +JUDGE_OURS_BT_MODELING_MULTIMODAL_WO_CHECKLIST_PROMPT_TEMPLATE = { + "user": JUDGE_OURS_BT_MODELING_BASE_PROMPT+JUDGE_OURS_IMAGE_INPUT+"\n## Agent's Response\n", + "assistant": BT_MODELING_RESPONSE_FORMAT, +} diff --git a/agent/mini_bench/prompts/utils.py b/agent/mini_bench/prompts/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8694ef624237c32d1d3a722ab12547a8216984f0 --- /dev/null +++ b/agent/mini_bench/prompts/utils.py @@ -0,0 +1,18 @@ +from langchain.schema import HumanMessage, AIMessage, SystemMessage + +def convert_dict_messages(dict_messages): + message_objs = [] + for msg in dict_messages: + role = msg.get("role") + content = msg.get("content", "") + + if role == "user": + message_objs.append(HumanMessage(content=content)) + elif role == "assistant": + message_objs.append(AIMessage(content=content)) + elif role == "system": + message_objs.append(SystemMessage(content=content)) + else: + raise ValueError(f"Unknown role: {role}") + + return 
message_objs \ No newline at end of file diff --git a/agent/mini_bench/reward_agent.py b/agent/mini_bench/reward_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..6838b76e60988b436308770defb9e6cc90c6b764 --- /dev/null +++ b/agent/mini_bench/reward_agent.py @@ -0,0 +1,465 @@ +from abc import ABC, abstractmethod +import time +import requests +import json +import math +from langsmith import Client +import numpy as np +from langchain_openai import ChatOpenAI + +from .prompts import get_messages +from .prompts.judge_prompt import ( + JUDGE_OURS_BT_MODELING_PROMPT_TEMPLATE, + JUDGE_OURS_BT_MODELING_WO_CHECKLIST_PROMPT_TEMPLATE, + JUDGE_OURS_BT_MODELING_MULTIMODAL_PROMPT_TEMPLATE, + JUDGE_OURS_BT_MODELING_MULTIMODAL_WO_CHECKLIST_PROMPT_TEMPLATE +) +from .prompts.image_utils import image_to_base64_url +from .prompts.utils import convert_dict_messages + +MAX_RETRY = 3 +RETRY_SLEEP = 5 +MODEL_COST_MAPPING = { + "gpt-4o-mini": { + "input_token_cost": 0.15, + "output_token_cost": 0.6 + }, + "gpt-4o": { + "input_token_cost": 2.5, + "output_token_cost": 10 + }, +} + + +class Agent(ABC): + @abstractmethod + def generate_response(self, inputs: dict) -> str: + pass + +class BaseAgent(Agent): + def __init__(self, agent_config: dict): + self.agent_config = agent_config + self._setup() + + def _init_llm_object(self, **extra_kwargs): + config = self.agent_config + config.update(extra_kwargs) + + use_log_probs = config.get("use_log_probs", False) + if use_log_probs: + self.llm = ChatOpenAI( + model=config["model_name"], + base_url=config["base_url"], + api_key=config["api_key"], + temperature=config["temperature"], + timeout=300, + logprobs=True, + top_logprobs=10, + n=config.get('n', None) + ) + else: + self.llm = ChatOpenAI( + model=config["model_name"], + base_url=config["base_url"], + api_key=config["api_key"], + temperature=config["temperature"], + timeout=300, + n=config.get('n', None) + ) + + def _setup(self): + self._init_llm_object() + + self.temperature = self.agent_config["temperature"] + self.num_generate = self.agent_config["num_generate"] + self.use_checklist = self.agent_config.get("use_checklist", False) + self.use_multimodal = self.agent_config.get("use_multimodal", False) + + # setup cost + model_cost = MODEL_COST_MAPPING.get(self.agent_config["model_name"], None) + if model_cost and "api" in self.agent_config["base_url"]: + self.input_token_cost = model_cost["input_token_cost"] + self.output_token_cost = model_cost["output_token_cost"] + else: + self.input_token_cost = 0.0 + self.output_token_cost = 0.0 + + def generate_with_retry(self, model_input, constraint_str_list: list = None): + total_input_tokens = 0 + total_output_tokens = 0 + if self.temperature == 0: + response = self.llm.invoke(model_input) + total_input_tokens += response.response_metadata["token_usage"]["prompt_tokens"] + total_output_tokens += response.response_metadata["token_usage"]["completion_tokens"] + else: + for i in range(MAX_RETRY): + try: + response = self.llm.invoke(model_input) + total_input_tokens += response.response_metadata["token_usage"]["prompt_tokens"] + total_output_tokens += response.response_metadata["token_usage"]["completion_tokens"] + if constraint_str_list: + pass_constraint_num = 0 + for constraint_str in constraint_str_list: + if constraint_str in response.content: + pass_constraint_num += 1 + if pass_constraint_num == len(constraint_str_list): + break + else: + print(f"Agent has fomat issue, retry... 
{i+1}/{MAX_RETRY}") + else: + break + except Exception as e: + print(f"Agent returned an Error: {e}") + response = None + time.sleep(RETRY_SLEEP) + + cost = self.input_token_cost * total_input_tokens / 1000000 + self.output_token_cost * total_output_tokens / 1000000 + + if response is None: + return "", cost + else: + return response.content, cost + + def prepare_message(self, model_input: dict, prompt_type: str): + message = [] + return message + + def generate_response(self, model_input: dict, prompt_type: str, constraint_str_list: list = None,): + total_cost = 0 + response_list = [] + # prepare message + message = self.prepare_message(model_input, prompt_type) + + # n sampling + for i in range(self.num_generate): + response, cost = self.generate_with_retry(message, constraint_str_list) + response_list.append(response) + total_cost += cost + + return response_list, total_cost + + +class GroundingJudgeAgent(BaseAgent): + def __init__(self, agent_config: dict): + super().__init__(agent_config) + self._setup() + + def prepare_message(self, model_input: dict, prompt_type): + message = get_messages( + input_info=model_input, + inference_mode="judge_grounding", + prompt_type=prompt_type, + use_multimodal=self.use_multimodal, + text_obs=self.agent_config["text_obs_type"], + image_obs=self.agent_config["image_obs_type"] + ) + return message + + +class ProgressJudgeAgent(BaseAgent): + def __init__(self, agent_config: dict): + super().__init__(agent_config) + self._setup() + + def prepare_message(self, model_input: dict, prompt_type): + if self.agent_config["input_type"]=="text_only": + use_multimodal = False + text_obs = self.agent_config["text_obs_type"] + image_obs = None + elif self.agent_config["input_type"]=="image_only": + use_multimodal = True + text_obs = None + image_obs = self.agent_config["image_obs_type"] + elif self.agent_config["input_type"]=="text_image": + use_multimodal = True + text_obs = self.agent_config["text_obs_type"] + image_obs = self.agent_config["image_obs_type"] + else: + raise ValueError(f"Invalid input type: {self.agent_config['input_type']}") + + if self.agent_config["use_in_progress"]: + use_in_progress = True + else: + use_in_progress = False + + message = get_messages( + input_info=model_input, + inference_mode="judge_progress", + prompt_type=prompt_type, + use_checklist=self.use_checklist, + use_multimodal=use_multimodal, + text_obs=text_obs, + image_obs=image_obs, + use_in_progress=use_in_progress + ) + return message + + def get_judge_probs(self, logprobs: list): + # target_judge = { + # "yes": [" Yes", "Yes", "ĠYes", "ĊYes"], + # "no": [" No", "No", "ĠNo", "ĊNo"], + # "in": [" In", "In", "ĠIn", "ĊIn"] + # } + target_judge = { + "yes": [ + "ĠYes", "Yes", "ĊYes", + "Ġyes", "yes", "Ċyes", + "ĠYES", "YES", "ĊYES", + "ĠDone", "Done", "ĊDone", + "ĠCompleted", "Completed", "ĊCompleted", + "ĠCorrect", "Correct", "ĊCorrect" + ], + "no": [ + "ĠNo", "No", "ĊNo", + "ĠNO", "NO", "ĊNO", + "ĠNot", "Not", "ĊNot", + "ĠNone", "None", "ĊNone", + "ĠNope", "Nope", "ĊNope", + "ĠUn", "Un", "ĊUn", + "ĠWrong", "Wrong", "ĊWrong" + ], + "in": [ + "ĠIn", "In", "ĊIn", + "ĠPending", "Pending", "ĊPending", + "ĠPart", "Part", "ĊPart", + "ĠPartial", "Partial", "ĊPartial", + "ĠInProgress", "InProgress", "ĊInProgress" + ] + } + response_str = "" + judge_probs_list = [] + for i, log_prob in enumerate(logprobs): + # Start to find judge string + if "" in response_str: + find_judge_str = False + for judge_type in target_judge: + if log_prob["token"] in target_judge[judge_type]: + # print(log_prob) 
+ find_judge_str = True + break + if find_judge_str: + token_judge_dict = { + "yes": None, + "no": None, + "in": None + } + for token_info in log_prob["top_logprobs"]: + for judge_type in target_judge: + for judge_str in target_judge[judge_type]: + if judge_str in token_info["token"] : + if token_judge_dict[judge_type] is None: + token_judge_dict[judge_type] = math.exp(token_info["logprob"]) + else: + token_judge_dict[judge_type] += math.exp(token_info["logprob"]) + + token_judge_dict = { + "yes": math.log(token_judge_dict["yes"]) if token_judge_dict["yes"] is not None else -float('inf'), + "no": math.log(token_judge_dict["no"]) if token_judge_dict["no"] is not None else -float('inf'), + "in": math.log(token_judge_dict["in"]) if token_judge_dict["in"] is not None else -float('inf') + } + judge_probs_list.append(token_judge_dict) + + if "" in response_str: + break + + response_str += log_prob["token"] + + if len(judge_probs_list) == 0: + return [{ + "yes": 0.0, + "no": 0.0, + "in": 0.0 + }] + else: + # convert with softmax + final_judge_probs_list = [] + max_in_prob = -float('inf') + for idx, judge_probs in enumerate(judge_probs_list): + exp_logprobs = [math.exp(x) for x in [judge_probs["yes"], judge_probs["no"], judge_probs["in"]]] + sum_exp_logprobs = sum(exp_logprobs) + softmax_probs = [x / sum_exp_logprobs for x in exp_logprobs] + if softmax_probs[2] > max_in_prob: + max_in_prob = softmax_probs[2] + final_judge_probs_list.append({ + "yes": softmax_probs[0], + "no": softmax_probs[1], + "in": softmax_probs[2] + }) + return final_judge_probs_list + + def generate_probs(self, model_input: dict, prompt_type: str, n=1, temperature=None): + total_cost = 0 + # prepare message + message = self.prepare_message(model_input, prompt_type) + messages = convert_dict_messages(message) + + kwargs = {'n': n} + if temperature is not None: + kwargs['temperature'] = temperature + self._init_llm_object(**kwargs) + + try: + response = self.llm.generate([messages]) # assume single batch + finally: + print('request url: ', self.agent_config['base_url']) + + + # parse responses + response_list = [] + for generation in response.generations[0]: # assume singel batch + # parse logprobs + logprobs = generation.message.response_metadata["logprobs"]["content"] + response_list.append( + { + "response": generation.message.content, + "judge_probs": self.get_judge_probs(logprobs) + } + ) + + # calculate cost + total_input_tokens = response.llm_output["token_usage"]["prompt_tokens"] + total_output_tokens = response.llm_output["token_usage"]["completion_tokens"] + total_cost = self.input_token_cost * total_input_tokens / 1000000 + self.output_token_cost * total_output_tokens / 1000000 + + return response_list, total_cost + + +class ChecklistGenerationAgent(BaseAgent): + def __init__(self, agent_config: dict): + super().__init__(agent_config) + self._setup() + + def prepare_message(self, model_input: dict, prompt_type): + message = get_messages( + input_info=model_input, + inference_mode="checklist_generation", + prompt_type=prompt_type + ) + return message + + +class ClassifierRewardAgent(Agent): + def __init__(self, url: str, use_checklist: bool = False, use_multimodal: bool = False): + self.url = url + self.use_checklist = use_checklist + self.use_multimodal = use_multimodal + + def _process_multimodal_message(self, prompt: str, image_list: list[str]): + multimodal_message = [] + text_prompt_prefix = prompt.split("")[0] + text_prompt_suffix = prompt.split("")[1] + multimodal_message = [ + {"type": "text", "text": 
text_prompt_prefix}, + # {"type": "image_url", "image_url": {"url": image_to_base64_url(image_list[0])}}, + {"type": "image", "image": image_to_base64_url(image_list[0])}, + {"type": "text", "text": text_prompt_suffix} + ] + return multimodal_message + + def _make_query(self, user_prompt_template: dict, model_input: dict | list[dict]): + if self.use_multimodal: + tmp_user_prompt = user_prompt_template["user"].format( + **model_input + ) + user_prompt = self._process_multimodal_message(tmp_user_prompt, model_input["image_list"]) + else: + user_prompt = user_prompt_template["user"].format( + **model_input + ) + assistant_prompt = user_prompt_template["assistant"].format( + **model_input + ) + query = [ + {"role": "user", "content": user_prompt}, + {"role": "assistant", "content": assistant_prompt} + ] + return query + + def prepare_message(self, model_input: dict | list[dict], batch: bool = False): + if self.use_checklist: + if self.use_multimodal: + user_prompt_template = JUDGE_OURS_BT_MODELING_MULTIMODAL_PROMPT_TEMPLATE + else: + user_prompt_template = JUDGE_OURS_BT_MODELING_PROMPT_TEMPLATE + else: + if self.use_multimodal: + user_prompt_template = JUDGE_OURS_BT_MODELING_MULTIMODAL_WO_CHECKLIST_PROMPT_TEMPLATE + else: + user_prompt_template = JUDGE_OURS_BT_MODELING_WO_CHECKLIST_PROMPT_TEMPLATE + + if self.use_multimodal: + if batch: + message = [self._make_query(user_prompt_template, input) for input in model_input] + else: + message = [self._make_query(user_prompt_template, model_input)] + else: + if batch: + message = { + "query": [self._make_query(user_prompt_template, input) for input in model_input], + "promptts": [] + } + else: + message = { + "query": self._make_query(user_prompt_template, model_input), + "prompts": [] + } + + return message + + def get_rm_scroe(self, message: dict | list): + headers = {"Content-Type": "application/json"} + + try: + if self.use_multimodal: + response = requests.post( + self.url, + json={"messages": message}, + timeout=600 + ) + else: + response = requests.post( + self.url, + headers=headers, + data=json.dumps(message), + timeout=300 + ) + response.raise_for_status() + + response_json = response.json() + + if "rewards" not in response_json: + print(f"Error: 'rewards' key not found in API response: {response_json}") + return [] + + if "get_reward" in self.url: + # use openrlhf + return response_json["rewards"] + elif "pooling" in self.url: + # use vllm server + return response_json["reward"] + else: + # error + raise ValueError(f"Invalid URL: {self.url}") + + except requests.exceptions.Timeout: + print(f"Error: Request timed out to {self.url}") + return [] + except requests.exceptions.RequestException as e: + print(f"Error during request to {self.url}: {e}") + return [] + except json.JSONDecodeError: + print(f"Error: Failed to decode JSON response from {self.url}") + return [] + except KeyError as e: + print(f"Error: Missing key {e} in response from {self.url}") + return [] + + + def generate_response(self, model_input: dict | list[dict], batch: bool = False): + if batch: + message = self.prepare_message(model_input, batch=True) + else: + message = self.prepare_message(model_input) + rewards = self.get_rm_scroe(message) + + return rewards, 0 \ No newline at end of file diff --git a/agent/mini_bench/utils.py b/agent/mini_bench/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9bf25e45036e7ff480d2bc88bff6d00faee5bfcb --- /dev/null +++ b/agent/mini_bench/utils.py @@ -0,0 +1,269 @@ +import json +import base64 +import io +import 
html +from PIL import Image + + +def image_to_base64_url(image: str | Image.Image): + if isinstance(image, str): + with open(image, "rb") as f: + image = f.read() + elif isinstance(image, Image.Image): + if image.mode in ("RGBA", "LA"): + image = image.convert("RGB") + with io.BytesIO() as buffer: + image.save(buffer, format="PNG") + image = buffer.getvalue() + else: + raise ValueError(f"Invalid image type: {type(image)}") + + return "data:image/png;base64," + base64.b64encode(image).decode("utf-8") + + +def load_json(file_path: str) -> dict: + with open(file_path, "r") as f: + return json.load(f) + +def save_json(data: dict, file_path: str): + with open(file_path, "w") as f: + json.dump(data, f, indent=4) + +def str_to_bool(s: str) -> bool: + if s.lower() in ["true", "1", "yes", "y"]: + return True + elif s.lower() in ["false", "0", "no", "n"]: + return False + else: + raise ValueError(f"Invalid boolean string: {s}") + + +def create_html_report(json_path, html_path, checklist_generation=False): + """ + Reads the given JSON result file and generates a filterable HTML report. + + Args: + json_path (str): Path to the input JSON file. + html_path (str): Path to the output HTML file. + """ + try: + with open(json_path, 'r', encoding='utf-8') as f: + data = json.load(f) + except FileNotFoundError: + print(f"Error: JSON file not found - {json_path}") # Error message in English + return + except json.JSONDecodeError: + print(f"Error: JSON file parsing error - {json_path}") # Error message in English + return + except Exception as e: + print(f"Unexpected error during data loading: {e}") # Error message in English + return + + # Extract unique Task IDs and sort them + task_ids = sorted(list(set(item.get("task_id") for item in data if item.get("task_id") is not None))) + + html_content = """ + + + + + + Benchmark Results Report + + + +

Benchmark Results Report

+ + +
+ + +
+ + +
+""" + + # Process each Task/Step data + for i, step_data in enumerate(data): + task_id = step_data.get("task_id", "N/A") + step_id = step_data.get("step_id", "N/A") + intent = step_data.get("intent", "N/A") + start_url = step_data.get("start_url", "N/A") + gt_checklist = step_data.get("gt_checklist", "N/A") + generated_checklist = step_data.get("generated_checklist", None) + trajectory = step_data.get("trajectory", "N/A") + text_observation = step_data.get("text_observation", "N/A") + source_name = step_data.get("source_name", "") + + # Wrap each Task/Step in a container with a unique ID (hidden initially) + html_content += f""" +
+
+

Task ID: {html.escape(str(task_id))} | Step ID: {html.escape(str(step_id))} {f'({html.escape(source_name)})' if source_name else ''}

+

Intent:

+

{html.escape(intent)}

+

Start URL: {html.escape(start_url)}

+ +

Ground Truth Checklist:

+
{html.escape(gt_checklist)}
+""" + if checklist_generation and generated_checklist is not None: + html_content += f""" +
+ Generated Checklist (Click to expand/collapse) +
{html.escape(str(generated_checklist))}
+
+""" + + html_content += f""" +
+ Trajectory (Click to expand/collapse) +
{html.escape(trajectory)}
+
+ +
+ Text Observation (Click to expand/collapse) +
{html.escape(text_observation)}
+
+
+""" + + # Chosen Responses + if 'chosen' in step_data and step_data['chosen']: + html_content += '

Chosen Responses:

' + for choice_block in step_data['chosen']: + thought = choice_block.get('thought', 'N/A') + action = choice_block.get('action', 'N/A') + responses = choice_block.get('response', []) + scores = choice_block.get('score', []) + + # Add Thought and Action information + html_content += f""" +
+

Thought:

+
{html.escape(thought)}
+

Action:

+
{html.escape(action)}
+
""" + + # Loop through responses and create toggles + for idx, (response, score) in enumerate(zip(responses, scores)): + html_content += f""" +
+ Judge Response {idx + 1}: {html.escape(str(score))} +
{html.escape(str(response))}
+
""" + html_content += '
' # End chosen-section + + # Rejected Responses + if 'rejected' in step_data and step_data['rejected']: + html_content += '

Rejected Responses:

' + for rejection_block in step_data['rejected']: + thought = rejection_block.get('thought', 'N/A') + action = rejection_block.get('action', 'N/A') + responses = rejection_block.get('response', []) + scores = rejection_block.get('score', []) + + # Add Thought and Action information + html_content += f""" +
+

Thought:

+
{html.escape(thought)}
+

Action:

+
{html.escape(action)}
+
""" + + # Loop through responses and create toggles + for idx, (response, score) in enumerate(zip(responses, scores)): + html_content += f""" +
+ Judge Response {idx + 1}: {html.escape(str(score))} +
{html.escape(str(response))}
+
""" + html_content += '
' # End rejected-section + + html_content += """ +
+
+""" + + # Finalize HTML and add JavaScript + html_content += """ +
+ + + + + +""" + + # Save the HTML file + try: + with open(html_path, 'w', encoding='utf-8') as f: + f.write(html_content) + print(f"Completed: HTML report created at {html_path}") + except IOError: + print(f"Error: Failed to write HTML file - {html_path}") + except Exception as e: + print(f"Unexpected error during HTML file saving: {e}") + +# --- Example Usage --- +# input_json_file = 'path/to/your/results.json' +# output_html_file = 'trajectory_report.html' +# create_html_report(input_json_file, output_html_file) \ No newline at end of file diff --git a/agent/reward.py b/agent/reward.py new file mode 100644 index 0000000000000000000000000000000000000000..2337bc24110a83634c0ec9d6feba3edde13b85f8 --- /dev/null +++ b/agent/reward.py @@ -0,0 +1,96 @@ +import time +from typing import List, Dict, Any, Optional, Union +import numpy as np +from .mini_bench.reward_agent import ProgressJudgeAgent +from .reward_postprocessor import REWARD_PROCESSORS, REWARD_PROCESSOR_N_SAMPLES, extract_judge_hash +import json +import os +from concurrent.futures import ThreadPoolExecutor, as_completed + +def _process_unit(idx, unit, configs, n_samples, reward_processor, max_retries=5): + """하나의 unit을 처리해 (idx, reward, thought)를 돌려준다.""" + agent = ProgressJudgeAgent(configs) + current_temperature = configs["temperature"] + + rewards = [] + n_err = 0 + retry_count = 0 + judge_hash_count_thought = {} + + while len(rewards) < n_samples and retry_count < max_retries: + # 외부 API 호출 + responses, _ = agent.generate_probs( + unit, "ours", n=n_samples - len(rewards), temperature=current_temperature + ) + + for response in responses: + content = response["response"] + thought = content # 전체를 로그로 저장 + reward = REWARD_PROCESSORS[reward_processor](response) + rewards.append(reward) + + if np.isnan(reward) or reward is None: + n_err += 1 + else: + judge_hash = extract_judge_hash(response) + judge_hash_count_thought[judge_hash] = (judge_hash_count_thought.get(judge_hash, (0, None))[0] + 1, thought) + + if n_err > 0: + # 실패 시 온도를 높여 재시도 + if n_samples == 1: + current_temperature = 0.5 + retry_count += 1 + + reward = np.nanmean(rewards) + if np.isnan(reward): + print(f"[idx={idx}] Warning: reward is NaN after retries -> set 0") + reward = 0.0 + print(judge_hash_count_thought) + thought = max(judge_hash_count_thought.values(), key=lambda x: x[0])[1] + + return idx, reward, thought + + +def get_ar_reward(dataset, base_url, model_name, reward_processor='avg_logits', max_workers=8): + """원본 get_ar_reward를 스레드 버전으로 교체.""" + n_samples = REWARD_PROCESSOR_N_SAMPLES[reward_processor] + + temperature = 0.5 if n_samples > 1 else 0.0 + + configs = { + "model_name": model_name, + "base_url": base_url, + "api_key": "empty", + "temperature": temperature, + "num_generate": 1, + "use_checklist": True, + "input_type": "text_only", + "text_obs_type": "axtree", + "image_obs_type": "som", + "use_in_progress": True, + "use_multimodal": False, + "use_log_probs": True, + } + + t_start = time.time() + results = [None] * len(dataset) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [ + executor.submit( + _process_unit, idx, unit, configs, n_samples, reward_processor + ) + for idx, unit in enumerate(dataset) + ] + + for fut in as_completed(futures): + idx, reward, thought = fut.result() + results[idx] = (reward, thought) + + # 순서 보존된 리스트로 분리 + final_rewards = [float(r) for r, _ in results] + thoughts = [t for _, t in results] + + print(f"Time taken (threaded): {time.time() - t_start:.2f} s") + return final_rewards, thoughts + diff 
--git a/agent/reward_postprocessor.py b/agent/reward_postprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..4124dba8cf12745a64d756002c3c5e99a17f1123 --- /dev/null +++ b/agent/reward_postprocessor.py @@ -0,0 +1,41 @@ +import numpy as np +import re + + +def extract_judge_hash(response): + """ + checklist 별로 yes, in, no를 판단한 정보를 hash 형태로 변환하여 반환 + """ + content = response['response'] + + try: + judge_content = content.lower().replace(' ', '').split('')[1].split('')[0] + except: + import traceback + traceback.print_exc() + return None + pattern = r":yes|:inprogress|:no" + matches = re.findall(pattern, judge_content) + matches = [{':yes': 'y', ':inprogress': 'i', ':no': 'n'}[match] for match in matches] + return ''.join(matches) + +def average_logits(response): + """ + yes, in, no를 logits 레벨에서 계산. + """ + judge_probs = response['judge_probs'] + + yes_ = np.mean([r['yes'] for r in judge_probs]) + in_ = np.mean([r['in'] for r in judge_probs]) + + reward = yes_ + 0.5 * in_ + return reward + + +REWARD_PROCESSORS = { + 'avg_logits': average_logits +} + +REWARD_PROCESSOR_N_SAMPLES = { + 'avg_logits': 5 +} \ No newline at end of file diff --git a/app.py b/app.py index 4027860d2a81cd7a3b2b1abc5b4ba1801bf4988f..3f6f650e1c8e873fea880b1e697dd6a80a4266cd 100644 --- a/app.py +++ b/app.py @@ -1,20 +1,25 @@ +# import os +# import subprocess + +# # Install BrowserGym dependencies before running the main application +# def install_browsergym(): +# try: +# print("Installing BrowserGym dependencies...") +# subprocess.run("cd BrowserGym && make install", shell=True, check=True) +# print("BrowserGym installation completed successfully") +# except subprocess.CalledProcessError as e: +# print(f"Error installing BrowserGym: {e}") +# raise + +# install_browsergym() + import os -import json -import base64 -import io -import argparse import logging import gradio as gr import openai -import gymnasium as gym -import browsergym.core -from PIL import Image -import numpy as np -from browsergym.core.action.highlevel import HighLevelActionSet -from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html -from browsergym.experiments import Agent -from dotenv import load_dotenv -import cv2 +import multiprocessing + +from process_run import process_run # Configure logging logging.basicConfig( @@ -22,433 +27,103 @@ logging.basicConfig( format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler(), - logging.FileHandler('browser_agent.log') ] ) logger = logging.getLogger(__name__) - -load_dotenv() +logger.setLevel('INFO') # Set your OpenAI API key openai.api_key = os.getenv("OPENAI_API_KEY") + # Example instructions to display EXAMPLES = [ - "Search for the latest AI news on Google", - "Go to Wikipedia and find the population of Seoul", - "Open YouTube and play the top trending video", + "When did the solar system form? 
Find on wikipedia.", + "Find the rating of Monopoly (1935) on boardgamegeek.com", ] -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise argparse.ArgumentTypeError("Boolean value expected.") - -def parse_args(): - parser = argparse.ArgumentParser(description="Run BrowserGym web agent.") - parser.add_argument( - "--model_name", - type=str, - default="gpt-4o", - help="OpenAI model name.", - ) - parser.add_argument( - "--start_url", - type=str, - default="https://www.duckduckgo.com", - help="Starting URL for the openended task.", - ) - parser.add_argument( - "--visual_effects", - type=str2bool, - default=True, - help="Add visual effects when the agent performs actions.", - ) - parser.add_argument( - "--use_html", - type=str2bool, - default=False, - help="Use HTML in the agent's observation space.", - ) - parser.add_argument( - "--use_axtree", - type=str2bool, - default=True, - help="Use AXTree in the agent's observation space.", - ) - parser.add_argument( - "--use_screenshot", - type=str2bool, - default=False, - help="Use screenshot in the agent's observation space.", - ) - parser.add_argument( - "--log_level", - type=str, - default="INFO", - choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], - help="Set the logging level.", - ) - return parser.parse_args() - -def image_to_jpg_base64_url(image: np.ndarray | Image.Image): - """Convert a numpy array to a base64 encoded image url.""" - if isinstance(image, np.ndarray): - image = Image.fromarray(image) - if image.mode in ("RGBA", "LA"): - image = image.convert("RGB") - - with io.BytesIO() as buffer: - image.save(buffer, format="JPEG") - image_base64 = base64.b64encode(buffer.getvalue()).decode() - - return f"data:image/jpeg;base64,{image_base64}" - -class BrowserAgent(Agent): - def obs_preprocessor(self, obs: dict) -> dict: - return { - "chat_messages": obs["chat_messages"], - "screenshot": obs["screenshot"], - "goal_object": obs["goal_object"], - "last_action": obs["last_action"], - "last_action_error": obs["last_action_error"], - "open_pages_urls": obs["open_pages_urls"], - "open_pages_titles": obs["open_pages_titles"], - "active_page_index": obs["active_page_index"], - "axtree_txt": flatten_axtree_to_str(obs["axtree_object"], filter_visible_only=True, extra_properties=obs['extra_element_properties']), - "pruned_html": prune_html(flatten_dom_to_str(obs["dom_object"])), - } - - def __init__(self, model_name: str = "gpt-4o", use_html: bool = False, use_axtree: bool = True, use_screenshot: bool = False): - super().__init__() - logger.info(f"Initializing BrowserAgent with model: {model_name}") - logger.info(f"Observation space: HTML={use_html}, AXTree={use_axtree}, Screenshot={use_screenshot}") - - self.model_name = model_name - self.use_html = use_html - self.use_axtree = use_axtree - self.use_screenshot = use_screenshot - - if not (use_html or use_axtree): - raise ValueError("Either use_html or use_axtree must be set to True.") - - self.openai_client = openai.OpenAI() - - self.action_set = HighLevelActionSet( - subsets=["chat", "tab", "nav", "bid", "infeas"], - strict=False, - multiaction=False, - demo_mode="default" - ) - self.action_history = [] - - def get_action(self, obs: dict) -> tuple[str, dict]: - logger.debug("Preparing action request") - - system_msgs = [{ - "type": "text", - "text": """\ -# Instructions - -You are a UI Assistant, your goal is to help the user perform tasks using a 
web browser. You can -communicate with the user via a chat, to which the user gives you instructions and to which you -can send back messages. You have access to a web browser that both you and the user can see, -and with which only you can interact via specific commands. - -Review the instructions from the user, the current state of the page and all other information -to find the best possible next action to accomplish your goal. Your answer will be interpreted -and executed by a program, make sure to follow the formatting instructions. -""" - }] - - user_msgs = [] - - # Add chat messages - user_msgs.append({ - "type": "text", - "text": "# Chat Messages\n" - }) - for msg in obs["chat_messages"]: - if msg["role"] in ("user", "assistant", "infeasible"): - user_msgs.append({ - "type": "text", - "text": f"- [{msg['role']}] {msg['message']}\n" - }) - logger.debug(f"Added chat message: [{msg['role']}] {msg['message']}") - elif msg["role"] == "user_image": - user_msgs.append({"type": "image_url", "image_url": msg["message"]}) - logger.debug("Added user image message") - - # Add open tabs info - user_msgs.append({ - "type": "text", - "text": "# Currently open tabs\n" - }) - for page_index, (page_url, page_title) in enumerate( - zip(obs["open_pages_urls"], obs["open_pages_titles"]) - ): - user_msgs.append({ - "type": "text", - "text": f"""\ -Tab {page_index}{" (active tab)" if page_index == obs["active_page_index"] else ""} - Title: {page_title} - URL: {page_url} -""" - }) - logger.debug(f"Added tab info: {page_title} ({page_url})") - - # Add accessibility tree if enabled - if self.use_axtree: - user_msgs.append({ - "type": "text", - "text": f"""\ -# Current page Accessibility Tree - -{obs["axtree_txt"]} - -""" - }) - logger.debug("Added accessibility tree") - - # Add HTML if enabled - if self.use_html: - user_msgs.append({ - "type": "text", - "text": f"""\ -# Current page DOM - -{obs["pruned_html"]} - -""" - }) - logger.debug("Added HTML DOM") - - # Add screenshot if enabled - if self.use_screenshot: - user_msgs.append({ - "type": "text", - "text": "# Current page Screenshot\n" - }) - user_msgs.append({ - "type": "image_url", - "image_url": { - "url": image_to_jpg_base64_url(obs["screenshot"]), - "detail": "auto" - } - }) - logger.debug("Added screenshot") - - # Add action space description - user_msgs.append({ - "type": "text", - "text": f"""\ -# Action Space - -{self.action_set.describe(with_long_description=False, with_examples=True)} - -Here are examples of actions with chain-of-thought reasoning: - -I now need to click on the Submit button to send the form. I will use the click action on the button, which has bid 12. -```click("12")``` - -I found the information requested by the user, I will send it to the chat. -```send_msg_to_user("The price for a 15\\" laptop is 1499 USD.")``` - -""" - }) - - # Add action history and errors - if self.action_history: - user_msgs.append({ - "type": "text", - "text": "# History of past actions\n" - }) - for action in self.action_history: - user_msgs.append({ - "type": "text", - "text": f"\n{action}\n" - }) - logger.debug(f"Added past action: {action}") - - if obs["last_action_error"]: - user_msgs.append({ - "type": "text", - "text": f"""\ -# Error message from last action - -{obs["last_action_error"]} - -""" - }) - logger.warning(f"Last action error: {obs['last_action_error']}") - - # Ask for next action - user_msgs.append({ - "type": "text", - "text": """\ -# Next action - -You will now think step by step and produce your next best action. 
Reflect on your past actions, any resulting error message, and the current state of the page before deciding on your next action. -""" - }) - - # Log the full prompt for debugging - prompt_text_strings = [] - for message in system_msgs + user_msgs: - match message["type"]: - case "text": - prompt_text_strings.append(message["text"]) - case "image_url": - image_url = message["image_url"] - if isinstance(message["image_url"], dict): - image_url = image_url["url"] - if image_url.startswith("data:image"): - prompt_text_strings.append( - "image_url: " + image_url[:30] + "... (truncated)" - ) - else: - prompt_text_strings.append("image_url: " + image_url) - case _: - raise ValueError( - f"Unknown message type {repr(message['type'])} in the task goal." - ) - full_prompt_txt = "\n".join(prompt_text_strings) - logger.debug(full_prompt_txt) - - # Query OpenAI model - logger.info("Sending request to OpenAI") - response = self.openai_client.chat.completions.create( - model=self.model_name, - messages=[ - {"role": "system", "content": system_msgs}, - {"role": "user", "content": user_msgs} - ] - ) - action = response.choices[0].message.content - logger.info(f"Received action from OpenAI: {action}") - self.action_history.append(action) - return action, {} - -def run_agent(instruction: str, model_name: str = "gpt-4o", start_url: str = "https://www.duckduckgo.com", - use_html: bool = False, use_axtree: bool = True, use_screenshot: bool = False): - logger.info(f"Starting agent with instruction: {instruction}") - logger.info(f"Configuration: model={model_name}, start_url={start_url}") - - trajectory = [] - agent = BrowserAgent( - model_name=model_name, - use_html=use_html, - use_axtree=use_axtree, - use_screenshot=use_screenshot - ) - - # Initialize BrowserGym environment - logger.info("Initializing BrowserGym environment") - env = gym.make( - "browsergym/openended", - task_kwargs={ - "start_url": start_url, - "task": "openended", # Required task parameter - "goal": instruction, - }, - wait_for_user_message=True - ) - obs, info = env.reset() - logger.info("Environment initialized") - - # Send user instruction to the environment - logger.info("Sending user instruction to environment") - obs, reward, terminated, truncated, info = env.step({ - "type": "send_msg_to_user", - "message": instruction - }) - processed_obs = agent.obs_preprocessor(obs) - logger.info(f"Obs: {processed_obs.keys()}") - logger.info(f"axtree_txt: {processed_obs['axtree_txt']}") - - # 초기 상태 yield - trajectory.append((obs['screenshot'], "Initial state")) - yield obs['screenshot'], trajectory.copy() - - try: - step_count = 0 - while True: - logger.info(f"Step {step_count}: Getting next action") - # Get next action from agent - action, _ = agent.get_action(processed_obs) - - # Execute action - logger.info(f"Step {step_count}: Executing action: {action}") - obs, reward, terminated, truncated, info = env.step(action) - processed_obs = agent.obs_preprocessor(obs) - - # trajectory에 numpy array 직접 저장 - trajectory.append((obs['screenshot'], action)) - logger.info(f"Step {step_count}: Saved screenshot and updated trajectory") - step_count += 1 - - # 매 step마다 yield - yield obs['screenshot'], trajectory.copy() - - if terminated or truncated: - logger.info(f"Episode ended: terminated={terminated}, truncated={truncated}") - break - - finally: - logger.info("Closing environment") - env.close() +URL_EXAMPLES = [ + "about:blank", + "https://www.wikipedia.org", + "https://www.boardgamegeek.com" +] def main(): - args = parse_args() - - # Set logging level 
from command line argument - logger.setLevel(getattr(logging, args.log_level)) logger.info("Starting BrowserGym web agent") - logger.info(f"Arguments: {args}") - with gr.Blocks(title="🎯 Web Agent Demo with BrowserGym & OpenAI") as demo: - gr.Markdown("# Web Agent Demo (BrowserGym + OpenAI)") + with gr.Blocks(title="WebShephered Demo") as demo: + # Add CSS for outlined groups + gr.Markdown("# WebShephered Demo") with gr.Row(): - with gr.Column(scale=1): - gr.Markdown("## Examples") - gr.Examples( - examples=[[e] for e in EXAMPLES], - inputs=[gr.Textbox(label="Instruction")], - cache_examples=False, - ) with gr.Column(scale=2): - instruction = gr.Textbox( - label="Enter your instruction here", - placeholder="E.g., 'Search for AI then click #result-stats'", - lines=2, - ) + with gr.Column(): + instruction = gr.Textbox( + label="Instruction", + placeholder="Enter your instruction here", + lines=2, + ) + gr.Examples( + examples=[[e] for e in EXAMPLES], + inputs=instruction, + cache_examples=False, + ) + + gr.Markdown("\n\n") + + with gr.Column(): + start_url = gr.Textbox( + label="Starting URL", + placeholder="URL to start the browser at", + value="about:blank" + ) + gr.Examples( + examples=URL_EXAMPLES, + inputs=start_url, + cache_examples=False, + ) + + gr.Markdown("\n\n") + model_name = gr.Dropdown( - label="Model", - choices=["gpt-4o", "gpt-4o-mini"], - value=args.model_name + label="Agent Model", + choices=["gpt-4o"], + value="gpt-4o" ) - run_btn = gr.Button("Run Agent") - browser_view = gr.Image(label="Browser View") + run_btn = gr.Button("Run Demo") + + gr.Markdown("---") + + with gr.Column(): + gr.Markdown("## Current State") + state_view = gr.Markdown() + browser_view = gr.Image(label="Browser View") + + gr.Markdown("### Task Checklist from WebShephered") + checklist_view = gr.Markdown() + + gr.Markdown("### Action Selection in current step") + with gr.Row() as rm_row: + rm_cards_container = gr.HTML() with gr.Column(scale=2): - gr.Markdown("## Trajectory History") - trajectory_gallery = gr.Gallery(label="Action & State", columns=2) + gr.Markdown("## Trajectory") + trajectory_container = gr.HTML() # Placeholder for our custom trajectory component + + run_btn.click( - fn=run_agent, - inputs=[instruction, model_name], - outputs=[browser_view, trajectory_gallery], + fn=process_run, + inputs=[instruction, model_name, start_url], + outputs=[state_view, browser_view, checklist_view, rm_cards_container, trajectory_container], api_name="run_agent", - show_progress=True, - concurrency_limit=1 + concurrency_limit=32, + show_progress=True ) logger.info("Launching Gradio interface") - demo.launch(share=True) + # Set max_threads to allow multiple concurrent requests + demo.launch(share=True, max_threads=32) if __name__ == "__main__": + # Add support for multiprocessing on Windows + multiprocessing.freeze_support() main() diff --git a/browser_agent.py b/browser_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..389a0da2a48b912c8f6ea42f9de119eb0414a403 --- /dev/null +++ b/browser_agent.py @@ -0,0 +1,282 @@ +import logging +import os +from typing import Any, List, Tuple + +from browsergym.core.action.highlevel import HighLevelActionSet +from browsergym.utils.obs import ( + flatten_axtree_to_str, + flatten_dom_to_str, + prune_html, +) +from browsergym.experiments import Agent + +from utils import remove_inline_comments_safe, image_to_jpg_base64_url + +import openai + + +logger = logging.getLogger(__name__) + +openai.api_key = os.getenv("OPENAI_API_KEY") + + + +class 
BrowserAgent(Agent): + def obs_preprocessor(self, obs: dict) -> dict: + return { + "chat_messages": obs["chat_messages"], + "som_screenshot": obs["som_screenshot"], + "goal_object": obs["goal_object"], + "last_action": obs["last_action"], + "last_action_error": obs["last_action_error"], + "open_pages_urls": obs["open_pages_urls"], + "open_pages_titles": obs["open_pages_titles"], + "active_page_index": obs["active_page_index"], + "axtree_txt": flatten_axtree_to_str(obs["axtree_object"], filter_visible_only=True, extra_properties=obs['extra_element_properties'], filter_som_only=True), + "pruned_html": prune_html(flatten_dom_to_str(obs["dom_object"])), + } + + def __init__(self, model_name: str = "gpt-4o", use_html: bool = False, use_axtree: bool = True, use_screenshot: bool = False): + super().__init__() + logger.info(f"Initializing BrowserAgent with model: {model_name}") + logger.info(f"Observation space: HTML={use_html}, AXTree={use_axtree}, Screenshot={use_screenshot}") + + self.model_name = model_name + self.use_html = use_html + self.use_axtree = use_axtree + self.use_screenshot = use_screenshot + + if not (use_html or use_axtree): + raise ValueError("Either use_html or use_axtree must be set to True.") + + self.openai_client = openai.OpenAI() + + self.action_set = HighLevelActionSet( + subsets=["chat", "tab", "nav", "bid", "infeas"], + strict=False, + multiaction=False, + demo_mode="default" + ) + self.action_history = [] + + def get_action(self, obs: dict) -> tuple[str, dict]: + logger.debug("Preparing action request") + + system_msgs = [{ + "type": "text", + "text": """\ +# Instructions + +You are a UI Assistant, your goal is to help the user perform tasks using a web browser. You can +communicate with the user via a chat, to which the user gives you instructions and to which you +can send back messages. You have access to a web browser that both you and the user can see, +and with which only you can interact via specific commands. + +Review the instructions from the user, the current state of the page and all other information +to find the best possible next action to accomplish your goal. Your answer will be interpreted +and executed by a program, make sure to follow the formatting instructions. 
+""" + }] + + user_msgs = [] + + # Add chat messages + user_msgs.append({ + "type": "text", + "text": "# Chat Messages\n" + }) + for msg in obs["chat_messages"]: + if msg["role"] in ("user", "assistant", "infeasible"): + user_msgs.append({ + "type": "text", + "text": f"- [{msg['role']}] {msg['message']}\n" + }) + logger.debug(f"Added chat message: [{msg['role']}] {msg['message']}") + elif msg["role"] == "user_image": + user_msgs.append({"type": "image_url", "image_url": msg["message"]}) + logger.debug("Added user image message") + + # Add open tabs info + user_msgs.append({ + "type": "text", + "text": "# Currently open tabs\n" + }) + for page_index, (page_url, page_title) in enumerate( + zip(obs["open_pages_urls"], obs["open_pages_titles"]) + ): + user_msgs.append({ + "type": "text", + "text": f"""\ +Tab {page_index}{" (active tab)" if page_index == obs["active_page_index"] else ""} + Title: {page_title} + URL: {page_url} +""" + }) + logger.debug(f"Added tab info: {page_title} ({page_url})") + + # Add accessibility tree if enabled + if self.use_axtree: + user_msgs.append({ + "type": "text", + "text": f"""\ +# Current page Accessibility Tree + +{obs["axtree_txt"]} + +""" + }) + logger.debug("Added accessibility tree") + + # Add HTML if enabled + if self.use_html: + user_msgs.append({ + "type": "text", + "text": f"""\ +# Current page DOM + +{obs["pruned_html"]} + +""" + }) + logger.debug("Added HTML DOM") + + # Add screenshot if enabled + if self.use_screenshot: + user_msgs.append({ + "type": "text", + "text": "# Current page Screenshot\n" + }) + user_msgs.append({ + "type": "image_url", + "image_url": { + "url": image_to_jpg_base64_url(obs["som_screenshot"]), + "detail": "auto" + } + }) + logger.debug("Added screenshot") + + # Add action space description + user_msgs.append({ + "type": "text", + "text": f"""\ +# Action Space + +{self.action_set.describe(with_long_description=False, with_examples=True)} + +Here are examples of actions with chain-of-thought reasoning: + +I now need to click on the Submit button to send the form. I will use the click action on the button, which has bid 12. +```click("12")``` + +I found the information requested by the user, I will send it to the chat. +```send_msg_to_user("The price for a 15\\" laptop is 1499 USD.")``` + +""" + }) + + # Add action history and errors + if self.action_history: + user_msgs.append({ + "type": "text", + "text": "# History of past actions\n" + }) + for action in self.action_history: + user_msgs.append({ + "type": "text", + "text": f"\n{action}\n" + }) + logger.debug(f"Added past action: {action}") + + if obs["last_action_error"]: + user_msgs.append({ + "type": "text", + "text": f"""\ +# Error message from last action + +{obs["last_action_error"]} + +""" + }) + logger.warning(f"Last action error: {obs['last_action_error']}") + + # Ask for next action + user_msgs.append({ + "type": "text", + "text": """\ +# Next action + +You will now think step by step and produce your next best action. Reflect on your past actions, any resulting error message, and the current state of the page before deciding on your next action. +Note: You might use 'goto' action if you're in a blank page. 
+""" + }) + + # Log the full prompt for debugging + prompt_text_strings = [] + for message in system_msgs + user_msgs: + match message["type"]: + case "text": + prompt_text_strings.append(message["text"]) + case "image_url": + image_url = message["image_url"] + if isinstance(message["image_url"], dict): + image_url = image_url["url"] + if image_url.startswith("data:image"): + prompt_text_strings.append( + "image_url: " + image_url[:30] + "... (truncated)" + ) + else: + prompt_text_strings.append("image_url: " + image_url) + case _: + raise ValueError( + f"Unknown message type {repr(message['type'])} in the task goal." + ) + full_prompt_txt = "\n".join(prompt_text_strings) + logger.debug(full_prompt_txt) + + # Query OpenAI model + logger.info("Sending request to OpenAI") + response = self.openai_client.chat.completions.create( + model=self.model_name, + messages=[ + {"role": "system", "content": system_msgs}, + {"role": "user", "content": user_msgs} + ], + n=20, + temperature=0.8 + ) + parses = [] + for i, choice in enumerate(response.choices): + response = choice.message.content + try: + parses.append({ + 'response': response, + 'thought': response.split('```')[0].strip(), + 'action': remove_inline_comments_safe(response.split('```')[1].strip('`').strip().strip('`').strip()), + }) + except Exception as e: + logger.error(f"Error parsing action: {e}") + logger.error(f"Response: {response}") + logger.error(f"Choice: {choice}") + logger.error(f"Index: {i}") + logger.error(f"Response: {response}") + + candidates = self.get_top_k_actions(parses) + logger.info(f"Received action from OpenAI: {[cand['action'] for cand in candidates]}") + return candidates, {} + + def get_top_k_actions(self, parses, k=3): + count_dict = {} + action_to_parsed = {} + for parsed in parses: + action = parsed["action"] + if action in count_dict: + count_dict[action] += 1 + else: + count_dict[action] = 1 + action_to_parsed[action] = parsed.copy() + + # Get the top_k most frequent actions + sorted_actions = sorted(count_dict.items(), key=lambda x: x[1], reverse=True) + top_k_actions = [action_to_parsed[action] for action, _ in sorted_actions[:k]] + + return top_k_actions \ No newline at end of file diff --git a/process_run.py b/process_run.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a2dd217ab6fcecd566df280d4ae6621e7fcd87 --- /dev/null +++ b/process_run.py @@ -0,0 +1,301 @@ +from pathlib import Path +import multiprocessing +import logging +from PIL import Image +import io +import base64 +import numpy as np +import gymnasium as gym +import os + +from agent.checklist import generate_checklist +from agent.reward import get_ar_reward + +from browser_agent import BrowserAgent + + +logger = logging.getLogger(__name__) +logger.setLevel('INFO') + +templates_dir = Path(__file__).parent / "templates" +CSS_RM_CARDS: str = (templates_dir / "rm_cards.css").read_text() +CSS_TRAJECTORY: str = (templates_dir / "trajectory.css").read_text() +CARD_HTML_TEMPLATE: str = (templates_dir / "card.html").read_text() + +RM_BASE_URL = os.environ['RM_BASE_URL'] +RM_MODEL_NAME = os.environ['RM_MODEL_NAME'] + +def return_state(state, screenshot=None): + return state, None, None, screenshot, None + +def run_agent(instruction: str, model_name: str = "gpt-4o", start_url: str = "about:blank", + use_html: bool = False, use_axtree: bool = True, use_screenshot: bool = False, max_steps: int = 20): + logger.info(f"Starting agent with instruction: {instruction}") + logger.info(f"Configuration: model={model_name}, 
start_url={start_url}") + + trajectory = [] + trajectory_str = '' + agent = BrowserAgent( + model_name=model_name, + use_html=use_html, + use_axtree=use_axtree, + use_screenshot=use_screenshot + ) + + # Initialize BrowserGym environment + logger.info("Initializing BrowserGym environment") + yield return_state("## Initializing BrowserGym environment...", None) + env = gym.make( + "browsergym/openended", + task_kwargs={ + "start_url": start_url, + "goal": instruction, + }, + wait_for_user_message=True + ) + obs, info = env.reset() + logger.info("Environment initialized") + + # Send user instruction to the environment + logger.info("Sending user instruction to environment") + obs, reward, terminated, truncated, info = env.step({ + "type": "send_msg_to_user", + "message": instruction + }) + processed_obs = agent.obs_preprocessor(obs) + logger.info(f"Obs: {processed_obs.keys()}") + logger.info(f"axtree_txt: {processed_obs['axtree_txt']}") + + yield return_state("## Generating checklist...", obs['som_screenshot']) + checklist = generate_checklist(intent=instruction, start_url=start_url, text_observation=processed_obs['axtree_txt']) + + # yield initial state + current_screenshot = obs['som_screenshot'].copy() + yield "## Rollout actions from policy...", checklist, [], current_screenshot, trajectory.copy() + + try: + step_count = 0 + while step_count < max_steps: + logger.info(f"Step {step_count}: Getting next action") + # Get next action from agent + candidates, _ = agent.get_action(processed_obs) + + yield return_state(f"## Rewarding actions...", current_screenshot) + + total_rewards, total_thoughts = get_ar_reward( + dataset=[ + { + 'text_observation': processed_obs['axtree_txt'], + 'intent': instruction, + 'trajectory': trajectory_str, + 'current_url': processed_obs['open_pages_urls'][processed_obs['active_page_index'][0]], + 'checklist': checklist, + 'thought': cand['thought'], + 'action': cand['action'], + } for cand in candidates + ], + base_url=RM_BASE_URL, + model_name=RM_MODEL_NAME, + ) + + # process rewards + diff_reward = abs(max(total_rewards) - total_rewards[0]) # reward difference between actions with the highest reward and the most frequent. 
+ if diff_reward <= 0.01: + logger.info(f"diff_reward: {diff_reward} -> most frequent action") + max_index = 0 # most frequent action + else: + logger.info(f"diff_reward: {diff_reward} -> highest reward") + max_index = total_rewards.index(max(total_rewards)) # highest reward + + # sort by reward + sorted_indices = sorted(list(enumerate(total_rewards)), key=lambda x: (-1 if x[0] == max_index else 0, -x[1])) + new_order = [idx for idx, _ in sorted_indices] + candidates = [candidates[idx] for idx in new_order] + total_rewards = [total_rewards[idx] for idx in new_order] + total_thoughts = [total_thoughts[idx] for idx in new_order] + + best_cand = candidates[0] + + agent.action_history.append(best_cand['response']) + + action = best_cand['action'] + + # processing action + step_info = { + 'thought': best_cand['thought'], + 'action': action + } + current_cards = [{'thought': cand['thought'], 'action': cand['action'], 'feedback': feedback, 'reward': round(reward, 2)} for idx, (cand, reward, feedback) in enumerate(zip(candidates, total_rewards, total_thoughts))] + + trajectory_str += f'THOUGHT {step_count+1}: {step_info["thought"]}\nACTION {step_count+1}: {step_info["action"]}\n\n' + + # Execute action + logger.info(f"Step {step_count}: Executing action: {action}") + yield f"## Executing action: {action}", checklist, current_cards, current_screenshot, trajectory.copy() + if action.startswith('send_msg_to_user'): + terminated = True + truncated = False + else: + obs, reward, terminated, truncated, info = env.step(action) + trajectory.append((processed_obs['som_screenshot'], [{'action': cand['action'], 'reward': round(reward, 2)} for cand, reward in zip(candidates, total_rewards)])) + processed_obs = agent.obs_preprocessor(obs) + current_screenshot = processed_obs['som_screenshot'].copy() + + while '\n\n' in step_info['thought']: + step_info['thought'] = step_info['thought'].replace('\n\n', '\n') + + # trajectory에 numpy array 직접 저장 + logger.info(f"Step {step_count}: Saved screenshot and updated trajectory") + step_count += 1 + + # yield by each step + yield "## Rollout actions from policy...", checklist, current_cards, current_screenshot, trajectory.copy() + + if terminated or truncated: + logger.info(f"Episode ended: terminated={terminated}, truncated={truncated}") + yield return_state("## Episode ended", current_screenshot) + break + + finally: + logger.info("Finished") + + +def run_agent_worker(instruction, model_name, start_url, use_html, use_axtree, use_screenshot, max_steps, return_queue): + """Worker function that runs the agent in a separate process and puts results in a queue.""" + try: + for result in run_agent(instruction, model_name, start_url, use_html, use_axtree, use_screenshot, max_steps): + return_queue.put(result) + except Exception as e: + logger.error(f"Error in agent worker process: {e}") + return_queue.put(("Error occurred in agent process", [], None, [])) + import traceback + traceback.print_exc() + finally: + # Signal that the process is done + return_queue.put(None) + +def run_agent_wrapper(instruction, model_name="gpt-4o", start_url="about:blank", + use_html=False, use_axtree=True, use_screenshot=False, max_steps=20): + """Wrapper function that runs the agent in a separate process and yields results.""" + return_queue = multiprocessing.Queue() + + # Start the agent in a separate process + p = multiprocessing.Process( + target=run_agent_worker, + args=(instruction, model_name, start_url, use_html, use_axtree, use_screenshot, max_steps, return_queue) + ) + p.daemon = True # 
Ensure process terminates when parent terminates + p.start() + + # Get results from the queue and yield them + while True: + result = return_queue.get() + if result is None: # End signal + break + yield result + + # Clean up + if p.is_alive(): + p.terminate() + p.join() + +def process_run(instruction, model_name, start_url): + # Use the wrapper function instead of directly calling run_agent + trajectory_generator = run_agent_wrapper( + instruction, + model_name, + start_url, + use_html=False, + use_axtree=True, + use_screenshot=False + ) + + all_trajectory = [] + last_checklist_view, last_trajectory_html = None, None + + for state, checklist_view, rm_cards, screenshot, trajectory in trajectory_generator: + if checklist_view is None: + yield state, screenshot, last_checklist_view, None, last_trajectory_html + continue + # Create HTML for reward model cards + rm_cards_html = f""" + +
+ """ + + for idx, card in enumerate(rm_cards): + rm_cards_html += CARD_HTML_TEMPLATE.format( + additional_class='top-candidate' if idx == 0 else '', + k=idx+1, + suffix='(best)' if idx == 0 else '', + thought=card['thought'], + action=card['action'], + reward=card['reward'], + feedback=card['feedback'] + ) + + rm_cards_html += "
" + all_trajectory = trajectory + + # Create HTML for trajectory display + trajectory_html = f""" + +
+ """ + + for idx, (after_img, cands) in enumerate(all_trajectory): + # Convert image to base64 if needed + img = all_trajectory[idx][0] + if isinstance(img, np.ndarray): + img = Image.fromarray(img) + if isinstance(img, Image.Image): + buffer = io.BytesIO() + img.save(buffer, format="JPEG") + img_str = base64.b64encode(buffer.getvalue()).decode() + img_src = f"data:image/jpeg;base64,{img_str}" + else: + img_src = img + + trajectory_html += f""" +
            <div class="step-container">
+                <div class="step-header">Step {idx + 1}</div>
+                <div class="step-content">
+                    <div class="step-image">
+                        <img src="{img_src}" alt="Browser state">
+                    </div>
+                    <div class="step-info">
+                        <div class="box-title">Action Candidates:</div>
+                        <div class="action-candidates">
+ """ + + # Display all candidates for this step + for i, cand in enumerate(cands): + action = cand['action'] + reward = cand['reward'] + + trajectory_html += f""" +
                            <div class="candidate-box{' selected' if i == 0 else ''}">
+                                <div class="box-title">
+                                    Action {i+1}{' (Selected)' if i == 0 else ''}
+                                    <span class="reward-text">Reward: {reward}</span>
+                                </div>
+                                <pre>{action}</pre>
+                            </div>
+ """ + + trajectory_html += """ +
                        </div>
+                    </div>
+                </div>
+            </div>
+ """ + + trajectory_html += "
" + + last_checklist_view, last_trajectory_html = checklist_view, trajectory_html + yield state, screenshot, last_checklist_view, rm_cards_html, last_trajectory_html + yield state, screenshot, last_checklist_view, rm_cards_html, last_trajectory_html diff --git a/requirements.txt b/requirements.txt index 5582dc4fb19a373861662822e075bcd4cf6ff9a0..bb8e47a583bb7b74c8504053dbb375c759cbc3ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,7 @@ gradio openai numpy -gymnasium \ No newline at end of file +gymnasium +langsmith +langchain[openai] +pillow \ No newline at end of file diff --git a/templates/card.html b/templates/card.html new file mode 100644 index 0000000000000000000000000000000000000000..274a06f43b3b615e0af8f310946c603bbc44a0dd --- /dev/null +++ b/templates/card.html @@ -0,0 +1,33 @@ +
<div class="rm-card {additional_class}">
+    <div class="rm-card-header">
+        Top {k} Action Candidate {suffix}
+    </div>
+    <div class="rm-card-body">
+        <div class="card-section">
+            <div class="card-section-title">Thought:</div>
+            <div class="thought-content">
+                <pre>{thought}</pre>
+            </div>
+        </div>
+        <div class="card-section">
+            <div class="card-section-title">Action:</div>
+            <div class="action-content">
+                <pre>{action}</pre>
+            </div>
+        </div>
+        <div class="card-section">
+            <div class="card-section-title">Reward:</div>
+            <div class="reward-content">
+                <pre>{reward}</pre>
+            </div>
+        </div>
+        <div class="card-section">
+            <details>
+                <summary>Feedback (click to view)</summary>
+                <div class="feedback-content">
+                    <pre>{feedback}</pre>
+                </div>
+            </details>
+        </div>
+    </div>
+</div>
\ No newline at end of file diff --git a/templates/rm_cards.css b/templates/rm_cards.css new file mode 100644 index 0000000000000000000000000000000000000000..e09b97dd6a01ae909031d4e476af4120da0b4d0b --- /dev/null +++ b/templates/rm_cards.css @@ -0,0 +1,106 @@ +.rm-cards-container { + display: flex; + gap: 15px; + padding: 10px 0; + overflow-x: auto; +} + +.rm-card { + min-width: 300px; + max-width: 400px; + border: 1px solid #ddd; + border-radius: 8px; + overflow: hidden; + box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); + background: rgba(255, 255, 255, 0.0); +} + +.rm-card.top-candidate { + border: 3px solid #007bff; + box-shadow: 0 4px 8px rgba(0, 0, 150, 0.2); +} + +.rm-card-header { + background: rgba(240, 240, 240, 0.3); + padding: 10px 15px; + font-weight: bold; + border-bottom: 1px solid #ddd; + display: flex; + justify-content: space-between; + align-items: center; +} + +.top-candidate .rm-card-header { + background: rgba(230, 242, 255, 0.3); +} + +.reward-badge { + background: #007bff; + color: white; + padding: 3px 8px; + border-radius: 12px; + font-size: 0.9em; +} + +.rm-card-body { + padding: 15px; + background-color: rgba(240, 240, 240, 0.0); +} + +.card-section { + margin-bottom: 12px; +} + +.card-section-title { + font-weight: bold; + margin-bottom: 5px; + color: #555; +} + +.thought-content { + background: rgba(247, 247, 255, 0.3); + border: 1px solid #d0d0ff; + border-radius: 6px; + padding: 8px; +} + +.action-content { + background: rgba(240, 255, 240, 0.3); + border: 1px solid #d0ffd0; + border-radius: 6px; + padding: 8px; +} + +.feedback-content { + background: rgba(255, 247, 240, 0.3); + border: 1px solid #ffd0a0; + border-radius: 6px; + padding: 8px; +} + +.reward-content { + background: rgba(240, 240, 255, 0.3); + border: 1px solid #d0d0ff; + border-radius: 6px; + padding: 8px; +} + +details { + margin-top: 5px; +} + +summary { + cursor: pointer; + font-weight: bold; + color: #555; +} + +summary:hover { + color: #007bff; +} + +pre { + margin: 0; + white-space: pre-wrap; + word-break: break-word; +} \ No newline at end of file diff --git a/templates/trajectory.css b/templates/trajectory.css new file mode 100644 index 0000000000000000000000000000000000000000..5795985190beaec7a73168113106e8094a16c0cd --- /dev/null +++ b/templates/trajectory.css @@ -0,0 +1,82 @@ +/* templates/trajectory.css */ + +.trajectory-container { + display: flex; + flex-direction: column; + gap: 20px; +} + +.step-container { + border: 1px solid #ddd; + border-radius: 8px; + overflow: hidden; + box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); +} + +.step-header { + background: rgba(240, 240, 240, 0.0); + padding: 10px 15px; + font-weight: bold; + border-bottom: 1px solid #ddd; +} + +.step-content { + display: flex; + gap: 15px; + flex-wrap: wrap; +} + +.step-image { + flex: 1 1 40%; + padding: 10px; +} + +.step-image img { + width: 100%; + border: 1px solid #eee; + border-radius: 4px; +} + +.step-info { + flex: 1 1 55%; + display: flex; + flex-direction: column; + padding: 10px; +} + +.action-candidates { + display: flex; + flex-direction: column; + gap: 10px; +} + +.candidate-box { + background: rgba(245, 240, 255, 0.0); + border: 1px solid #d0d0ff; + border-radius: 6px; + padding: 10px; +} + +.candidate-box.selected { + border: 2px solid #7030a0; + box-shadow: 0 2px 4px rgba(112, 48, 160, 0.1); +} + +.box-title { + font-weight: bold; + margin-bottom: 5px; +} + +.reward-text { + display: inline-block; + color: #555; + font-size: 0.9em; + margin-left: 8px; + font-style: italic; +} + +pre { + margin: 0; + 
white-space: pre-wrap;
+    word-break: break-word;
+}
\ No newline at end of file
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c133195e82b192db4b825a1938ba2a1918a1457c
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,41 @@
+import tokenize
+import io
+import base64
+import numpy as np
+from PIL import Image
+
+def remove_inline_comments_safe(code: str) -> str:
+    result = []
+    tokens = tokenize.generate_tokens(io.StringIO(code).readline)
+
+    last_line = -1
+    current_line = ''
+    for tok_type, tok_string, (srow, scol), (_, _), _ in tokens:
+        if srow != last_line:
+            if current_line:
+                result.append(current_line.rstrip())
+            current_line = ''
+            last_line = srow
+
+        if tok_type == tokenize.COMMENT:
+            # Skip comments (contribute nothing to the output)
+            continue
+
+        current_line += tok_string
+
+    if current_line:
+        result.append(current_line.rstrip())
+
+    return '\n'.join(result)
+
+
+def image_to_jpg_base64_url(image: Image.Image | np.ndarray) -> str:
+    """Return a base64 *JPEG* data-URL from a PIL image or NumPy array."""
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    if image.mode in {"RGBA", "LA"}:
+        image = image.convert("RGB")
+    with io.BytesIO() as buffer:
+        image.save(buffer, format="JPEG")
+    encoded: str = base64.b64encode(buffer.getvalue()).decode()
+    return f"data:image/jpeg;base64,{encoded}"
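
A minimal usage sketch of the two helpers introduced in `utils.py` (illustrative only: the sample action string and the all-black screenshot array are made-up inputs, and the snippet assumes it runs from the repository root with NumPy and Pillow installed, as listed in `requirements.txt`):

```python
import numpy as np

from utils import image_to_jpg_base64_url, remove_inline_comments_safe

# Strip a trailing inline comment from a model-proposed action before executing it.
raw_action = 'click("12")  # press the Submit button'
print(remove_inline_comments_safe(raw_action))  # -> click("12")

# Encode a dummy screenshot (stand-in for obs["som_screenshot"]) as a JPEG data URL
# suitable for an image_url message part in the chat prompt.
screenshot = np.zeros((64, 64, 3), dtype=np.uint8)
data_url = image_to_jpg_base64_url(screenshot)
print(data_url[:30] + "... (truncated)")  # -> data:image/jpeg;base64,...
```

This mirrors how the diff uses them: `browser_agent.py` applies `remove_inline_comments_safe` to each sampled action before the majority-vote selection in `get_top_k_actions`, and `image_to_jpg_base64_url` produces the data URL attached to the screenshot message when `use_screenshot` is enabled.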