datasets:
- mlfoundations-cua-dev/easyr1-103k-4MP-not-all-correct-stage-one-temp-1_1-RL-remove-pixmo-uground-seeclick # List datasets used for training
base_model: Qwen/Qwen3-VL-30B-A3B-Instruct
---

# 🍨 Gelato – From Data Curation to Reinforcement Learning: Building a Strong Grounding Model for Computer-Use Agents

[🍨 **Gelato-30B-A3B (model)**](https://huggingface.co/mlfoundations/Gelato-30B-A3B) | [🖱️ **Click-100k (dataset)**](https://huggingface.co/datasets/mlfoundations/clicks-100k) | [**Training Instructions**](./training_configs) | [**Evaluation**](./evaluation)



We are releasing [**🍨 Gelato-30B-A3B**](https://huggingface.co/mlfoundations/Gelato-30B-A3B), a state-of-the-art grounding model for GUI computer-use tasks! Gelato is trained on our open-sourced [**Click-100k**](https://huggingface.co/datasets/mlfoundations/clicks-100k) dataset and achieves **63.88% accuracy on ScreenSpot-Pro**<sup>[[3](#ref-screenspot-pro)]</sup> and **67.19% / 73.40% on OS-World-G / OS-World-G (Refined)**<sup>[[4](#ref-jedi)]</sup>, surpassing prior specialized computer grounding models like GTA1-32B<sup>[[5](#ref-gta1)]</sup> and much larger VLMs, including Qwen3-VL-235B-A22B-Instruct<sup>[[10](#ref-qwen3vl)]</sup>. When combined with GPT-5, Gelato enables frontier-level agentic performance, placing *TBD* on the [OS-World leaderboard](https://github.com/mlfoundations/grounding-model-os-world) at *TBD* accuracy.

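To pull the released artifacts locally, both the model weights and the Click-100k dataset can be fetched from the Hub. Below is a minimal sketch using `huggingface_hub` and `datasets`; the repository IDs are taken from the links above, and the `train` split name is an assumption:

```python
from huggingface_hub import snapshot_download
from datasets import load_dataset

# Download the Gelato-30B-A3B weights (from_pretrained in the snippet below also does this implicitly)
local_dir = snapshot_download(repo_id="mlfoundations/Gelato-30B-A3B")
print(f"Model files downloaded to: {local_dir}")

# Load the Click-100k grounding data; the split name "train" is assumed here
clicks = load_dataset("mlfoundations/clicks-100k", split="train")
print(clicks)
```
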
# Performance

Gelato-30B-A3B outperforms GTA1-32B, the SoTA specialized computer-grounding model, as well as larger VLMs on the ScreenSpot-Pro and OS-World-G grounding benchmarks. When paired with GPT-5, Gelato as a computer-use agent attains a *TBD* success rate on OS-World, placing it *TBD* on the leaderboard.

| **Model** | **Total Size** | **Activated Size** | **Open Source** | **ScreenSpot-V2** | **ScreenSpot-Pro** | **OSWorld-G** |
|-----------|:--------------:|:------------------:|:---------------:|:-----------------:|:------------------:|:-------------:|
| Qwen3-VL-30B-A3B-Instruct | 30 B | 3.3 B | ✅ | – | – | – |
| Qwen3-VL-235B-A22B-Instruct | 235 B | 22 B | ✅ | – | – | – |
| OpenCUA-72B | 72 B | – | ✅ | – | – | – |
| GTA1-32B | 32 B | – | ✅ | – | – | – |
| Gelato-30B-A3B | 30 B | 3.3 B | ✅ | – | – | – |

> **Note:**
> - We evaluate grounding on the ScreenSpot-V2, ScreenSpot-Pro, and OS-World-G benchmarks.
> - ↑ indicates the performance improvement of our model compared to its baseline.

# Inference

Below is a code snippet demonstrating how to ground using our model. Given an image and an instruction, the model outputs normalized coordinates in the range [0, 1000]; a prediction of (500, 500) therefore refers to the center of the image regardless of its resolution, and is mapped back to pixels by scaling with width/1000 and height/1000.

```python
from transformers import Qwen3VLMoeForConditionalGeneration, AutoProcessor
import re

from PIL import Image, ImageDraw
import requests
from io import BytesIO


def extract_coordinates(raw_string):
    """Parse the first "(x, y)" pair from the model output; fall back to (0, 0)."""
    try:
        match = re.search(r"\((-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\)", raw_string)
        return float(match.group(1)), float(match.group(2))
    except:
        return 0, 0


def visualize_prediction(img, pred_x, pred_y, img_width, img_height):
    """
    Visualize the predicted coordinates on the image.

    Args:
        img: PIL.Image.Image
        pred_x: float
        pred_y: float
        img_width: int
        img_height: int
    """
    # Map the normalized [0, 1000] prediction to absolute pixel coordinates
    pred_x = int((pred_x * img_width) / 1000)
    pred_y = int((pred_y * img_height) / 1000)

    draw = ImageDraw.Draw(img)

    # Draw a circle with a small crosshair at the predicted point
    r = 20
    draw.ellipse((pred_x - r, pred_y - r, pred_x + r, pred_y + r), outline="green", width=2)
    cross_len = 6
    draw.line((pred_x - cross_len, pred_y, pred_x + cross_len, pred_y), fill="green", width=2)
    draw.line((pred_x, pred_y - cross_len, pred_x, pred_y + cross_len), fill="green", width=2)

    img.save("predicted_coordinates.png")
    print(f"Predicted coordinates: ({pred_x}, {pred_y})")


# Load the model and processor
MODEL_PATH = "mlfoundations-cua-dev/Gelato-30B-A3B"

model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype="auto",   # keep the checkpoint's native dtype
    device_map="auto",    # place/shard across available devices
)

processor = AutoProcessor.from_pretrained(
    MODEL_PATH,
    max_pixels=10**7,  # 10MP
)

# Fetch an example screenshot
url = "https://github.com/QwenLM/Qwen3-VL/raw/main/cookbooks/assets/computer_use/computer_use1.jpeg"
response = requests.get(url)
print(response.status_code)
print(response.headers.get("Content-Type"))
img = Image.open(BytesIO(response.content))
img_width, img_height = img.size

# Prepare messages
PROMPT = '''
You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.

Output the coordinate pair exactly:
(x,y)
'''
PROMPT = PROMPT.strip()

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": PROMPT},
            {
                "type": "image",
                "image": img,
            },
            {"type": "text", "text": "Reload the cache."},
        ],
    }
]

device = next(model.parameters()).device
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# Extract the coordinates from the output text
print(f"Model output: {output_text[0]}")
pred_x, pred_y = extract_coordinates(output_text[0])

# Convert the normalized prediction to absolute pixels and visualize it
visualize_prediction(img, pred_x, pred_y, img_width, img_height)
```
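
To act on a prediction in a computer-use agent loop, the normalized output has to be mapped onto the live screen before clicking. Below is a minimal sketch, assuming the screenshot sent to the model was a full-screen capture at the display's native resolution and that the third-party `pyautogui` package is available (neither is part of the snippet above):

```python
import pyautogui

# Normalized (x, y) prediction in [0, 1000], e.g. the values returned by
# extract_coordinates() in the snippet above (placeholder values here)
pred_x, pred_y = 512.0, 334.0

# Map the [0, 1000] range onto the live screen resolution
screen_width, screen_height = pyautogui.size()
click_x = int(pred_x * screen_width / 1000)
click_y = int(pred_y * screen_height / 1000)

# Move the cursor to the grounded element and click it
pyautogui.moveTo(click_x, click_y, duration=0.2)
pyautogui.click()
```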