enhance VLM and fix bugs with step extraction
- app.py +1 -1
- mini_agents.py +1 -1
- vlm_tools.py +69 -9
app.py
CHANGED
@@ -53,7 +53,7 @@ class BasicAgent:
         fixed_answer = self.agent.run(question)
 
         # Log steps
-        all_steps = self.agent.master_agent.memory.
+        all_steps = self.agent.master_agent.memory.steps
         for step in all_steps:
             if isinstance(step, ActionStep):
                 step_class = "ActionStep"
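For context, the fixed line pulls the agent's full step history out of its memory. A minimal sketch of the same logging pattern, assuming the smolagents API (`ActionStep` and `memory.steps` are the names used in the diff; `log_action_steps` and the `ActionStep` fields read here are illustrative assumptions, not code from this repo):

    from smolagents import ActionStep

    def log_action_steps(agent):
        # memory.steps holds the agent's full step history
        # (TaskStep, PlanningStep, and ActionStep entries).
        for step in agent.memory.steps:
            if isinstance(step, ActionStep):
                step_class = "ActionStep"
                # Assumed ActionStep fields: step_number, observations.
                print(f"{step_class} {step.step_number}: {step.observations}")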
mini_agents.py
CHANGED
@@ -49,7 +49,7 @@ AUTHORIZED_IMPORTS = [
     # Data processing
     "numpy", "pandas", "sklearn", "scipy", "math", "hmmlearn",
     # File handling
-    "base64", "io", "json", "os", "pickle",
+    "base64", "io", "json", "os", "pickle", "openpyxl", "pyxlsb",
     # Visualization
     "pyplot", "matplotlib", "matplotlib.pyplot",
     # Utilities
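The two new entries matter because pandas hands Excel parsing off to an engine package, so authorizing them lets sandboxed code read both workbook formats. A minimal sketch using standard pandas calls (the file paths are placeholders):

    import pandas as pd

    # .xlsx workbooks are parsed by the openpyxl engine.
    xlsx_df = pd.read_excel("report.xlsx", engine="openpyxl")

    # .xlsb (binary) workbooks need the pyxlsb engine.
    xlsb_df = pd.read_excel("report.xlsb", engine="pyxlsb")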
vlm_tools.py
CHANGED
@@ -304,12 +304,40 @@ class ObjectDetectionTool(Tool):
     name = "object_detection"
     description = """
     Detect objects in a list of images.
-
-
-
+
+    Input Requirements:
+    - Input must be a list of images, where each image is a base64-encoded string
+    - Each base64 string must be properly padded (length must be a multiple of 4)
+    - Images will be resized to 416x416 pixels during processing
+    - Images should be in RGB or BGR format (3 channels)
+    - Supported image formats: JPG, PNG
+
+    Processing:
+    - Images are automatically resized to 416x416
+    - Images are normalized to [0,1] range
+    - Model expects input shape: [1, 3, 416, 416] (batch, channels, height, width)
+
+    Output:
+    - Returns a list of detected objects for each image
+    - Each detection includes: (label, confidence, bounding_box)
+    - Bounding boxes are in format: [x, y, width, height]
+    - Confidence threshold: 0.5
+    - NMS threshold: 0.4
+
+    Example input format:
+    ["base64_encoded_image1", "base64_encoded_image2"]
+
+    Example output format:
+    [
+        [("person", 0.95, [100, 200, 50, 100]), ("car", 0.88, [300, 400, 80, 60])], # detections for image1
+        [("dog", 0.92, [150, 250, 40, 80])] # detections for image2
+    ]
     """
     inputs = {
-        "images": {
+        "images": {
+            "type": "any",
+            "description": "List of base64-encoded images. Each image must be a valid base64 string with proper padding (length multiple of 4). Images will be resized to 416x416."
+        }
     }
     output_type = "any"
 
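Taken together, the new description pins down a concrete preprocessing contract: pad the base64 string, decode, resize to 416x416, scale to [0,1], and reorder to [1, 3, 416, 416]. A minimal sketch of that contract, assuming OpenCV and NumPy (`preprocess_for_detection` is an illustrative name, not code from this repo):

    import base64

    import cv2
    import numpy as np

    def preprocess_for_detection(b64_image: str) -> np.ndarray:
        # Pad the base64 string to a multiple of 4 before decoding.
        b64_image += "=" * (-len(b64_image) % 4)
        raw = np.frombuffer(base64.b64decode(b64_image), dtype=np.uint8)
        img = cv2.imdecode(raw, cv2.IMREAD_COLOR)   # BGR, 3 channels
        img = cv2.resize(img, (416, 416))           # resize to 416x416
        img = img.astype(np.float32) / 255.0        # normalize to [0, 1]
        return np.transpose(img, (2, 0, 1))[None]   # [1, 3, 416, 416]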
@@ -390,14 +418,46 @@
 
 class OCRTool(Tool):
     description = """
-    Scan an image for text.
-
-
-
+    Scan an image for text using OCR (Optical Character Recognition).
+
+    Input Requirements:
+    - Input must be a list of images, where each image is a base64-encoded string
+    - Each base64 string must be properly padded (length must be a multiple of 4)
+    - Images should be in RGB or BGR format (3 channels)
+    - Supported image formats: JPG, PNG
+    - For best results:
+        * Text should be clear and well-lit
+        * Image should have good contrast
+        * Text should be properly oriented
+        * Avoid blurry or distorted images
+
+    Processing:
+    - Uses Tesseract OCR engine
+    - Automatically handles text orientation
+    - Supports multiple languages (default: English)
+    - Processes each image independently
+
+    Output:
+    - Returns a list of text strings, one for each input image
+    - Empty string is returned if no text is detected
+    - Text is returned in the order it appears in the image
+    - Line breaks are preserved in the output
+
+    Example input format:
+    ["base64_encoded_image1", "base64_encoded_image2"]
+
+    Example output format:
+    [
+        "This is text from image 1\nSecond line of text", # text from image1
+        "Text from image 2" # text from image2
+    ]
     """
     name = "ocr_scan"
     inputs = {
-        "images": {
+        "images": {
+            "type": "any",
+            "description": "List of base64-encoded images. Each image must be a valid base64 string with proper padding (length multiple of 4). Images should be clear and well-lit for best OCR results."
+        }
    }
     output_type = "any"
 
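The OCR contract above maps directly onto the standard Tesseract bindings. A minimal sketch, assuming pytesseract and Pillow are available (`ocr_images` is an illustrative helper, not the tool's actual implementation):

    import base64
    import io

    import pytesseract
    from PIL import Image

    def ocr_images(b64_images: list[str]) -> list[str]:
        texts = []
        for b64 in b64_images:
            b64 += "=" * (-len(b64) % 4)  # enforce base64 padding
            img = Image.open(io.BytesIO(base64.b64decode(b64)))
            # image_to_string returns "" when no text is detected,
            # with line breaks preserved in the output.
            texts.append(pytesseract.image_to_string(img))
        return texts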