enhance VLM and fix bugs with step extraction
- app.py +1 -1
- mini_agents.py +1 -1
- vlm_tools.py +69 -9
app.py
CHANGED
@@ -53,7 +53,7 @@ class BasicAgent:
         fixed_answer = self.agent.run(question)
 
         # Log steps
-        all_steps = self.agent.master_agent.memory.
+        all_steps = self.agent.master_agent.memory.steps
         for step in all_steps:
             if isinstance(step, ActionStep):
                 step_class = "ActionStep"
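For context, the fixed line pulls the agent's full step history out of its memory. A minimal sketch of the same logging pattern, assuming the smolagents API (`ActionStep` and `memory.steps` are the names used in the diff; `log_action_steps` and the `ActionStep` fields read here are illustrative assumptions, not code from this repo):

    from smolagents import ActionStep

    def log_action_steps(agent):
        # memory.steps holds the agent's full step history
        # (TaskStep, PlanningStep, and ActionStep entries).
        for step in agent.memory.steps:
            if isinstance(step, ActionStep):
                step_class = "ActionStep"
                # Assumed ActionStep fields: step_number, observations.
                print(f"{step_class} {step.step_number}: {step.observations}")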
mini_agents.py
CHANGED
@@ -49,7 +49,7 @@ AUTHORIZED_IMPORTS = [
     # Data processing
     "numpy", "pandas", "sklearn", "scipy", "math", "hmmlearn",
     # File handling
-    "base64", "io", "json", "os", "pickle",
+    "base64", "io", "json", "os", "pickle", "openpyxl", "pyxlsb",
     # Visualization
     "pyplot", "matplotlib", "matplotlib.pyplot",
     # Utilities
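The two new entries matter because pandas hands Excel parsing off to an engine package, so authorizing them lets sandboxed code read both workbook formats. A minimal sketch using standard pandas calls (the file paths are placeholders):

    import pandas as pd

    # .xlsx workbooks are parsed by the openpyxl engine.
    xlsx_df = pd.read_excel("report.xlsx", engine="openpyxl")

    # .xlsb (binary) workbooks need the pyxlsb engine.
    xlsb_df = pd.read_excel("report.xlsb", engine="pyxlsb")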
vlm_tools.py
CHANGED
@@ -304,12 +304,40 @@ class ObjectDetectionTool(Tool):
     name = "object_detection"
     description = """
     Detect objects in a list of images.
-
-
-
+
+    Input Requirements:
+    - Input must be a list of images, where each image is a base64-encoded string
+    - Each base64 string must be properly padded (length must be a multiple of 4)
+    - Images will be resized to 416x416 pixels during processing
+    - Images should be in RGB or BGR format (3 channels)
+    - Supported image formats: JPG, PNG
+
+    Processing:
+    - Images are automatically resized to 416x416
+    - Images are normalized to [0,1] range
+    - Model expects input shape: [1, 3, 416, 416] (batch, channels, height, width)
+
+    Output:
+    - Returns a list of detected objects for each image
+    - Each detection includes: (label, confidence, bounding_box)
+    - Bounding boxes are in format: [x, y, width, height]
+    - Confidence threshold: 0.5
+    - NMS threshold: 0.4
+
+    Example input format:
+    ["base64_encoded_image1", "base64_encoded_image2"]
+
+    Example output format:
+    [
+        [("person", 0.95, [100, 200, 50, 100]), ("car", 0.88, [300, 400, 80, 60])], # detections for image1
+        [("dog", 0.92, [150, 250, 40, 80])] # detections for image2
+    ]
     """
     inputs = {
-        "images": {
+        "images": {
+            "type": "any",
+            "description": "List of base64-encoded images. Each image must be a valid base64 string with proper padding (length multiple of 4). Images will be resized to 416x416."
+        }
     }
     output_type = "any"
 
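Taken together, the new description pins down a concrete preprocessing contract: pad the base64 string, decode, resize to 416x416, scale to [0,1], and reorder to [1, 3, 416, 416]. A minimal sketch of that contract, assuming OpenCV and NumPy (`preprocess_for_detection` is an illustrative name, not code from this repo):

    import base64

    import cv2
    import numpy as np

    def preprocess_for_detection(b64_image: str) -> np.ndarray:
        # Pad the base64 string to a multiple of 4 before decoding.
        b64_image += "=" * (-len(b64_image) % 4)
        raw = np.frombuffer(base64.b64decode(b64_image), dtype=np.uint8)
        img = cv2.imdecode(raw, cv2.IMREAD_COLOR)   # BGR, 3 channels
        img = cv2.resize(img, (416, 416))           # resize to 416x416
        img = img.astype(np.float32) / 255.0        # normalize to [0, 1]
        return np.transpose(img, (2, 0, 1))[None]   # [1, 3, 416, 416]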
@@ -390,14 +418,46 @@
 
 class OCRTool(Tool):
     description = """
-    Scan an image for text.
-
-
-
+    Scan an image for text using OCR (Optical Character Recognition).
+
+    Input Requirements:
+    - Input must be a list of images, where each image is a base64-encoded string
+    - Each base64 string must be properly padded (length must be a multiple of 4)
+    - Images should be in RGB or BGR format (3 channels)
+    - Supported image formats: JPG, PNG
+    - For best results:
+        * Text should be clear and well-lit
+        * Image should have good contrast
+        * Text should be properly oriented
+        * Avoid blurry or distorted images
+
+    Processing:
+    - Uses Tesseract OCR engine
+    - Automatically handles text orientation
+    - Supports multiple languages (default: English)
+    - Processes each image independently
+
+    Output:
+    - Returns a list of text strings, one for each input image
+    - Empty string is returned if no text is detected
+    - Text is returned in the order it appears in the image
+    - Line breaks are preserved in the output
+
+    Example input format:
+    ["base64_encoded_image1", "base64_encoded_image2"]
+
+    Example output format:
+    [
+        "This is text from image 1\nSecond line of text", # text from image1
+        "Text from image 2" # text from image2
+    ]
     """
     name = "ocr_scan"
     inputs = {
-        "images": {
+        "images": {
+            "type": "any",
+            "description": "List of base64-encoded images. Each image must be a valid base64 string with proper padding (length multiple of 4). Images should be clear and well-lit for best OCR results."
+        }
    }
     output_type = "any"
 
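The OCR contract above maps directly onto the standard Tesseract bindings. A minimal sketch, assuming pytesseract and Pillow are available (`ocr_images` is an illustrative helper, not the tool's actual implementation):

    import base64
    import io

    import pytesseract
    from PIL import Image

    def ocr_images(b64_images: list[str]) -> list[str]:
        texts = []
        for b64 in b64_images:
            b64 += "=" * (-len(b64) % 4)  # enforce base64 padding
            img = Image.open(io.BytesIO(base64.b64decode(b64)))
            # image_to_string returns "" when no text is detected,
            # with line breaks preserved in the output.
            texts.append(pytesseract.image_to_string(img))
        return texts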