keynes42 committed
Commit c347392 · verified · 1 Parent(s): 27171f8

Update custom_tools.py

Files changed (1)
  1. custom_tools.py +42 -1
custom_tools.py CHANGED
@@ -6,6 +6,7 @@ import urllib.parse
 from smolagents import Tool, WebSearchTool, WikipediaSearchTool, PythonInterpreterTool
 from pydantic import BaseModel, Field
 from transformers import pipeline # You'll need: pip install transformers torch accelerate
+from PIL import Image
 
 # ------------------ Simple wrapper tools to save loading time ------------------------
 class CachedWebSearchTool(WebSearchTool):
@@ -33,7 +34,47 @@ class PreloadedPythonTool(PythonInterpreterTool):
         return super().run(preamble + code)
 
 
-# --------------------- Transcribe audio file to text ----------------------------
+# --------------------- Describe image file with text --------------------------- #
+class ImageContentDescriberTool(Tool):
+    name: str = "describe_image_content"
+    description: str = "Downloads an image from a URL and provides a textual description of its main content. It CANNOT solve complex puzzles like chess positions but can identify objects and scenes."
+
+    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
+        "image_url": {
+            "type": "string",
+            "description": "The URL of the image to describe."
+        }
+    }
+    output_type: type = str
+
+    def forward(self, image_url: str) -> str:
+        return describe_image_from_url(image_url)
+
+# Lazy-load the vision model
+image_captioner = None
+def describe_image_from_url(image_url: str) -> str:
+    """Downloads an image from a URL and generates a text description."""
+    global image_captioner
+    if image_captioner is None:
+        try:
+            print("Initializing Image Captioning model for the first time...")
+            # Using a smaller, faster BLIP model.
+            image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+            print("Image Captioning model initialized.")
+        except Exception as e:
+            return f"Error: Could not initialize the image captioning model. Details: {e}"
+
+    try:
+        print(f"Downloading image from {image_url}...")
+        image = Image.open(requests.get(image_url, stream=True, timeout=15).raw)
+        print("Generating image description...")
+        description = image_captioner(image)[0]['generated_text']
+        return f"Image description: {description}"
+    except Exception as e:
+        return f"An error occurred while processing the image file: {e}"
+
+
+# --------------------- Transcribe audio file to text ---------------------------- #
 class TranscribeAudioTool(Tool):
     name: str = "transcribe_audio_from_url"
     description: str = "Downloads an audio file (e.g., .mp3, .wav) from a URL and transcribes its spoken content into text."