Update custom_tools.py
custom_tools.py  +42 −1
@@ -6,6 +6,7 @@ import urllib.parse
 from smolagents import Tool, WebSearchTool, WikipediaSearchTool, PythonInterpreterTool
 from pydantic import BaseModel, Field
 from transformers import pipeline  # You'll need: pip install transformers torch accelerate
+from PIL import Image
 
 # ------------------ Simple wrapper tools to save loading time ------------------------
 class CachedWebSearchTool(WebSearchTool):
@@ -33,7 +34,47 @@ class PreloadedPythonTool(PythonInterpreterTool):
         return super().run(preamble + code)
 
 
-# ---------------------
+# --------------------- Describe image file with text --------------------------- #
+class ImageContentDescriberTool(Tool):
+    name: str = "describe_image_content"
+    description: str = "Downloads an image from a URL and provides a textual description of its main content. It CANNOT solve complex puzzles like chess positions but can identify objects and scenes."
+
+    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
+        "image_url": {
+            "type": "string",
+            "description": "The URL of the image to describe."
+        }
+    }
+    output_type: type = str
+
+    def forward(self, image_url: str) -> str:
+        return describe_image_from_url(image_url)
+
+# Lazy-load the vision model
+image_captioner = None
+def describe_image_from_url(image_url: str) -> str:
+    """Downloads an image from a URL and generates a text description."""
+    global image_captioner
+    if image_captioner is None:
+        try:
+            print("Initializing Image Captioning model for the first time...")
+            # Using a smaller, faster BLIP model.
+            image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+            print("Image Captioning model initialized.")
+        except Exception as e:
+            return f"Error: Could not initialize the image captioning model. Details: {e}"
+
+    try:
+        print(f"Downloading image from {image_url}...")
+        image = Image.open(requests.get(image_url, stream=True, timeout=15).raw)
+        print("Generating image description...")
+        description = image_captioner(image)[0]['generated_text']
+        return f"Image description: {description}"
+    except Exception as e:
+        return f"An error occurred while processing the image file: {e}"
+
+
+# --------------------- Transcribe audio file to text ---------------------------- #
 class TranscribeAudioTool(Tool):
     name: str = "transcribe_audio_from_url"
     description: str = "Downloads an audio file (e.g., .mp3, .wav) from a URL and transcribes its spoken content into text."