keynes42 committed
Commit c347392 · verified · 1 Parent(s): 27171f8

Update custom_tools.py

Files changed (1)
  1. custom_tools.py +42 -1
custom_tools.py CHANGED
@@ -6,6 +6,7 @@ import urllib.parse
 from smolagents import Tool, WebSearchTool, WikipediaSearchTool, PythonInterpreterTool
 from pydantic import BaseModel, Field
 from transformers import pipeline # You'll need: pip install transformers torch accelerate
+from PIL import Image
 
 # ------------------ Simple wrapper tools to save loading time ------------------------
 class CachedWebSearchTool(WebSearchTool):
@@ -33,7 +34,47 @@ class PreloadedPythonTool(PythonInterpreterTool):
         return super().run(preamble + code)
 
 
-# --------------------- Transcribe audio file to text ----------------------------
+# --------------------- Describe image file with text --------------------------- #
+class ImageContentDescriberTool(Tool):
+    name: str = "describe_image_content"
+    description: str = "Downloads an image from a URL and provides a textual description of its main content. It CANNOT solve complex puzzles like chess positions but can identify objects and scenes."
+
+    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
+        "image_url": {
+            "type": "string",
+            "description": "The URL of the image to describe."
+        }
+    }
+    output_type: type = str
+
+    def forward(self, image_url: str) -> str:
+        return describe_image_from_url(image_url)
+
+# Lazy-load the vision model
+image_captioner = None
+def describe_image_from_url(image_url: str) -> str:
+    """Downloads an image from a URL and generates a text description."""
+    global image_captioner
+    if image_captioner is None:
+        try:
+            print("Initializing Image Captioning model for the first time...")
+            # Using a smaller, faster BLIP model.
+            image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+            print("Image Captioning model initialized.")
+        except Exception as e:
+            return f"Error: Could not initialize the image captioning model. Details: {e}"
+
+    try:
+        print(f"Downloading image from {image_url}...")
+        image = Image.open(requests.get(image_url, stream=True, timeout=15).raw)
+        print("Generating image description...")
+        description = image_captioner(image)[0]['generated_text']
+        return f"Image description: {description}"
+    except Exception as e:
+        return f"An error occurred while processing the image file: {e}"
+
+
+# --------------------- Transcribe audio file to text ---------------------------- #
 class TranscribeAudioTool(Tool):
     name: str = "transcribe_audio_from_url"
     description: str = "Downloads an audio file (e.g., .mp3, .wav) from a URL and transcribes its spoken content into text."