import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Any, Union
from functools import lru_cache
from smolagents import Tool, WebSearchTool, WikipediaSearchTool, PythonInterpreterTool
from pydantic import BaseModel, Field

class CachedWebSearchTool(WebSearchTool):
    @lru_cache(maxsize=128)
    def run(self, query: str):
        # identical queries return instantly
        return super().run(query)

class CachedWikiTool(WikipediaSearchTool):
    @lru_cache(maxsize=128)
    def run(self, page: str):
        return super().run(page)
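
# Example wiring (illustrative sketch only, not exercised in this module): cached
# tools are passed to a smolagents agent like any other Tool instance. The exact
# model class to use depends on your smolagents version, so it is left abstract here.
#
# from smolagents import CodeAgent
# agent = CodeAgent(
#     tools=[CachedWebSearchTool(), CachedWikiTool()],
#     model=...,  # e.g. an InferenceClientModel / HfApiModel instance
# )
# agent.run("When was Python 3.0 released?")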

class PreloadedPythonTool(PythonInterpreterTool):
    """
    A PythonInterpreterTool that automatically prepends the necessary imports
    (bs4, BeautifulSoup, regex) so you never hit NameError inside your code blocks.
    """
    def run(self, code: str) -> str:
        preamble = (
            "import bs4\n"
            "from bs4 import BeautifulSoup\n"
            "import regex\n"
        )
        return super().run(preamble + code)
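
# Usage sketch (assumes the underlying PythonInterpreterTool is configured to
# authorize the bs4/regex imports, e.g. via its authorized_imports setting;
# otherwise the prepended preamble itself would be rejected by the interpreter):
#
# py_tool = PreloadedPythonTool()
# py_tool.run("soup = BeautifulSoup('<p>hi</p>', 'html.parser'); print(soup.p.text)")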



# --------------------- Webpage structure analyzer -------------------------------
class WebpageStructureAnalyzerTool(Tool):
    """
    A tool to fetch a webpage and analyze its basic HTML structure.
    It helps in understanding the page layout before attempting detailed parsing.
    """
    name: str = "analyze_webpage_structure"
    description: str = (
        "Fetches a webpage and returns a summary of its HTML structure "
        "(title, headings H1/H2/H3, tables found and their headers/first row, "
        "and counts of lists and forms). Use this tool *first* to understand "
        "a webpage's layout *before* trying to write specific 'bs4' code "
        "to extract detailed information."
    )
    # smolagents expects `inputs` as Dict[argument_name, spec], where each spec
    # provides a JSON-schema style "type" (e.g. "string") and a "description".
    inputs: Dict[str, Dict[str, Union[str, type, bool]]] = {  # Explicit type hint for clarity
        "url": {                                              # Argument name
            "type": "string",                                 # JSON-schema type name, not a Python type
            "description": "The URL of the webpage to analyze."
        }
    }
    output_type: str = "string"

    def forward(self, url: str) -> str:
        """
        Executes the webpage structure analysis.

        Args:
            url: The URL of the webpage to analyze.

        Returns:
            A string containing the structure summary or an error message.
        """
        # Ensure the core function is accessible here
        return analyze_webpage_structure(url)

def analyze_webpage_structure(url: str) -> str:
    """
    Fetches a webpage and returns a text summary of its key HTML structure.

    Args:
        url: The URL of the webpage to analyze.

    Returns:
        A string containing a summary of the HTML structure, or an error message.
    """
    summary_lines: List[str] = []
    
    # Define a User-Agent to mimic a browser, reducing chances of being blocked
    headers: Dict[str, str] = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Fetch the webpage content
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        summary_lines.append(f"--- Structure Summary for: {url} ---")

        # 1. Title
        title = soup.title.string.strip() if soup.title and soup.title.string else "N/A"
        summary_lines.append(f"\n[Title]: {title}")

        # 2. Meta Description
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc['content'] if meta_desc and meta_desc.has_attr('content') else "N/A"
        summary_lines.append(f"[Meta Description]: {description.strip()}")

        # 3. Headings (H1-H4)
        summary_lines.append("\n[Headings]:")
        for i in range(1, 5):
            headings = soup.find_all(f'h{i}')
            summary_lines.append(f"  - H{i} Tags Found: {len(headings)}")
            # Show the first 5 headings for brevity
            for h in headings[:5]:
                summary_lines.append(f"    - {h.get_text(strip=True)[:100]}") # Limit length

        # 4. Links
        links = soup.find_all('a')
        summary_lines.append(f"\n[Links]:")
        summary_lines.append(f"  - Total Links Found: {len(links)}")
        # Show the first 5 links
        for link in links[:5]:
            href = link.get('href', 'N/A')
            text = link.get_text(strip=True)[:80] # Limit length
            summary_lines.append(f"  - [{text}] -> {href}")

        # 5. Images
        images = soup.find_all('img')
        summary_lines.append(f"\n[Images]:")
        summary_lines.append(f"  - Total Images Found: {len(images)}")
        # Show the first 5 image alts/srcs
        for img in images[:5]:
            alt = img.get('alt', 'No alt text')[:80] # Limit length
            src = img.get('src', 'N/A')
            summary_lines.append(f"  - [Alt: {alt}] -> {src}")
            
        # 6. Tables
        tables = soup.find_all('table')
        summary_lines.append(f"\n[Tables]:")
        summary_lines.append(f"  - Total Tables Found: {len(tables)}")
        for i, table in enumerate(tables[:3]): # Show info for first 3 tables
            header_texts = [th.get_text(strip=True) for th in table.find_all('th', limit=10)]
            rows = table.find_all('tr')
            if header_texts:
                summary_lines.append(f"  - Table {i+1} (Rows: {len(rows)}): Headers = {header_texts}")
            else:
                summary_lines.append(f"  - Table {i+1} (Rows: {len(rows)}): No <th> headers found.")

        # 7. Lists
        ul_lists = soup.find_all('ul')
        ol_lists = soup.find_all('ol')
        summary_lines.append(f"\n[Lists]:")
        summary_lines.append(f"  - Unordered Lists (ul) Found: {len(ul_lists)}")
        summary_lines.append(f"  - Ordered Lists (ol) Found: {len(ol_lists)}")

        # 8. Forms
        forms = soup.find_all('form')
        summary_lines.append(f"\n[Forms]:")
        summary_lines.append(f"  - Total Forms Found: {len(forms)}")
        for i, form in enumerate(forms[:3]): # Show info for first 3 forms
            action = form.get('action', 'N/A')
            inputs = form.find_all('input')
            input_names = [inp.get('name', 'No name') for inp in inputs if inp.get('type') != 'hidden']
            summary_lines.append(f"  - Form {i+1} (Action: {action}): Inputs = {input_names[:5]}")

        summary_lines.append("\n------------------------------------")
        
        return "\n".join(summary_lines)

    except requests.exceptions.HTTPError as http_err:
        return f"HTTP Error fetching webpage {url}: {http_err}"
    except requests.exceptions.ConnectionError as conn_err:
        return f"Connection Error fetching webpage {url}: {conn_err}"
    except requests.exceptions.Timeout as timeout_err:
        return f"Timeout Error fetching webpage {url}: {timeout_err}"
    except requests.exceptions.RequestException as req_err:
        return f"Error fetching webpage {url}: {req_err}"
    except Exception as e:
        return f"An error occurred while analyzing {url}: {e}"



# --------------- Summarize webpage content ------------------------#
class SummarizeWebpageContentTool(Tool):
    name: str = "summarize_webpage_content"
    description: str = (
        "Fetches a webpage and returns a concise summary of its main textual content. "
        "Use this instead of 'visit_webpage' when you need an overview of the text, not its full structure or HTML."
    )
    
    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage to summarize."
        }
    }
    
    output_type: str = "string"

    def forward(self, url: str) -> str:
        return summarize_webpage_content(url)

def summarize_webpage_content(url: str, max_length: int = 1500) -> str:
    """
    Fetches the main textual content of a webpage and returns a concise summary.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Strip out script, style, nav, and footer tags to get cleaner text
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()

        # Extract text from the main body, focusing on paragraphs
        main_content = soup.find('main') or soup.find('article') or soup.find('body')
        if main_content is None:
            return "Error: Could not locate the main content of the webpage."
        text_chunks = [p.get_text(strip=True) for p in main_content.find_all('p')]
        full_text = " ".join(text_chunks)
        
        if not full_text:
            return "Error: Could not extract meaningful text content from the webpage."

        # Return a truncated version as a simple summary
        summary = full_text[:max_length]
        if len(full_text) > max_length:
            # Try to cut at a word boundary
            last_space = summary.rfind(' ')
            if last_space != -1:
                summary = summary[:last_space]
            summary += "..."

        return f"Summary of content from {url}:\n{summary}"

    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {e}"
    except Exception as e:
        return f"An error occurred while summarizing {url}: {e}"



# --------------- Extract table from webpage ------------------------#
class ExtractTableFromWebpageTool(Tool):
    name: str = "extract_table_from_webpage"
    description: str = (
        "Extracts a specific table from a webpage and returns it in a clean Markdown format. "
        "Use the 'table_identifier' to specify which table you want."
    )
    
    inputs: Dict[str, Dict[str, Union[str, Any]]] = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage containing the table."
        },
        "table_identifier": {
            "type": "string",
            "description": "The index (e.g., '0' for the first table, '1' for the second) or a text keyword from the table's caption to identify which table to extract. Defaults to '0'."
        }
    }
    
    output_type: str = "string" # Should match the return type of the helper function

    def forward(self, url: str, table_identifier: str = "0") -> str:
        return extract_table_from_webpage(url, table_identifier)

def extract_table_from_webpage(url: str, table_identifier: str = "0") -> str:
    """
    Fetches a webpage, finds a specific table, and returns it in Markdown format.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        tables = soup.find_all('table')
        if not tables:
            return "Error: No tables found on the webpage."

        target_table = None
        if table_identifier.isdigit():
            table_index = int(table_identifier)
            if table_index < len(tables):
                target_table = tables[table_index]
            else:
                return f"Error: Table index {table_index} is out of bounds. Only {len(tables)} tables found."
        else:
            for table in tables:
                caption = table.find('caption')
                if caption and table_identifier.lower() in caption.get_text().lower():
                    target_table = table
                    break
        
        if not target_table:
            return f"Error: Could not find a table matching the identifier '{table_identifier}'."

        # Convert table to Markdown format.
        # Take header cells only from the first row so that row-header <th> cells
        # in the body (common on Wikipedia tables) do not pollute the column headers.
        markdown_table = ""
        first_row = target_table.find('tr')
        header_cells = [th.get_text(strip=True) for th in first_row.find_all('th')] if first_row else []
        if header_cells:
            markdown_table += "| " + " | ".join(header_cells) + " |\n"
            markdown_table += "| " + " | ".join(["---"] * len(header_cells)) + " |\n"

        for row in target_table.find_all('tr'):
            cells = [td.get_text(strip=True).replace('\n', ' ') for td in row.find_all('td')]
            if cells:
                markdown_table += "| " + " | ".join(cells) + " |\n"

        return markdown_table if markdown_table else "Error: Found the table but could not parse its content."

    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {e}"
    except Exception as e:
        return f"An error occurred while extracting the table from {url}: {e}"

        
# --- Example Usage ---
if __name__ == "__main__":
    test_url_wiki = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    test_url_news = "https://www.bbc.com/news"
    test_url_fail = "https://thissitedoesnotexist12345.com"

    print("--- Analyzing Wikipedia ---")
    summary_wiki = analyze_webpage_structure(test_url_wiki)
    print(summary_wiki)

    print("\n--- Analyzing BBC News ---")
    summary_news = analyze_webpage_structure(test_url_news)
    print(summary_news)
    
    print("\n--- Analyzing Failing URL ---")
    summary_fail = analyze_webpage_structure(test_url_fail)
    print(summary_fail)
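
    # Quick checks for the other two helpers defined above, reusing the same
    # Wikipedia test URL (requires network access; output depends on the live page).
    print("\n--- Summarizing Wikipedia article ---")
    print(summarize_webpage_content(test_url_wiki))

    print("\n--- Extracting first table from Wikipedia article ---")
    print(extract_table_from_webpage(test_url_wiki, "0"))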

    # --- Optional: Testing other tools (requires smolagents & potentially setup) ---
    # print("\n" + "="*30)
    # print("   Testing Cached Wiki Tool")
    # print("="*30)
    # 
    # try:
    #     wiki_tool_instance = CachedWikiTool()
    #     wiki_result = wiki_tool_instance.run("Artificial intelligence")
    #     print(f"Wikipedia Result (first 200 chars): {wiki_result[:200]}...")
    #     
    #     # Test caching (this *should* be instant)
    #     print("\n--- Testing Wiki Cache ---")
    #     wiki_result_cached = wiki_tool_instance.run("Artificial intelligence")
    #     print(f"Cached Result (first 200 chars): {wiki_result_cached[:200]}...")
    #     
    # except Exception as e:
    #     print(f"Could not test CachedWikiTool, likely due to missing dependencies or setup: {e}")