ai_agent_course_final_project

Running

App Files Files Community

keynes42 committed 14 days ago

Commit

5cfd9d9

verified ·

1 Parent(s): c669617

Update custom_tools.py

Browse files

Update the get_wikipedia_section tool so it can adapt to more page formats.

Files changed (1) hide show

custom_tools.py +23 -13

custom_tools.py CHANGED Viewed

@@ -551,23 +551,33 @@ def get_wikipedia_section(page_title: str, section_title: str) -> str:
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
-        # Find the headline span with the exact section title
-        headline = soup.find("span", class_="mw-headline", id=section_title.replace(" ", "_"))
-        if not headline:
-            return f"Error: Section '{section_title}' not found on page '{page_title}'. Check capitalization and spelling."
         # Start collecting content from the parent tag of the headline (e.g., an <h2>)
         content_tags = []
-        # The parent of the headline span is the header tag (e.g., <h2>)
-        current_tag = headline.parent
-        # Iterate through the siblings of the header tag
-        for sibling in current_tag.find_next_siblings():
-            # Stop when we hit the next header of the same level (e.g., another h2)
-            if sibling.name == current_tag.name:
                 break
             # Collect the text from relevant tags like paragraphs, lists, etc.
             if sibling.name in ['p', 'ul', 'ol', 'dl']:
                 content_tags.append(sibling.get_text(strip=True))

         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
+        ## Find the headline span with the exact section title <- Too strict, change to a looser logic
+        # section_header = soup.find("span", class_="mw-headline", id=section_title.replace(" ", "_"))
+        section_header = None
+        # Find all potential header tags (h1, h2, h3, etc.)
+        for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
+            # Check if the header's text contains the section title (case-insensitive)
+            if section_title.lower() in header.get_text(strip=True).lower():
+                section_header = header
+                break # Stop at the first match
+        if not section_header:
+            return f"Error: Section containing '{section_title}' not found on page '{page_title}'. Check capitalization and spelling."
         # Start collecting content from the parent tag of the headline (e.g., an <h2>)
         content_tags = []
+        ## The parent of the section header span is the header tag (e.g., <h2>)
+        # current_tag = section_header.parent
+        ## Iterate through the siblings of the header tag
+        # for sibling in current_tag.find_next_siblings():
+            ## Stop when we hit the next header of the same level (e.g., another h2)
+        #    if sibling.name == current_tag.name:
+        #        break
+        for sibling in section_header.find_next_siblings():
+            # Stop when we hit the next header of the same or higher level
+            # e.g., if we matched an <h2>, stop at the next <h2> or <h1>
+            if sibling.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] and int(sibling.name[1]) <= int(section_header.name[1]):
                 break
             # Collect the text from relevant tags like paragraphs, lists, etc.
             if sibling.name in ['p', 'ul', 'ol', 'dl']:
                 content_tags.append(sibling.get_text(strip=True))