Update custom_tools.py
Browse files
Update the get_wikipedia_section tool so it can adapt to more formats.
- custom_tools.py +23 -13
custom_tools.py
CHANGED
@@ -551,23 +551,33 @@ def get_wikipedia_section(page_title: str, section_title: str) -> str:
|
|
551 |
response.raise_for_status()
|
552 |
soup = BeautifulSoup(response.content, 'html.parser')
|
553 |
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
559 |
|
560 |
# Start collecting content from the parent tag of the headline (e.g., an <h2>)
|
561 |
content_tags = []
|
562 |
-
|
563 |
-
current_tag =
|
564 |
-
|
565 |
-
#
|
566 |
-
|
567 |
-
|
568 |
-
|
|
|
|
|
|
|
|
|
569 |
break
|
570 |
-
|
571 |
# Collect the text from relevant tags like paragraphs, lists, etc.
|
572 |
if sibling.name in ['p', 'ul', 'ol', 'dl']:
|
573 |
content_tags.append(sibling.get_text(strip=True))
|
|
|
551 |
response.raise_for_status()
|
552 |
soup = BeautifulSoup(response.content, 'html.parser')
|
553 |
|
554 |
+
## Find the headline span with the exact section title <- too strict; replaced by the looser header-text match below
|
555 |
+
# section_header = soup.find("span", class_="mw-headline", id=section_title.replace(" ", "_"))
|
556 |
+
|
557 |
+
section_header = None
|
558 |
+
# Find all potential header tags (h1, h2, h3, etc.)
|
559 |
+
for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
|
560 |
+
# Check if the header's text contains the section title (case-insensitive)
|
561 |
+
if section_title.lower() in header.get_text(strip=True).lower():
|
562 |
+
section_header = header
|
563 |
+
break # Stop at the first match
|
564 |
+
if not section_header:
|
565 |
+
return f"Error: Section containing '{section_title}' not found on page '{page_title}'. Check capitalization and spelling."
|
566 |
|
567 |
# Start collecting content from the parent tag of the headline (e.g., an <h2>)
|
568 |
content_tags = []
|
569 |
+
## The parent of the section header span is the header tag (e.g., <h2>)
|
570 |
+
# current_tag = section_header.parent
|
571 |
+
## Iterate through the siblings of the header tag
|
572 |
+
# for sibling in current_tag.find_next_siblings():
|
573 |
+
## Stop when we hit the next header of the same level (e.g., another h2)
|
574 |
+
# if sibling.name == current_tag.name:
|
575 |
+
# break
|
576 |
+
for sibling in section_header.find_next_siblings():
|
577 |
+
# Stop when we hit the next header of the same or higher level
|
578 |
+
# e.g., if we matched an <h2>, stop at the next <h2> or <h1>
|
579 |
+
if sibling.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] and int(sibling.name[1]) <= int(section_header.name[1]):
|
580 |
break
|
|
|
581 |
# Collect the text from relevant tags like paragraphs, lists, etc.
|
582 |
if sibling.name in ['p', 'ul', 'ol', 'dl']:
|
583 |
content_tags.append(sibling.get_text(strip=True))
|