keynes42 committed on
Commit
5cfd9d9
·
verified ·
1 Parent(s): c669617

Update custom_tools.py

Browse files

Update the get_wikipedia_section tool so it can match section headings in more formats.

Files changed (1) hide show
  1. custom_tools.py +23 -13
custom_tools.py CHANGED
@@ -551,23 +551,33 @@ def get_wikipedia_section(page_title: str, section_title: str) -> str:
551
  response.raise_for_status()
552
  soup = BeautifulSoup(response.content, 'html.parser')
553
 
554
- # Find the headline span with the exact section title
555
- headline = soup.find("span", class_="mw-headline", id=section_title.replace(" ", "_"))
556
-
557
- if not headline:
558
- return f"Error: Section '{section_title}' not found on page '{page_title}'. Check capitalization and spelling."
 
 
 
 
 
 
 
559
 
560
  # Start collecting content from the parent tag of the headline (e.g., an <h2>)
561
  content_tags = []
562
- # The parent of the headline span is the header tag (e.g., <h2>)
563
- current_tag = headline.parent
564
-
565
- # Iterate through the siblings of the header tag
566
- for sibling in current_tag.find_next_siblings():
567
- # Stop when we hit the next header of the same level (e.g., another h2)
568
- if sibling.name == current_tag.name:
 
 
 
 
569
  break
570
-
571
  # Collect the text from relevant tags like paragraphs, lists, etc.
572
  if sibling.name in ['p', 'ul', 'ol', 'dl']:
573
  content_tags.append(sibling.get_text(strip=True))
 
551
  response.raise_for_status()
552
  soup = BeautifulSoup(response.content, 'html.parser')
553
 
554
+ ## Find the headline span with the exact section title <- Too strict, change to a looser logic
555
+ # section_header = soup.find("span", class_="mw-headline", id=section_title.replace(" ", "_"))
556
+
557
+ section_header = None
558
+ # Find all potential header tags (h1, h2, h3, etc.)
559
+ for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
560
+ # Check if the header's text contains the section title (case-insensitive)
561
+ if section_title.lower() in header.get_text(strip=True).lower():
562
+ section_header = header
563
+ break # Stop at the first match
564
+ if not section_header:
565
+ return f"Error: Section containing '{section_title}' not found on page '{page_title}'. Check capitalization and spelling."
566
 
567
  # Start collecting content from the parent tag of the headline (e.g., an <h2>)
568
  content_tags = []
569
+ ## The parent of the section header span is the header tag (e.g., <h2>)
570
+ # current_tag = section_header.parent
571
+ ## Iterate through the siblings of the header tag
572
+ # for sibling in current_tag.find_next_siblings():
573
+ ## Stop when we hit the next header of the same level (e.g., another h2)
574
+ # if sibling.name == current_tag.name:
575
+ # break
576
+ for sibling in section_header.find_next_siblings():
577
+ # Stop when we hit the next header of the same or higher level
578
+ # e.g., if we matched an <h2>, stop at the next <h2> or <h1>
579
+ if sibling.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] and int(sibling.name[1]) <= int(section_header.name[1]):
580
  break
 
581
  # Collect the text from relevant tags like paragraphs, lists, etc.
582
  if sibling.name in ['p', 'ul', 'ol', 'dl']:
583
  content_tags.append(sibling.get_text(strip=True))