lezaf committed
Commit 44aa671 · 1 Parent(s): abaaf08

Optimize html elements extraction

Files changed (1)
  1. tools/web_search.py +23 -8
tools/web_search.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import requests
 import numpy as np
 import pandas as pd
@@ -5,14 +6,15 @@ from io import StringIO
 from bs4 import BeautifulSoup
 from langchain_core.tools import tool
 from duckduckgo_search import DDGS
+from tavily import TavilyClient
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from tools.utils import StructureAwareTextSplitter

 TOP_K = 5
 MAX_RESULTS = 2
-UNWANTED_TAGS = ['nav', 'header', 'footer', 'aside', 'form', 'script', 'style']
-TAGS_TO_KEEP = ['h1', 'h2', 'h3', 'p', 'ul', 'ol', 'table']
+UNWANTED_TAGS = ['nav', 'header', 'footer', 'aside', 'form', 'script', 'style', 'button']
+TAGS_TO_KEEP = ['h1', 'h2', 'h3', 'p', 'ul', 'ol', 'table', 'span']


 def _format_table_to_string(table_html):
@@ -76,10 +78,11 @@ def _extract_list(tag, level=0):
     item_prefix = lambda idx: "-"

     for idx, li in enumerate(tag.find_all("li", recursive=False)):
-        # Get the text before any nested list
-        text = li.find(text=True, recursive=False)
-        text = text.strip() if text else ""
-        # Check for nested lists
+        # Get all text inside the li, flattening tags (including spans)
+        text = li.get_text(" ", strip=True)
+        # Remove text from nested lists (if any)
+        for nested in li.find_all(["ul", "ol"], recursive=False):
+            nested.extract()
         nested = li.find(["ul", "ol"], recursive=False)
         if nested:
             nested_items = _extract_list(nested, level+1)
@@ -113,6 +116,10 @@ def _parse_structured_content(soup):
             content.append({'type': 'list', 'items': items})
         elif tag.name == 'table':
             content.append({'type': 'table', 'html': str(tag)})
+        elif tag.name == 'span':
+            # Only include spans that are not empty and with not parent element
+            if (tag.find_parent(['ul', 'ol', 'table', 'p']) is None) and tag.get_text(strip=True):
+                content.append({'type': 'span', 'text': tag.get_text(strip=True)})

     return content
@@ -131,10 +138,18 @@ def web_search(query: str) -> str:
         chunks (str): Concatenated string of most relevant chunks.
     """

+    USE_DDGS = os.getenv("USE_DDGS").lower() == "true"
     # ----- STEP 1: Find the most relevant webpages
-    results = DDGS(timeout=30).text(query, max_results=MAX_RESULTS)
-
-    urls = [r['href'] for r in results if 'href' in r]
+    if USE_DDGS:
+        results = DDGS(timeout=30).text(query, max_results=MAX_RESULTS)
+        urls = [r['href'] for r in results if 'href' in r]
+    else:
+        TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
+
+        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
+        response = tavily_client.search(query, max_results=MAX_RESULTS)
+
+        urls = [r['url'] for r in response['results'] if 'url' in r]

     all_chunks = []
     for url in urls:
 
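Two constants change in this commit: 'button' joins UNWANTED_TAGS so interactive controls are stripped along with the rest of the page chrome, and 'span' joins TAGS_TO_KEEP so loose text that some sites wrap in bare spans can be captured. A minimal sketch of how stripping UNWANTED_TAGS typically looks with BeautifulSoup (the strip_unwanted helper is our illustration, not a function in this file):

    from bs4 import BeautifulSoup

    UNWANTED_TAGS = ['nav', 'header', 'footer', 'aside', 'form', 'script', 'style', 'button']

    def strip_unwanted(html):
        # Hypothetical helper: remove chrome elements and their subtrees.
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all(UNWANTED_TAGS):
            tag.decompose()
        return soup

    html = "<nav>menu</nav><p>Body text</p><button>Share</button>"
    print(strip_unwanted(html).get_text(strip=True))  # -> Body text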
 
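The list extractor now reads li.get_text(" ", strip=True), which flattens every inline child (spans included) instead of taking only the first direct text node as li.find(text=True, recursive=False) did, and it detaches nested <ul>/<ol> subtrees first so their text is not duplicated in the parent item. One thing worth flagging: extract() removes those subtrees from the li, so the unchanged nested = li.find(["ul", "ol"], recursive=False) that follows will return None and the recursion branch can no longer fire. A sketch that keeps recursion alive by capturing the nested lists before detaching them (the helper below is our illustration of the idea, not the file's code):

    from bs4 import BeautifulSoup

    def extract_list_items(tag, level=0):
        # Hypothetical variant: capture nested lists *before* extract() detaches them.
        items = []
        for li in tag.find_all("li", recursive=False):
            nested_lists = li.find_all(["ul", "ol"], recursive=False)
            for nested in nested_lists:
                nested.extract()            # keep nested text out of this item
            items.append(("  " * level) + "- " + li.get_text(" ", strip=True))
            for nested in nested_lists:     # recurse on the detached subtrees
                items.extend(extract_list_items(nested, level + 1))
        return items

    soup = BeautifulSoup("<ul><li>a<ul><li>b</li></ul></li></ul>", "html.parser")
    print(extract_list_items(soup.ul))  # ['- a', '  - b']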
 
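_parse_structured_content gains a span branch: a span becomes its own content block only when it has text and no ancestor among ul/ol/table/p, since text inside those elements is already collected by the other branches. The filter in isolation (the sample HTML is ours):

    from bs4 import BeautifulSoup

    html = "<p><span>inside a paragraph</span></p><span>standalone text</span><span></span>"
    soup = BeautifulSoup(html, "html.parser")

    for tag in soup.find_all("span"):
        # Same condition as the commit: skip empty spans and spans whose
        # text is already covered by an enclosing p/list/table block.
        if tag.find_parent(['ul', 'ol', 'table', 'p']) is None and tag.get_text(strip=True):
            print(tag.get_text(strip=True))  # -> standalone text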
 
 
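Finally, web_search now picks its search backend from the environment: DDGS when USE_DDGS is "true", otherwise Tavily via TAVILY_API_KEY. As committed, os.getenv("USE_DDGS") returns None when the variable is unset, so .lower() raises AttributeError; a defensive variant of the same STEP 1 would supply a default (the wrapper name and the "false" default are our assumptions):

    import os
    from duckduckgo_search import DDGS
    from tavily import TavilyClient

    MAX_RESULTS = 2

    def find_urls(query):
        # Hypothetical wrapper around STEP 1 with a safe default for USE_DDGS.
        use_ddgs = os.getenv("USE_DDGS", "false").lower() == "true"
        if use_ddgs:
            results = DDGS(timeout=30).text(query, max_results=MAX_RESULTS)
            return [r['href'] for r in results if 'href' in r]
        client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
        response = client.search(query, max_results=MAX_RESULTS)
        return [r['url'] for r in response['results'] if 'url' in r]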