lezaf committed · Commit 44aa671 · 1 Parent(s): abaaf08
Optimize html elements extraction
Files changed: tools/web_search.py (+23 -8)
tools/web_search.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 import requests
 import numpy as np
 import pandas as pd
@@ -5,14 +6,15 @@ from io import StringIO
 from bs4 import BeautifulSoup
 from langchain_core.tools import tool
 from duckduckgo_search import DDGS
+from tavily import TavilyClient
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from tools.utils import StructureAwareTextSplitter
 
 TOP_K = 5
 MAX_RESULTS = 2
-UNWANTED_TAGS = ['nav', 'header', 'footer', 'aside', 'form', 'script', 'style']
-TAGS_TO_KEEP = ['h1', 'h2', 'h3', 'p', 'ul', 'ol', 'table']
+UNWANTED_TAGS = ['nav', 'header', 'footer', 'aside', 'form', 'script', 'style', 'button']
+TAGS_TO_KEEP = ['h1', 'h2', 'h3', 'p', 'ul', 'ol', 'table', 'span']
 
 
 def _format_table_to_string(table_html):
@@ -76,10 +78,11 @@ def _extract_list(tag, level=0):
     item_prefix = lambda idx: "-"
 
     for idx, li in enumerate(tag.find_all("li", recursive=False)):
-        # Get
-        text = li.
-
-
+        # Get all text inside the li, flattening tags (including spans)
+        text = li.get_text(" ", strip=True)
+        # Remove text from nested lists (if any)
+        for nested in li.find_all(["ul", "ol"], recursive=False):
+            nested.extract()
         nested = li.find(["ul", "ol"], recursive=False)
         if nested:
             nested_items = _extract_list(nested, level+1)
@@ -113,6 +116,10 @@ def _parse_structured_content(soup):
         content.append({'type': 'list', 'items': items})
     elif tag.name == 'table':
         content.append({'type': 'table', 'html': str(tag)})
+    elif tag.name == 'span':
+        # Only include spans that are not empty and with not parent element
+        if (tag.find_parent(['ul', 'ol', 'table', 'p']) is None) and tag.get_text(strip=True):
+            content.append({'type': 'span', 'text': tag.get_text(strip=True)})
 
     return content
 
@@ -131,10 +138,18 @@ def web_search(query: str) -> str:
         chunks (str): Concatenated string of most relevant chunks.
     """
 
+    USE_DDGS = os.getenv("USE_DDGS").lower() == "true"
     # ----- STEP 1: Find the most relevant webpages
-
+    if USE_DDGS:
+        results = DDGS(timeout=30).text(query, max_results=MAX_RESULTS)
+        urls = [r['href'] for r in results if 'href' in r]
+    else:
+        TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
+
+        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
+        response = tavily_client.search(query, max_results=MAX_RESULTS)
 
-
+        urls = [r['url'] for r in response['results'] if 'url' in r]
 
     all_chunks = []
     for url in urls:
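
For context on the list-extraction change above, a minimal sketch (not part of the commit) of what the new flattening logic does with BeautifulSoup; the sample HTML and variable names are made up for illustration:

from bs4 import BeautifulSoup

html = """
<ul>
  <li>Top <span>level</span> item
    <ul><li>Nested item</li></ul>
  </li>
</ul>
"""
li = BeautifulSoup(html, "html.parser").find("li")

# get_text() flattens inline tags such as <span> into one string,
# but it would also pull in the nested list's text...
print(li.get_text(" ", strip=True))   # -> "Top level item Nested item"

# ...so the new code first detaches nested <ul>/<ol> subtrees with extract(),
# keeping the item's own text free of its children's text.
for nested in li.find_all(["ul", "ol"], recursive=False):
    nested.extract()
print(li.get_text(" ", strip=True))   # -> "Top level item"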
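A hedged usage sketch of the new search-backend toggle (again not part of the commit): it assumes the module is importable as tools.web_search, that the langchain @tool wrapper is called via .invoke(), and that the query string and API key below are placeholders.

import os

from tools.web_search import web_search

# Route STEP 1 through DuckDuckGo (DDGS)...
os.environ["USE_DDGS"] = "true"
print(web_search.invoke({"query": "largest moon of Saturn"}))

# ...or through Tavily, which needs a valid API key.
os.environ["USE_DDGS"] = "false"
os.environ["TAVILY_API_KEY"] = "tvly-..."  # placeholder, not a real key
print(web_search.invoke({"query": "largest moon of Saturn"}))

Note that the diff reads the flag with os.getenv("USE_DDGS").lower(), so the variable has to be set to some value before web_search is called; the sketch sets it explicitly for that reason.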