Spaces:
Running
Running
lezaf
committed on
Commit
·
abaaf08
1
Parent(s):
99bb959
Fix lists nested in tables parsed twice
Browse files - tools/web_search.py +8 -4
tools/web_search.py
CHANGED
@@ -108,7 +108,7 @@ def _parse_structured_content(soup):
|
|
108 |
elif tag.name == 'p':
|
109 |
content.append({'type': 'paragraph', 'text': tag.get_text(strip=True)})
|
110 |
elif tag.name in ['ul', 'ol']:
|
111 |
-
if tag.find_parent(['ul', 'ol']) is None:
|
112 |
items = _extract_list(tag)
|
113 |
content.append({'type': 'list', 'items': items})
|
114 |
elif tag.name == 'table':
|
@@ -144,6 +144,10 @@ def web_search(query: str) -> str:
|
|
144 |
except Exception as e:
|
145 |
return f"Error fetching URL {url}: {str(e)}"
|
146 |
|
|
|
|
|
|
|
|
|
147 |
# ----- STEP 2: Parse and clean the HTML content
|
148 |
soup = BeautifulSoup(html, "html.parser")
|
149 |
|
@@ -190,8 +194,8 @@ def web_search(query: str) -> str:
|
|
190 |
top_indices = similarities.argsort()[-TOP_K:][::-1]
|
191 |
|
192 |
# output in a file the top chunks
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
|
197 |
return "".join([all_chunks[idx] for idx in top_indices])
|
|
|
108 |
elif tag.name == 'p':
|
109 |
content.append({'type': 'paragraph', 'text': tag.get_text(strip=True)})
|
110 |
elif tag.name in ['ul', 'ol']:
|
111 |
+
if tag.find_parent(['ul', 'ol', 'table']) is None:
|
112 |
items = _extract_list(tag)
|
113 |
content.append({'type': 'list', 'items': items})
|
114 |
elif tag.name == 'table':
|
|
|
144 |
except Exception as e:
|
145 |
return f"Error fetching URL {url}: {str(e)}"
|
146 |
|
147 |
+
# Output the html content to a file for debugging
|
148 |
+
with open(f"test_output/{urls.index(url)}_web_search.txt", "w", encoding="utf-8") as f:
|
149 |
+
f.write(html)
|
150 |
+
|
151 |
# ----- STEP 2: Parse and clean the HTML content
|
152 |
soup = BeautifulSoup(html, "html.parser")
|
153 |
|
|
|
194 |
top_indices = similarities.argsort()[-TOP_K:][::-1]
|
195 |
|
196 |
# output in a file the top chunks
|
197 |
+
with open(f"test_output/top_chunks.txt", "w", encoding="utf-8") as f:
|
198 |
+
for c in all_chunks:
|
199 |
+
f.write(c)
|
200 |
|
201 |
return "".join([all_chunks[idx] for idx in top_indices])
|