super_agent

Running

lezaf committed 25 days ago

Commit

abaaf08

1 Parent(s): 99bb959

Fix lists nested in tables parsed twice

Files changed (1) hide show

tools/web_search.py CHANGED Viewed

@@ -108,7 +108,7 @@ def _parse_structured_content(soup):
         elif tag.name == 'p':
             content.append({'type': 'paragraph', 'text': tag.get_text(strip=True)})
         elif tag.name in ['ul', 'ol']:
-            if tag.find_parent(['ul', 'ol']) is None:
                 items = _extract_list(tag)
                 content.append({'type': 'list', 'items': items})
         elif tag.name == 'table':
@@ -144,6 +144,10 @@ def web_search(query: str) -> str:
         except Exception as e:
             return f"Error fetching URL {url}: {str(e)}"
         # ----- STEP 2: Parse and clean the HTML content
         soup = BeautifulSoup(html, "html.parser")
@@ -190,8 +194,8 @@ def web_search(query: str) -> str:
     top_indices = similarities.argsort()[-TOP_K:][::-1]
     # output in a file the top chunks
-    # with open("test_output/top_chunks.txt", "w", encoding="utf-8") as f:
-    #     for c in all_chunks:
-    #         f.write(c)
     return "".join([all_chunks[idx] for idx in top_indices])

         elif tag.name == 'p':
             content.append({'type': 'paragraph', 'text': tag.get_text(strip=True)})
         elif tag.name in ['ul', 'ol']:
+            if tag.find_parent(['ul', 'ol', 'table']) is None:
                 items = _extract_list(tag)
                 content.append({'type': 'list', 'items': items})
         elif tag.name == 'table':
         except Exception as e:
             return f"Error fetching URL {url}: {str(e)}"
+        # Output the html content to a file for debugging
+        with open(f"test_output/{urls.index(url)}_web_search.txt", "w", encoding="utf-8") as f:
+            f.write(html)
         # ----- STEP 2: Parse and clean the HTML content
         soup = BeautifulSoup(html, "html.parser")
     top_indices = similarities.argsort()[-TOP_K:][::-1]
     # output in a file the top chunks
+    with open(f"test_output/top_chunks.txt", "w", encoding="utf-8") as f:
+        for c in all_chunks:
+            f.write(c)
     return "".join([all_chunks[idx] for idx in top_indices])