lezaf committed on
Commit
abaaf08
·
1 Parent(s): 99bb959

Fix lists nested in tables being parsed twice

Browse files
Files changed (1) hide show
  1. tools/web_search.py +8 -4
tools/web_search.py CHANGED
@@ -108,7 +108,7 @@ def _parse_structured_content(soup):
108
  elif tag.name == 'p':
109
  content.append({'type': 'paragraph', 'text': tag.get_text(strip=True)})
110
  elif tag.name in ['ul', 'ol']:
111
- if tag.find_parent(['ul', 'ol']) is None:
112
  items = _extract_list(tag)
113
  content.append({'type': 'list', 'items': items})
114
  elif tag.name == 'table':
@@ -144,6 +144,10 @@ def web_search(query: str) -> str:
144
  except Exception as e:
145
  return f"Error fetching URL {url}: {str(e)}"
146
 
 
 
 
 
147
  # ----- STEP 2: Parse and clean the HTML content
148
  soup = BeautifulSoup(html, "html.parser")
149
 
@@ -190,8 +194,8 @@ def web_search(query: str) -> str:
190
  top_indices = similarities.argsort()[-TOP_K:][::-1]
191
 
192
  # output in a file the top chunks
193
- # with open("test_output/top_chunks.txt", "w", encoding="utf-8") as f:
194
- # for c in all_chunks:
195
- # f.write(c)
196
 
197
  return "".join([all_chunks[idx] for idx in top_indices])
 
108
  elif tag.name == 'p':
109
  content.append({'type': 'paragraph', 'text': tag.get_text(strip=True)})
110
  elif tag.name in ['ul', 'ol']:
111
+ if tag.find_parent(['ul', 'ol', 'table']) is None:
112
  items = _extract_list(tag)
113
  content.append({'type': 'list', 'items': items})
114
  elif tag.name == 'table':
 
144
  except Exception as e:
145
  return f"Error fetching URL {url}: {str(e)}"
146
 
147
+ # Output the html content to a file for debugging
148
+ with open(f"test_output/{urls.index(url)}_web_search.txt", "w", encoding="utf-8") as f:
149
+ f.write(html)
150
+
151
  # ----- STEP 2: Parse and clean the HTML content
152
  soup = BeautifulSoup(html, "html.parser")
153
 
 
194
  top_indices = similarities.argsort()[-TOP_K:][::-1]
195
 
196
  # output in a file the top chunks
197
+ with open(f"test_output/top_chunks.txt", "w", encoding="utf-8") as f:
198
+ for c in all_chunks:
199
+ f.write(c)
200
 
201
  return "".join([all_chunks[idx] for idx in top_indices])