lezaf committed
Commit 44aa671 · 1 Parent(s): abaaf08

Optimize html elements extraction

Files changed (1)
  1. tools/web_search.py +23 -8
tools/web_search.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import requests
 import numpy as np
 import pandas as pd
@@ -5,14 +6,15 @@ from io import StringIO
 from bs4 import BeautifulSoup
 from langchain_core.tools import tool
 from duckduckgo_search import DDGS
+from tavily import TavilyClient
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from tools.utils import StructureAwareTextSplitter

 TOP_K = 5
 MAX_RESULTS = 2
-UNWANTED_TAGS = ['nav', 'header', 'footer', 'aside', 'form', 'script', 'style']
-TAGS_TO_KEEP = ['h1', 'h2', 'h3', 'p', 'ul', 'ol', 'table']
+UNWANTED_TAGS = ['nav', 'header', 'footer', 'aside', 'form', 'script', 'style', 'button']
+TAGS_TO_KEEP = ['h1', 'h2', 'h3', 'p', 'ul', 'ol', 'table', 'span']


 def _format_table_to_string(table_html):
@@ -76,10 +78,11 @@ def _extract_list(tag, level=0):
     item_prefix = lambda idx: "-"

     for idx, li in enumerate(tag.find_all("li", recursive=False)):
-        # Get the text before any nested list
-        text = li.find(text=True, recursive=False)
-        text = text.strip() if text else ""
-        # Check for nested lists
+        # Get all text inside the li, flattening tags (including spans)
+        text = li.get_text(" ", strip=True)
+        # Remove text from nested lists (if any)
+        for nested in li.find_all(["ul", "ol"], recursive=False):
+            nested.extract()
         nested = li.find(["ul", "ol"], recursive=False)
         if nested:
             nested_items = _extract_list(nested, level+1)
@@ -113,6 +116,10 @@ def _parse_structured_content(soup):
             content.append({'type': 'list', 'items': items})
         elif tag.name == 'table':
             content.append({'type': 'table', 'html': str(tag)})
+        elif tag.name == 'span':
+            # Only include spans that are not empty and with not parent element
+            if (tag.find_parent(['ul', 'ol', 'table', 'p']) is None) and tag.get_text(strip=True):
+                content.append({'type': 'span', 'text': tag.get_text(strip=True)})

     return content
@@ -131,10 +138,18 @@ def web_search(query: str) -> str:
         chunks (str): Concatenated string of most relevant chunks.
     """

+    USE_DDGS = os.getenv("USE_DDGS").lower() == "true"
     # ----- STEP 1: Find the most relevant webpages
-    results = DDGS(timeout=30).text(query, max_results=MAX_RESULTS)
-
-    urls = [r['href'] for r in results if 'href' in r]
+    if USE_DDGS:
+        results = DDGS(timeout=30).text(query, max_results=MAX_RESULTS)
+        urls = [r['href'] for r in results if 'href' in r]
+    else:
+        TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
+
+        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
+        response = tavily_client.search(query, max_results=MAX_RESULTS)
+
+        urls = [r['url'] for r in response['results'] if 'url' in r]

     all_chunks = []
     for url in urls:
 
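Two constants change in this commit: 'button' joins UNWANTED_TAGS so interactive controls are stripped along with the rest of the page chrome, and 'span' joins TAGS_TO_KEEP so loose text that some sites wrap in bare spans can be captured. A minimal sketch of how stripping UNWANTED_TAGS typically looks with BeautifulSoup (the strip_unwanted helper is our illustration, not a function in this file):

    from bs4 import BeautifulSoup

    UNWANTED_TAGS = ['nav', 'header', 'footer', 'aside', 'form', 'script', 'style', 'button']

    def strip_unwanted(html):
        # Hypothetical helper: remove chrome elements and their subtrees.
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all(UNWANTED_TAGS):
            tag.decompose()
        return soup

    html = "<nav>menu</nav><p>Body text</p><button>Share</button>"
    print(strip_unwanted(html).get_text(strip=True))  # -> Body text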
 
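The list extractor now reads li.get_text(" ", strip=True), which flattens every inline child (spans included) instead of taking only the first direct text node as li.find(text=True, recursive=False) did, and it detaches nested <ul>/<ol> subtrees first so their text is not duplicated in the parent item. One thing worth flagging: extract() removes those subtrees from the li, so the unchanged nested = li.find(["ul", "ol"], recursive=False) that follows will return None and the recursion branch can no longer fire. A sketch that keeps recursion alive by capturing the nested lists before detaching them (the helper below is our illustration of the idea, not the file's code):

    from bs4 import BeautifulSoup

    def extract_list_items(tag, level=0):
        # Hypothetical variant: capture nested lists *before* extract() detaches them.
        items = []
        for li in tag.find_all("li", recursive=False):
            nested_lists = li.find_all(["ul", "ol"], recursive=False)
            for nested in nested_lists:
                nested.extract()            # keep nested text out of this item
            items.append(("  " * level) + "- " + li.get_text(" ", strip=True))
            for nested in nested_lists:     # recurse on the detached subtrees
                items.extend(extract_list_items(nested, level + 1))
        return items

    soup = BeautifulSoup("<ul><li>a<ul><li>b</li></ul></li></ul>", "html.parser")
    print(extract_list_items(soup.ul))  # ['- a', '  - b']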
 
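_parse_structured_content gains a span branch: a span becomes its own content block only when it has text and no ancestor among ul/ol/table/p, since text inside those elements is already collected by the other branches. The filter in isolation (the sample HTML is ours):

    from bs4 import BeautifulSoup

    html = "<p><span>inside a paragraph</span></p><span>standalone text</span><span></span>"
    soup = BeautifulSoup(html, "html.parser")

    for tag in soup.find_all("span"):
        # Same condition as the commit: skip empty spans and spans whose
        # text is already covered by an enclosing p/list/table block.
        if tag.find_parent(['ul', 'ol', 'table', 'p']) is None and tag.get_text(strip=True):
            print(tag.get_text(strip=True))  # -> standalone text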
 
 
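Finally, web_search now picks its search backend from the environment: DDGS when USE_DDGS is "true", otherwise Tavily via TAVILY_API_KEY. As committed, os.getenv("USE_DDGS") returns None when the variable is unset, so .lower() raises AttributeError; a defensive variant of the same STEP 1 would supply a default (the wrapper name and the "false" default are our assumptions):

    import os
    from duckduckgo_search import DDGS
    from tavily import TavilyClient

    MAX_RESULTS = 2

    def find_urls(query):
        # Hypothetical wrapper around STEP 1 with a safe default for USE_DDGS.
        use_ddgs = os.getenv("USE_DDGS", "false").lower() == "true"
        if use_ddgs:
            results = DDGS(timeout=30).text(query, max_results=MAX_RESULTS)
            return [r['href'] for r in results if 'href' in r]
        client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
        response = client.search(query, max_results=MAX_RESULTS)
        return [r['url'] for r in response['results'] if 'url' in r]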