import os import requests import numpy as np import pandas as pd from io import StringIO from bs4 import BeautifulSoup from langchain_core.tools import tool from duckduckgo_search import DDGS from tavily import TavilyClient from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity from tools.utils import StructureAwareTextSplitter TOP_K = 5 MAX_RESULTS = 2 UNWANTED_TAGS = ['nav', 'header', 'footer', 'aside', 'form', 'script', 'style', 'button'] TAGS_TO_KEEP = ['h1', 'h2', 'h3', 'p', 'ul', 'ol', 'table', 'span'] def _format_table_to_string(table_html): """ Convert an HTML table to a markdown-style string representation. Args: table_html (str): HTML string of the table. Returns: str: Table formatted as a markdown-style string, or a message if parsing fails. """ try: df = pd.read_html(StringIO(table_html))[0] except: return ["[Table could not be parsed]"] if df.empty: return None table_str = "|" # Put column headers for col in df.columns: table_str += f" {col} |" table_str += "\n" # Put rows for _, row in df.iterrows(): table_str += "|" for col, val in row.items(): table_str += f" {val} |" table_str += "\n" return table_str def _extract_list(tag, level=0): """ Recursively extract nested HTML lists (