Spaces:

Agents-MCP-Hackathon
/

MCP_Server_Web2JSON

Running

App Files Files Community

abdo-Mansour

OmarKouta21 committed on Jun 10

Commit

0a66039

verified ·

1 Parent(s): 785cf5f

RAG Extractor (#3)

Browse files

- RAG Extractor Done (913ad2248468bc3e4b0dadba88304fc599f63d66)

Co-authored-by: Omar Kouta <OmarKouta21@users.noreply.huggingface.co>

Files changed (3) hide show

app.py +3 -3
requirements.txt +6 -1
web2json/ai_extractor.py +156 -1

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import pandas as pd
 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor, GeminiLLMClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
@@ -172,13 +172,13 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str,
     # Initialize pipeline components
     # TODO: improve the RAG system and optimize (don't instantiate every time)
-    preprocessor = BasicPreprocessor(config={'keep_tags': False})
     try:
         llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}
-    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
     postprocessor = PostProcessor()
     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
+from web2json.ai_extractor import AIExtractor, RAGExtractor, GeminiLLMClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
     # Initialize pipeline components
     # TODO: improve the RAG system and optimize (don't instantiate every time)
+    preprocessor = BasicPreprocessor(config={'keep_tags': True})
     try:
         llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}
+    ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
     postprocessor = PostProcessor()
     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

requirements.txt CHANGED Viewed

@@ -1,8 +1,13 @@
 pandas
 gradio
 pydantic
 python-dotenv
 beautifulsoup4
 requests
 google-genai
-json_repair

 pandas
 gradio
+gradio[mcp]
 pydantic
 python-dotenv
 beautifulsoup4
 requests
 google-genai
+json_repair
+numpy
+langchain
+langchain-text-splitters
+sentence-transformers

web2json/ai_extractor.py CHANGED Viewed

@@ -3,6 +3,11 @@ from abc import ABC, abstractmethod
 from google import genai
 from google.genai import types
 from pydantic import BaseModel
 class LLMClient(ABC):
     """
@@ -125,4 +130,154 @@ class AIExtractor:
 # TODO: RAGExtractor class
 class RAGExtractor(AIExtractor):
-    pass

 from google import genai
 from google.genai import types
 from pydantic import BaseModel
+import numpy as np
+from typing import List, Any, Dict, Optional, Tuple
+import time
+from langchain_text_splitters import HTMLHeaderTextSplitter
+from sentence_transformers import SentenceTransformer
 class LLMClient(ABC):
     """
 # TODO: RAGExtractor class
 class RAGExtractor(AIExtractor):
+    """
+    RAG-enhanced extractor that narrows the input down to its most relevant chunks
+    before asking the LLM for structured output.
+    The HTML content is split on h1/h2 headers, every chunk is embedded with a
+    SentenceTransformer model, and the ``top_k`` chunks ranked by cosine similarity
+    to the query are joined and substituted into the prompt template.
+    """
+    def __init__(self,
+                 llm_client: LLMClient,
+                 prompt_template: str,
+                 embedding_model_path: str = "sentence-transformers/all-mpnet-base-v2",
+                 top_k: int = 3):
+        """
+        Initialize the RAG extractor with its embedding model and retrieval depth.
+        Args:
+            llm_client: LLM client used for generation (forwarded to AIExtractor).
+            prompt_template: Prompt template (forwarded to AIExtractor); ``extract``
+                formats it with the ``content`` and ``schema`` fields.
+            embedding_model_path: Path/name handed to SentenceTransformer.
+            top_k: Number of most similar chunks to retrieve.
+        """
+        super().__init__(llm_client, prompt_template)
+        self.embedding_model_path = embedding_model_path
+        # The embedding model is constructed once per extractor and reused by embed_text.
+        self.embedding_model_instance = SentenceTransformer(self.embedding_model_path)
+        self.top_k = top_k
+    @staticmethod
+    def _langchain_HHTS(text: str) -> List[str]:
+        """
+        Chunk HTML text with LangChain's HTMLHeaderTextSplitter on h1 and h2 headers.
+        Args:
+            text (str): The HTML content to chunk.
+        Returns:
+            List[str]: The ``page_content`` of every Document produced by the splitter.
+        """
+        # Only h1 and h2 are split on; h3 was deliberately left out of the header list.
+        headers_to_split_on = [
+            ("h1", "Header 1"),
+            ("h2", "Header 2"),
+        ]
+        html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+        return [doc.page_content for doc in html_splitter.split_text(text)]
+    def embed_text(self, text: str) -> Optional[np.ndarray]:
+        """
+        Generate an embedding for text using the initialized SentenceTransformer model.
+        Embedding is best-effort: a failure is reported on stdout and signalled with
+        ``None`` instead of being raised, so callers must check the result.
+        Args:
+            text: The text string to embed.
+        Returns:
+            Optional[np.ndarray]: The embedding returned by the model's ``encode``,
+            or ``None`` when encoding raised an exception.
+        """
+        try:
+            return self.embedding_model_instance.encode(text)
+        except Exception as e:
+            # The previous message claimed a random embedding was substituted, but
+            # None is what is actually returned; say so to avoid misleading logs.
+            print(f"Warning: Embedding failed for text: '{text[:50]}...', returning None: {e}")
+            return None
+    def search_similar_chunks(self,
+                              query: str,
+                              chunks: List[str],
+                              embeddings: np.ndarray) -> List[str]:
+        """
+        Find the chunks most similar to the query by cosine similarity of embeddings.
+        Args:
+            query (str): The query text; it is embedded with ``embed_text``.
+            chunks (List[str]): The text chunks to search within.
+            embeddings (np.ndarray): Precomputed embeddings, one per entry of ``chunks``
+                and in the same order.
+        Returns:
+            List[str]: Up to ``top_k`` chunks ordered from most to least similar. If the
+            query cannot be embedded, the first ``top_k`` chunks in their original
+            order are returned instead.
+        """
+        query_embedding = self.embed_text(query)
+        if query_embedding is None:
+            # Without a query vector no ranking is possible; degrade to document
+            # order rather than crashing on a None dereference.
+            print("Warning: Query embedding failed; falling back to the first chunks in document order.")
+            return list(chunks[:self.top_k])
+        query_embedding = query_embedding.flatten()
+        # The query norm does not depend on the chunk, so compute it once.
+        norm_query = np.linalg.norm(query_embedding)
+        similarities = []
+        for i, chunk_embedding in enumerate(embeddings):
+            if chunk_embedding is None:
+                # A chunk whose embedding failed ranks as "not similar".
+                similarities.append((0.0, i))
+                continue
+            chunk_embedding = chunk_embedding.flatten()
+            norm_chunk = np.linalg.norm(chunk_embedding)
+            if norm_query == 0 or norm_chunk == 0:
+                # Cosine similarity is undefined for a zero vector; treat it as 0.
+                similarity = 0.0
+            else:
+                similarity = float(np.dot(query_embedding, chunk_embedding) / (norm_query * norm_chunk))
+            similarities.append((similarity, i))
+        # list.sort is stable, so equally similar chunks keep their document order.
+        similarities.sort(key=lambda x: x[0], reverse=True)
+        top_indices = [idx for _, idx in similarities[:self.top_k]]
+        return [chunks[i] for i in top_indices]
+    def extract(self, content: str, schema: BaseModel, query: Optional[str] = None) -> str:
+        """
+        Override the base AIExtractor's method to implement RAG-enhanced extraction.
+        The HTML content is chunked, the chunks most similar to the query are selected
+        via embedding similarity, and only those chunks are placed in the prompt sent
+        to the LLM. When chunking yields nothing, or no chunk can be embedded, the
+        entire original content is sent instead.
+        Args:
+            content (str): The raw HTML content from which to extract information.
+            schema (BaseModel): A Pydantic model defining the desired output structure.
+            query (str, optional): Query used to retrieve relevant chunks. If empty or
+                not provided, a default query built from the schema's JSON schema is used.
+        Returns:
+            str: Whatever ``llm_client.call_api`` returns for the prompt (expected to be
+            the structured JSON as a string).
+        """
+        start_time = time.time()
+        if not query:
+            query = f"Extract information based on the following JSON schema: {schema.model_json_schema()}"
+            print(f"No explicit query provided for retrieval. Using default: '{query[:100]}...'")
+        chunks = self._langchain_HHTS(content)
+        print(f"Content successfully chunked into {len(chunks)} pieces.")
+        combined_content_for_llm = ""
+        if not chunks:
+            print("Warning: No chunks were generated from the provided content. The entire original content will be sent to the LLM.")
+            combined_content_for_llm = content
+        else:
+            # Keep only chunks whose embedding succeeded: embed_text returns None on
+            # failure, and a None inside np.array() would break the similarity search.
+            embedded_chunks = [(chunk, self.embed_text(chunk)) for chunk in chunks]
+            usable_chunks = [chunk for chunk, embedding in embedded_chunks if embedding is not None]
+            if not usable_chunks:
+                print("Warning: No chunk could be embedded. The entire original content will be sent to the LLM.")
+                combined_content_for_llm = content
+            else:
+                chunk_embeddings = np.array([embedding for _, embedding in embedded_chunks if embedding is not None])
+                print(f"Generated embeddings for {len(usable_chunks)} chunks.")
+                similar_chunks = self.search_similar_chunks(query, usable_chunks, chunk_embeddings)
+                print(f"Retrieved {len(similar_chunks)} similar chunks based on the query.")
+                combined_content_for_llm = "\n\n".join(similar_chunks)
+                print(f"Combined content for LLM (truncated): '{combined_content_for_llm[:200]}...'")
+        prompt = self.prompt_template.format(content=combined_content_for_llm, schema=schema.model_json_schema())
+        print(f"Sending prompt to LLM (truncated): '{prompt[:500]}...'")
+        llm_response = self.llm_client.call_api(prompt)
+        # Elapsed wall-clock time covers chunking, embedding, retrieval and the LLM call.
+        execution_time = (time.time() - start_time) * 1000
+        print(f"Extraction process completed in {execution_time:.2f} milliseconds.")
+        print(f"LLM's final response: {llm_response}")
+        print("=" * 78)
+        return llm_response