Spaces:
Running
Running
| import json | |
| import pandas as pd | |
| import gradio as gr | |
| from typing import Dict, Any, Type | |
| from web2json.preprocessor import BasicPreprocessor | |
| from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient | |
| from web2json.postprocessor import PostProcessor | |
| from web2json.pipeline import Pipeline | |
| from pydantic import BaseModel, Field, create_model | |
| import os | |
| import dotenv | |
| dotenv.load_dotenv() | |
def parse_schema_input(schema_input: str) -> Type[BaseModel]:
    """
    Convert user schema input to a Pydantic BaseModel.

    The input format is detected in this order:
    1. JSON schema format (input starts with ``{``)
    2. Python class definition (input contains ``class `` and ``BaseModel``)
    3. Simple field definitions (anything else)

    Blank input yields a default schema with ``title`` and ``content``
    string fields.

    Args:
        schema_input: Raw schema text as typed by the user.

    Returns:
        A dynamically built (or user-defined) BaseModel subclass.

    Raises:
        ValueError: If the input cannot be converted. The underlying
            exception is chained as ``__cause__`` so the real failure
            stays visible in tracebacks.
    """
    schema_input = schema_input.strip()
    if not schema_input:
        # Default schema if none provided
        return create_model(
            'DefaultSchema',
            title=(str, Field(description="Title of the content")),
            content=(str, Field(description="Main content")),
        )

    try:
        # Try parsing as JSON schema
        if schema_input.startswith('{'):
            schema_dict = json.loads(schema_input)
            return json_schema_to_basemodel(schema_dict)
        # Try parsing as Python class definition
        if 'class ' in schema_input and 'BaseModel' in schema_input:
            return python_class_to_basemodel(schema_input)
        # Fall back to simple field definitions
        return simple_fields_to_basemodel(schema_input)
    except Exception as e:
        raise ValueError(
            f"Could not parse schema: {str(e)}. Please check your schema format."
        ) from e
def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
    """Convert JSON schema to BaseModel.

    Only the top-level ``properties`` and ``required`` keys are read; each
    property contributes its ``type`` (default ``'string'``) and
    ``description`` (default empty).

    Args:
        schema_dict: Parsed JSON schema object.

    Returns:
        A model named ``DynamicSchema`` with one field per property.
        Properties listed in ``required`` become required fields; all others
        are ``Optional`` with a default of ``None``.
    """
    # Local import: the file-level typing import does not include Optional.
    from typing import Optional

    # `or` guards against explicit nulls such as {"required": null}.
    properties = schema_dict.get('properties') or {}
    required = schema_dict.get('required') or []

    fields = {}
    for field_name, field_info in properties.items():
        field_type = get_python_type(field_info.get('type', 'string'))
        field_description = field_info.get('description', '')
        if field_name in required:
            fields[field_name] = (field_type, Field(description=field_description))
        else:
            # The default is None, so the annotation must admit None too;
            # otherwise an explicit null for this field fails validation.
            fields[field_name] = (
                Optional[field_type],
                Field(default=None, description=field_description),
            )
    return create_model('DynamicSchema', **fields)
def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
    """Convert Python class definition to BaseModel.

    The text is executed and the first BaseModel subclass found in the
    resulting namespace is returned.

    Args:
        class_definition: Python source defining at least one class that
            inherits from ``BaseModel``.

    Returns:
        The first BaseModel subclass defined by the source.

    Raises:
        ValueError: If the source fails to execute or defines no BaseModel
            subclass. The underlying exception is chained as ``__cause__``.
    """
    try:
        # SECURITY: exec() runs arbitrary code taken straight from user input.
        # This namespace only pre-populates convenient names; it is NOT a
        # sandbox (exec adds __builtins__ to the globals dict when missing,
        # so imports and file/network access remain available).
        # TODO(review): replace with an AST-based parser or run in isolation
        # before exposing this to untrusted users.
        namespace = {
            'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
            'float': float, 'bool': bool, 'list': list, 'dict': dict,
        }
        exec(class_definition, namespace)

        # Find the class that inherits from BaseModel
        for obj in namespace.values():
            if (
                isinstance(obj, type)
                and issubclass(obj, BaseModel)
                and obj is not BaseModel
            ):
                return obj
        raise ValueError("No BaseModel class found in definition")
    except Exception as e:
        raise ValueError(f"Invalid Python class definition: {str(e)}") from e
def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
    """Build a BaseModel from one-field-per-line text.

    Accepted line shapes:
      - ``name: type = description``
      - ``name: type``
      - ``name`` (typed as ``str`` with an empty description)

    Blank lines and lines starting with ``#`` are skipped.

    Raises:
        ValueError: If no field could be read from the text.
    """
    collected = {}
    for raw_line in fields_text.strip().split('\n'):
        entry = raw_line.strip()
        if entry == '' or entry.startswith('#'):
            continue

        name_text, colon, remainder = entry.partition(':')
        if not colon:
            # Bare field name: default to a plain string field
            collected[entry] = (str, Field(description=""))
            continue

        # Everything after the first ':' is "type" or "type = description"
        type_text, equals, desc_text = remainder.strip().partition('=')
        resolved_type = get_python_type(type_text.strip())
        if equals:
            # Drop surrounding quotes the user may have put around the description
            description = desc_text.strip().strip('"\'')
        else:
            description = ""
        collected[name_text.strip()] = (resolved_type, Field(description=description))

    if not collected:
        raise ValueError("No valid fields found in schema definition")
    return create_model('DynamicSchema', **collected)
def get_python_type(type_str: str) -> type:
    """Convert type string to Python type.

    Matching is case-insensitive and ignores surrounding whitespace. Both
    JSON Schema names (``string``, ``integer``, ``number``, ``boolean``,
    ``array``, ``object``) and Python names (``str``, ``int``, ``float``,
    ``bool``, ``list``, ``dict``) are recognised.

    Args:
        type_str: The type name. A list/tuple of names is also tolerated
            (JSON Schema permits ``"type": ["string", "null"]``); the first
            non-``null`` string entry is used.

    Returns:
        The matching Python type, or ``str`` for anything unrecognised
        (including non-string input).
    """
    type_mapping = {
        'string': str, 'str': str,
        'integer': int, 'int': int,
        'number': float, 'float': float,
        'boolean': bool, 'bool': bool,
        'array': list, 'list': list,
        'object': dict, 'dict': dict,
    }

    if isinstance(type_str, (list, tuple)):
        # Array-valued "type": take the first real type, skipping "null"
        candidates = [
            name for name in type_str
            if isinstance(name, str) and name.lower().strip() != 'null'
        ]
        type_str = candidates[0] if candidates else ''

    if not isinstance(type_str, str):
        # e.g. None or a nested object: fall back rather than crash on .lower()
        return str

    return type_mapping.get(type_str.lower().strip(), str)
def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
    """Extract structured JSON from a web page or raw text using a user-defined schema.

    Converts the textual schema into a Pydantic model, then runs the
    extraction on the content.

    Args:
        content: A URL, or raw HTML/text to analyze.
        is_url: True when ``content`` is a URL, False when it is raw text.
        schema_input: Schema as simple field definitions, a JSON schema, or
            a Python BaseModel class definition. Blank uses a default schema.

    Returns:
        The extraction result, or a dict with a single ``"error"`` key
        describing what went wrong.
    """
    # Keep the two failure modes apart so the message names the real culprit.
    try:
        # Parse the schema input into a BaseModel
        schema_model = parse_schema_input(schema_input)
    except Exception as e:
        return {"error": f"Schema parsing error: {str(e)}"}

    try:
        # Call the original function
        return webpage_to_json(content, is_url, schema_model)
    except Exception as e:
        # Reached only for failures webpage_to_json does not trap itself
        return {"error": f"Processing error: {str(e)}"}
def webpage_to_json(content: str, is_url: bool, schema: Type[BaseModel]) -> Dict[str, Any]:
    """
    Extracts structured JSON information from a given content based on a specified schema.

    This function sets up a processing pipeline that includes:
    - Preprocessing the input content.
    - Utilizing an AI language model to extract information according to the provided schema.
    - Postprocessing the extracted output to match the exact schema requirements.

    Parameters:
        content (str): The input content to be analyzed. This can be direct text or a URL content.
        is_url (bool): A flag indicating whether the provided content is a URL (True) or raw text (False).
        schema (Type[BaseModel]): A Pydantic BaseModel subclass (the class itself, not an instance)
            defining the expected structure and data types for the output.

    Returns:
        Dict[str, Any]: Whatever the pipeline run returns. If the LLM client cannot be
        initialized or the pipeline run raises, a dictionary with an "error" key and a
        descriptive message is returned instead.
    """
    prompt_template = """Extract the following information from the provided content according to the specified schema.
Content to analyze:
{content}
Schema requirements:
{schema}
Instructions:
- Extract only information that is explicitly present in the content
- Follow the exact structure and data types specified in the schema
- If a required field cannot be found, indicate this clearly
- Preserve the original formatting and context where relevant
- Return the extracted data in the format specified by the schema"""

    classification_prompt_template = """
# HTML Chunk Relevance Classification Prompt
You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.
## Instructions:
1. Carefully examine the provided HTML chunk
2. Compare it against the given schema/criteria
3. Determine if the HTML chunk contains content that matches or is relevant to the schema
4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)
## Input Format:
**Schema/Criteria:**
{schema}
**HTML Chunk:**
```html
{content}
```
## Output Format:
Your response must be ONLY a valid JSON object with no additional text:
```json
{{
"relevant": 1
}}
```
OR
```json
{{
"relevant": 0
}}
```
## Classification Rules:
- Output 1 if the HTML chunk contains content that matches the schema criteria
- Output 0 if the HTML chunk does not contain relevant content
- Consider semantic meaning, not just exact keyword matches
- Look at text content, attributes, structure, and context
- Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
- Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
- Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
- The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema
CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
"""

    # Initialize pipeline components
    # TODO: improve the RAG system and optimize (don't instantiate every time)
    preprocessor = BasicPreprocessor(config={'keep_tags': True})
    try:
        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
    ai_extractor = LLMClassifierExtractor(llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        result = pipeline.run(content, is_url, schema)
        print("-"*80)
        print(f"Processed result: {result}")
        return result
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
# Example schemas for the user.
# Each fenced example is meant to be copy-pasted into the schema box, so it
# must be valid as written: the class body below is indented for that reason.
example_schemas = """
**Example Schema Formats:**
1. **Simple field definitions:**
```
title: str = Page title
price: float = Product price
description: str = Product description
available: bool = Is available
```
2. **JSON Schema:**
```json
{
"properties": {
"title": {"type": "string", "description": "Page title"},
"price": {"type": "number", "description": "Product price"},
"description": {"type": "string", "description": "Product description"}
},
"required": ["title"]
}
```
3. **Python Class Definition:**
```python
class ProductSchema(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")
    description: str = Field(description="Product description")
    available: bool = Field(default=False, description="Availability status")
```
"""
# Build Gradio Interface

# Pre-filled example rows: (content, is_url, schema) in the same order as the inputs.
_SIMPLE_FIELDS_EXAMPLE = "title: str = Page title\nprice: float = Product price\ndescription: str = Description"
_JSON_SCHEMA_EXAMPLE = '''{
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "Name of the product"
},
"price": {
"type": "number",
"description": "Price of the product"
},
"description": {
"type": "string",
"description": "Detailed description of the product"
},
"availability": {
"type": "boolean",
"description": "Whether the product is in stock (true) or not (false)"
}
},
"required": ["title", "price"]
}'''

# Input widgets, positionally matching webpage_to_json_wrapper's parameters.
_interface_inputs = [
    gr.Textbox(
        label="Content (URL or Raw Text)",
        lines=10,
        placeholder="Enter URL or paste raw HTML/text here.",
    ),
    gr.Checkbox(label="Content is URL?", value=False),
    gr.Textbox(
        label="Schema Definition",
        lines=15,
        placeholder="Define your extraction schema (see examples below)",
        info=example_schemas,
    ),
]

_interface_examples = [
    ["https://example.com", True, _SIMPLE_FIELDS_EXAMPLE],
    [
        "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
        False,
        _JSON_SCHEMA_EXAMPLE,
    ],
]

demo = gr.Interface(
    fn=webpage_to_json_wrapper,
    inputs=_interface_inputs,
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
    examples=_interface_examples,
)

if __name__ == "__main__":
    # Launch only when run as a script; mcp_server=True is passed through to Gradio.
    demo.launch(mcp_server=True)