abdo-Mansour committed
Commit 4ed1b4f · 1 Parent(s): d69398b

moving our work

.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ .env
2
+ test.ipynb
3
+ venv
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,4 @@
1
+ Content (URL or Raw Text),Content is URL?,Schema Definition,Output JSON,timestamp
2
+ https://www.amazon.sa/-/en/soundcore-Wireless-Bluetooth-Water-Resistant-Customization/dp/B0BTYDLTM3/?_encoding=UTF8&pd_rd_w=CThtI&content-id=amzn1.sym.2d38d13c-20c8-4b9c-a23b-8365256393a2%3Aamzn1.symc.fc11ad14-99c1-406b-aa77-051d0ba1aade&pf_rd_p=2d38d13c-20c8-4b9c-a23b-8365256393a2&pf_rd_r=BQM6QA6QQAM8KDJD4AE1&pd_rd_wg=6dgls&pd_rd_r=46bcc0bb-b6ee-4da5-aab6-17aad8fced65&ref_=pd_hp_d_atf_ci_mcx_mr_ca_hp_atf_d&th=1,true,"{""type"": ""object"", ""properties"": {""title"": {""type"": ""string"", ""description"": ""Name of the product""}, ""price"": {""type"": ""number"", ""description"": ""Price of the product""}, ""description"": {""type"": ""string"", ""description"": ""Detailed description of the product""}, ""availability"": {""type"": ""boolean"", ""description"": ""Whether the product is in stock (true) or not (false)""}}, ""required"": [""title"", ""price""]}",,2025-06-10 23:40:15.418526
3
+ https://www.amazon.sa/-/en/PEJE-Smartwatch-Waterproof-Compatible-Android-%EF%BC%88Silver%EF%BC%89/dp/B0DHFLHFN5/260-6171633-6363068?pd_rd_w=WHovW&content-id=amzn1.sym.e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_p=e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_r=X8PYMCPKW44AGJ98BSBW&pd_rd_wg=b4VZi&pd_rd_r=f10a05cd-dd96-470c-b8ab-af7adf3471f0&pd_rd_i=B0DHFLHFN5&th=1,true,"{""type"": ""object"", ""properties"": {""title"": {""type"": ""string"", ""description"": ""Name of the product""}, ""price"": {""type"": ""number"", ""description"": ""Price of the product""}, ""description"": {""type"": ""string"", ""description"": ""Detailed description of the product""}, ""availability"": {""type"": ""boolean"", ""description"": ""Whether the product is in stock (true) or not (false)""}}, ""required"": [""title"", ""price""]}","{""title"": ""Product description"", ""price"": null, ""description"": ""Product details\n\nProducts related to this item""}",2025-06-10 23:44:58.218043
4
+ https://www.amazon.sa/-/en/PEJE-Smartwatch-Waterproof-Compatible-Android-%EF%BC%88Silver%EF%BC%89/dp/B0DHFLHFN5/260-6171633-6363068?pd_rd_w=WHovW&content-id=amzn1.sym.e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_p=e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_r=X8PYMCPKW44AGJ98BSBW&pd_rd_wg=b4VZi&pd_rd_r=f10a05cd-dd96-470c-b8ab-af7adf3471f0&pd_rd_i=B0DHFLHFN5&th=1,true,"{""type"": ""object"", ""properties"": {""title"": {""type"": ""string"", ""description"": ""Name of the product""}, ""price"": {""type"": ""number"", ""description"": ""Price of the product""}, ""description"": {""type"": ""string"", ""description"": ""Detailed description of the product""}, ""availability"": {""type"": ""boolean"", ""description"": ""Whether the product is in stock (true) or not (false)""}}, ""required"": [""title"", ""price""]}",,2025-06-11 00:19:58.927755
README.md CHANGED
@@ -1,13 +1,15 @@
  ---
- title: MCP WEB2JSON
- emoji: 🌖
- colorFrom: gray
- colorTo: red
+ title: MCP Server Web2JSON
+ emoji: 🖇️
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 5.34.2
+ sdk_version: 5.33.0
  app_file: app.py
- pinned: false
- license: mit
+ pinned: True
+ tags: [mcp-server-track]
  ---

+ [Video overview of the agent demo](https://youtu.be/wd0kjOVoGn8)
+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,284 @@
1
+ import json
2
+ import pandas as pd
3
+ import gradio as gr
4
+ from typing import Dict, Any, Type
5
+ from web2json.preprocessor import BasicPreprocessor
6
+ from web2json.ai_extractor import AIExtractor, RAGExtractor, GeminiLLMClient
7
+ from web2json.postprocessor import PostProcessor
8
+ from web2json.pipeline import Pipeline
9
+ from pydantic import BaseModel, Field, create_model
10
+ import os
11
+ import dotenv
12
+
13
+ dotenv.load_dotenv()
14
+
15
+ def parse_schema_input(schema_input: str) -> Type[BaseModel]:
16
+ """
17
+ Convert user schema input to a Pydantic BaseModel.
18
+ Supports multiple input formats:
19
+ 1. JSON schema format
20
+ 2. Python class definition
21
+ 3. Simple field definitions
22
+ """
23
+ schema_input = schema_input.strip()
24
+
25
+ if not schema_input:
26
+ # Default schema if none provided
27
+ return create_model('DefaultSchema',
28
+ title=(str, Field(description="Title of the content")),
29
+ content=(str, Field(description="Main content")))
30
+
31
+ try:
32
+ # Try parsing as JSON schema
33
+ if schema_input.startswith('{'):
34
+ schema_dict = json.loads(schema_input)
35
+ return json_schema_to_basemodel(schema_dict)
36
+
37
+ # Try parsing as Python class definition
38
+ elif 'class ' in schema_input and 'BaseModel' in schema_input:
39
+ return python_class_to_basemodel(schema_input)
40
+
41
+ # Try parsing as simple field definitions
42
+ else:
43
+ return simple_fields_to_basemodel(schema_input)
44
+
45
+ except Exception as e:
46
+ raise ValueError(f"Could not parse schema: {str(e)}. Please check your schema format.")
47
+
48
+ def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
49
+ """Convert JSON schema to BaseModel"""
50
+ fields = {}
51
+ properties = schema_dict.get('properties', {})
52
+ required = schema_dict.get('required', [])
53
+
54
+ for field_name, field_info in properties.items():
55
+ field_type = get_python_type(field_info.get('type', 'string'))
56
+ field_description = field_info.get('description', '')
57
+
58
+ if field_name in required:
59
+ fields[field_name] = (field_type, Field(description=field_description))
60
+ else:
61
+ fields[field_name] = (field_type, Field(default=None, description=field_description))
62
+
63
+ return create_model('DynamicSchema', **fields)
64
+
65
+ def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
66
+ """Convert Python class definition to BaseModel"""
67
+ try:
68
+ # Execute the class definition in a safe namespace
69
+ namespace = {'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
70
+ 'float': float, 'bool': bool, 'list': list, 'dict': dict}
71
+ exec(class_definition, namespace)
72
+
73
+ # Find the class that inherits from BaseModel
74
+ for name, obj in namespace.items():
75
+ if (isinstance(obj, type) and
76
+ issubclass(obj, BaseModel) and
77
+ obj != BaseModel):
78
+ return obj
79
+
80
+ raise ValueError("No BaseModel class found in definition")
81
+ except Exception as e:
82
+ raise ValueError(f"Invalid Python class definition: {str(e)}")
83
+
84
+ def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
85
+ """Convert simple field definitions to BaseModel"""
86
+ fields = {}
87
+
88
+ for line in fields_text.strip().split('\n'):
89
+ line = line.strip()
90
+ if not line or line.startswith('#'):
91
+ continue
92
+
93
+ # Parse field definition (e.g., "name: str = description")
94
+ if ':' in line:
95
+ parts = line.split(':', 1)
96
+ field_name = parts[0].strip()
97
+
98
+ type_and_desc = parts[1].strip()
99
+ if '=' in type_and_desc:
100
+ type_part, desc_part = type_and_desc.split('=', 1)
101
+ field_type = get_python_type(type_part.strip())
102
+ description = desc_part.strip().strip('"\'')
103
+ else:
104
+ field_type = get_python_type(type_and_desc.strip())
105
+ description = ""
106
+
107
+ fields[field_name] = (field_type, Field(description=description))
108
+ else:
109
+ # Simple field name only
110
+ field_name = line.strip()
111
+ fields[field_name] = (str, Field(description=""))
112
+
113
+ if not fields:
114
+ raise ValueError("No valid fields found in schema definition")
115
+
116
+ return create_model('DynamicSchema', **fields)
117
+
118
+ def get_python_type(type_str: str):
119
+ """Convert type string to Python type"""
120
+ type_str = type_str.lower().strip()
121
+ type_mapping = {
122
+ 'string': str, 'str': str,
123
+ 'integer': int, 'int': int,
124
+ 'number': float, 'float': float,
125
+ 'boolean': bool, 'bool': bool,
126
+ 'array': list, 'list': list,
127
+ 'object': dict, 'dict': dict
128
+ }
129
+ return type_mapping.get(type_str, str)
130
+
131
+ def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
132
+ """Wrapper function that converts schema input to BaseModel"""
133
+ try:
134
+ # Parse the schema input into a BaseModel
135
+ schema_model = parse_schema_input(schema_input)
136
+
137
+ # Call the original function
138
+ return webpage_to_json(content, is_url, schema_model)
139
+
140
+ except Exception as e:
141
+ return {"error": f"Schema parsing error: {str(e)}"}
142
+
143
+ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
144
+ """
145
+ Extracts structured JSON information from a given content based on a specified schema.
146
+ This function sets up a processing pipeline that includes:
147
+ - Preprocessing the input content.
148
+ - Utilizing an AI language model to extract information according to the provided schema.
149
+ - Postprocessing the extracted output to match the exact schema requirements.
150
+ Parameters:
151
+ content (str): The input content to be analyzed. This can be direct text or a URL content.
152
+ is_url (bool): A flag indicating whether the provided content is a URL (True) or raw text (False).
153
+ schema (BaseModel): A Pydantic BaseModel defining the expected structure and data types for the output.
154
+ Returns:
155
+ Dict[str, Any]: A dictionary containing the extracted data matching the schema. In case of errors during initialization
156
+ or processing, the dictionary will include an "error" key with a descriptive message.
157
+ """
158
+ prompt_template = """Extract the following information from the provided content according to the specified schema.
159
+
160
+ Content to analyze:
161
+ {content}
162
+
163
+ Schema requirements:
164
+ {schema}
165
+
166
+ Instructions:
167
+ - Extract only information that is explicitly present in the content
168
+ - Follow the exact structure and data types specified in the schema
169
+ - If a required field cannot be found, indicate this clearly
170
+ - Preserve the original formatting and context where relevant
171
+ - Return the extracted data in the format specified by the schema"""
172
+
173
+ # Initialize pipeline components
174
+ # TODO: improve the RAG system and optimize (don't instantiate every time)
175
+ preprocessor = BasicPreprocessor(config={'keep_tags': False})
176
+ try:
177
+ llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
178
+ except Exception as e:
179
+ return {"error": f"Failed to initialize LLM client: {str(e)}"}
180
+
181
+ # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
182
+ ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
183
+ postprocessor = PostProcessor()
184
+ pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
185
+
186
+ try:
187
+ result = pipeline.run(content, is_url, schema)
188
+ print("-"*80)
189
+ print(f"Processed result: {result}")
190
+ return result
191
+ except Exception as e:
192
+ return {"error": f"Processing error: {str(e)}"}
193
+
194
+ # Example schemas for the user
195
+ example_schemas = """
196
+ **Example Schema Formats:**
197
+
198
+ 1. **Simple field definitions:**
199
+ ```
200
+ title: str = Page title
201
+ price: float = Product price
202
+ description: str = Product description
203
+ available: bool = Is available
204
+ ```
205
+
206
+ 2. **JSON Schema:**
207
+ ```json
208
+ {
209
+ "properties": {
210
+ "title": {"type": "string", "description": "Page title"},
211
+ "price": {"type": "number", "description": "Product price"},
212
+ "description": {"type": "string", "description": "Product description"}
213
+ },
214
+ "required": ["title"]
215
+ }
216
+ ```
217
+
218
+ 3. **Python Class Definition:**
219
+ ```python
220
+ class ProductSchema(BaseModel):
221
+ title: str = Field(description="Product title")
222
+ price: float = Field(description="Product price")
223
+ description: str = Field(description="Product description")
224
+ available: bool = Field(default=False, description="Availability status")
225
+ ```
226
+ """
227
+
228
+ # Build Gradio Interface
229
+ demo = gr.Interface(
230
+ fn=webpage_to_json_wrapper,
231
+ inputs=[
232
+ gr.Textbox(
233
+ label="Content (URL or Raw Text)",
234
+ lines=10,
235
+ placeholder="Enter URL or paste raw HTML/text here."
236
+ ),
237
+ gr.Checkbox(label="Content is URL?", value=False),
238
+ gr.Textbox(
239
+ label="Schema Definition",
240
+ lines=15,
241
+ placeholder="Define your extraction schema (see examples below)",
242
+ info=example_schemas
243
+ )
244
+ ],
245
+ outputs=gr.JSON(label="Output JSON"),
246
+ title="Webpage to JSON Converter",
247
+ description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
248
+ examples=[
249
+ [
250
+ "https://example.com",
251
+ True,
252
+ "title: str = Page title\nprice: float = Product price\ndescription: str = Description"
253
+ ],
254
+ [
255
+ "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
256
+ False,
257
+ '''{
258
+ "type": "object",
259
+ "properties": {
260
+ "title": {
261
+ "type": "string",
262
+ "description": "Name of the product"
263
+ },
264
+ "price": {
265
+ "type": "number",
266
+ "description": "Price of the product"
267
+ },
268
+ "description": {
269
+ "type": "string",
270
+ "description": "Detailed description of the product"
271
+ },
272
+ "availability": {
273
+ "type": "boolean",
274
+ "description": "Whether the product is in stock (true) or not (false)"
275
+ }
276
+ },
277
+ "required": ["title", "price"]
278
+ }'''
279
+ ]
280
+ ]
281
+ )
282
+
283
+ if __name__ == "__main__":
284
+ demo.launch(mcp_server=True)
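
Note: a minimal, hypothetical sketch of exercising the schema helpers above outside the Gradio UI (it assumes the Space's dependencies are installed and `app.py` is importable); the field names are illustrative only.

```python
# Hypothetical local check of parse_schema_input from app.py.
from app import parse_schema_input

# Simple field-definition syntax: "name: type = description", one field per line.
Schema = parse_schema_input(
    "title: str = Page title\n"
    "price: float = Product price\n"
    "available: bool = Is available"
)

# The helper returns a dynamically created Pydantic model,
# so the usual BaseModel API applies.
print(Schema.model_json_schema())
print(Schema(title="Demo", price=9.99, available=True))
```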
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ pandas
2
+ gradio
3
+ gradio[mcp]
4
+ pydantic
5
+ python-dotenv
6
+ beautifulsoup4
7
+ requests
8
+ google-genai
9
+ json_repair
10
+ numpy
11
+ langchain
12
+ langchain-text-splitters
13
+ sentence-transformers
web2json/__pycache__/ai_extractor.cpython-311.pyc ADDED
Binary file (16.2 kB)
web2json/__pycache__/pipeline.cpython-311.pyc ADDED
Binary file (2.49 kB)
web2json/__pycache__/postprocessor.cpython-311.pyc ADDED
Binary file (1.65 kB)
web2json/__pycache__/preprocessor.cpython-311.pyc ADDED
Binary file (5.93 kB)
web2json/ai_extractor.py ADDED
@@ -0,0 +1,283 @@
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from google import genai
4
+ from google.genai import types
5
+ from pydantic import BaseModel
6
+ import numpy as np
7
+ from typing import List, Any, Dict, Tuple
8
+ import time
9
+ from langchain_text_splitters import HTMLHeaderTextSplitter
10
+ from sentence_transformers import SentenceTransformer
11
+
12
+ class LLMClient(ABC):
13
+ """
14
+ Abstract base class for calling LLM APIs.
15
+ """
16
+ def __init__(self, config: dict = None):
17
+ """
18
+ Initializes the LLMClient with a configuration dictionary.
19
+
20
+ Args:
21
+ config (dict): Configuration settings for the LLM client.
22
+ """
23
+ self.config = config or {}
24
+
25
+ @abstractmethod
26
+ def call_api(self, prompt: str) -> str:
27
+ """
28
+ Call the underlying LLM API with the given prompt.
29
+
30
+ Args:
31
+ prompt (str): The prompt or input text for the LLM.
32
+
33
+ Returns:
34
+ str: The response from the LLM.
35
+ """
36
+ pass
37
+
38
+
39
+ class GeminiLLMClient(LLMClient):
40
+ """
41
+ Concrete implementation of LLMClient for the Gemini API.
42
+ """
43
+
44
+ def __init__(self, config: dict):
45
+ """
46
+ Initializes the GeminiLLMClient with an API key, model name, and optional generation settings.
47
+
48
+ Args:
49
+ config (dict): Configuration containing:
50
+ - 'api_key': (optional) API key for Gemini (falls back to GEMINI_API_KEY env var)
51
+ - 'model_name': (optional) the model to use (default 'gemini-2.0-flash')
52
+ - 'generation_config': (optional) dict of GenerateContentConfig parameters
53
+ """
54
+ api_key = config.get("api_key") or os.environ.get("GEMINI_API_KEY")
55
+ if not api_key:
56
+ raise ValueError(
57
+ "API key for Gemini must be provided in config['api_key'] or GEMINI_API_KEY env var."
58
+ )
59
+ self.client = genai.Client(api_key=api_key)
60
+ self.model_name = config.get("model_name", "gemini-2.0-flash")
61
+ # allow custom generation settings, fallback to sensible defaults
62
+ gen_conf = config.get("generation_config", {})
63
+ self.generate_config = types.GenerateContentConfig(
64
+ response_mime_type=gen_conf.get("response_mime_type", "text/plain"),
65
+ temperature=gen_conf.get("temperature"),
66
+ max_output_tokens=gen_conf.get("max_output_tokens"),
67
+ top_p=gen_conf.get("top_p"),
68
+ top_k=gen_conf.get("top_k"),
69
+ # add any other fields you want to expose
70
+ )
71
+
72
+ def call_api(self, prompt: str) -> str:
73
+ """
74
+ Call the Gemini API with the given prompt (non-streaming).
75
+
76
+ Args:
77
+ prompt (str): The input text for the API.
78
+
79
+ Returns:
80
+ str: The generated text from the Gemini API.
81
+ """
82
+ contents = [
83
+ types.Content(
84
+ role="user",
85
+ parts=[types.Part.from_text(text=prompt)],
86
+ )
87
+ ]
88
+
89
+ # Non-streaming call returns a full response object
90
+ response = self.client.models.generate_content(
91
+ model=self.model_name,
92
+ contents=contents,
93
+ config=self.generate_config,
94
+ )
95
+
96
+ # Combine all output parts into a single string
97
+ return response.text
98
+
99
+
100
+
101
+ class AIExtractor:
102
+ def __init__(self, llm_client: LLMClient, prompt_template: str):
103
+ """
104
+ Initializes the AIExtractor with a specific LLM client and configuration.
105
+
106
+ Args:
107
+ llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
108
+ prompt_template (str): The template to use for generating prompts for the LLM.
109
+ should contain placeholders for dynamic content.
110
+ e.g., "Extract the following information: {content} based on schema: {schema}"
111
+ """
112
+ self.llm_client = llm_client
113
+ self.prompt_template = prompt_template
114
+
115
+ def extract(self, content: str, schema: BaseModel) -> str:
116
+ """
117
+ Extracts structured information from the given content based on the provided schema.
118
+
119
+ Args:
120
+ content (str): The raw content to extract information from.
121
+ schema (BaseModel): A Pydantic model defining the structure of the expected output.
122
+
123
+ Returns:
124
+ str: The structured JSON object as a string.
125
+ """
126
+ prompt = self.prompt_template.format(content=content, schema=schema.model_json_schema())
127
+ # print(f"Generated prompt: {prompt}")
128
+ response = self.llm_client.call_api(prompt)
129
+ return response
130
+
131
+ # TODO: RAGExtractor class
132
+ class RAGExtractor(AIExtractor):
133
+ """
134
+ RAG-enhanced extractor that uses similarity search to find relevant chunks
135
+ before performing extraction, utilizing HTML header-based chunking and SentenceTransformer embeddings.
136
+ """
137
+
138
+ def __init__(self,
139
+ llm_client: LLMClient,
140
+ prompt_template: str,
141
+ embedding_model_path: str = "sentence-transformers/all-mpnet-base-v2",
142
+ top_k: int = 3):
143
+ """
144
+ Initialize RAG extractor with embedding and chunking capabilities.
145
+
146
+ Args:
147
+ llm_client: LLM client for generation.
148
+ prompt_template: Template for prompts.
149
+ embedding_model_path: Path/name for the SentenceTransformer embedding model.
150
+ top_k: Number of top similar chunks to retrieve.
151
+ """
152
+ super().__init__(llm_client, prompt_template)
153
+ self.embedding_model_path = embedding_model_path
154
+ # Initialize the SentenceTransformer model for embeddings
155
+ self.embedding_model_instance = SentenceTransformer(self.embedding_model_path)
156
+ self.top_k = top_k
157
+
158
+ @staticmethod
159
+ def _langchain_HHTS(text: str) -> List[str]:
160
+ """
161
+ Chunks HTML text using Langchain's HTMLHeaderTextSplitter based on h1 and h2 headers.
162
+
163
+ Args:
164
+ text (str): The HTML content to chunk.
165
+
166
+ Returns:
167
+ List[str]: A list of chunked text strings (extracted from Document objects' page_content).
168
+ """
169
+ headers_to_split_on = [
170
+ ("h1", "Header 1"),
171
+ ("h2", "Header 2"),
172
+ # ("h3", "Header 3"), # This header was explicitly commented out in the request
173
+ ]
174
+ html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
175
+ return [doc.page_content for doc in html_splitter.split_text(text)]
176
+
177
+ def embed_text(self, text: str) -> np.ndarray:
178
+ """
179
+ Generate embeddings for text using the initialized SentenceTransformer model.
180
+
181
+ Args:
182
+ text: The text string to embed.
183
+
184
+ Returns:
185
+ np.ndarray: The embedding vector for the input text as a NumPy array.
186
+ """
187
+ try:
188
+ return self.embedding_model_instance.encode(text)
189
+ except Exception as e:
190
+ print(f"Warning: Embedding failed for text: '{text[:50]}...', using random embedding: {e}")
191
+
192
+ return None
193
+
194
+ def search_similar_chunks(self,
195
+ query: str,
196
+ chunks: List[str],
197
+ embeddings: np.ndarray) -> List[str]:
198
+ """
199
+ Find the most similar chunks to the query within the given list of chunks
200
+ by calculating cosine similarity between their embeddings.
201
+
202
+ Args:
203
+ query (str): The query text whose embedding will be used for similarity comparison.
204
+ chunks (List[str]): A list of text chunks to search within.
205
+ embeddings (np.ndarray): Precomputed embeddings for the chunks, corresponding to the 'chunks' list.
206
+
207
+ Returns:
208
+ List[str]: A list of the 'top_k' most similar chunks to the query.
209
+ """
210
+ query_embedding = self.embed_text(query)
211
+
212
+ similarities = []
213
+
214
+ if query_embedding.ndim > 1:
215
+ query_embedding = query_embedding.flatten()
216
+
217
+ for i, chunk_embedding in enumerate(embeddings):
218
+ if chunk_embedding.ndim > 1:
219
+ chunk_embedding = chunk_embedding.flatten()
220
+
221
+ norm_query = np.linalg.norm(query_embedding)
222
+ norm_chunk = np.linalg.norm(chunk_embedding)
223
+
224
+ if norm_query == 0 or norm_chunk == 0:
225
+ similarity = 0.0
226
+ else:
227
+ similarity = np.dot(query_embedding, chunk_embedding) / (norm_query * norm_chunk)
228
+ similarities.append((similarity, i))
229
+
230
+ similarities.sort(key=lambda x: x[0], reverse=True)
231
+ top_indices = [idx for _, idx in similarities[:self.top_k]]
232
+
233
+ return [chunks[i] for i in top_indices]
234
+
235
+ def extract(self, content: str, schema: BaseModel, query: str = None) -> str:
236
+ """
237
+ Overrides the base AIExtractor's method to implement RAG-enhanced extraction.
238
+ This function first chunks the input HTML content, then uses a query to find
239
+ the most relevant chunks via embedding similarity, and finally sends these
240
+ relevant chunks as context to the LLM for structured information extraction.
241
+
242
+ Args:
243
+ content (str): The raw HTML content from which to extract information.
244
+ schema (BaseModel): A Pydantic model defining the desired output structure for the LLM.
245
+ query (str, optional): An optional query string to guide the retrieval of relevant chunks.
246
+ If not provided, a default query based on the schema will be used.
247
+
248
+ Returns:
249
+ str: The structured JSON object as a string, as generated by the LLM.
250
+ """
251
+ start_time = time.time()
252
+
253
+ if not query:
254
+ query = f"Extract information based on the following JSON schema: {schema.model_json_schema()}"
255
+ print(f"No explicit query provided for retrieval. Using default: '{query[:100]}...'")
256
+
257
+ chunks = self._langchain_HHTS(content)
258
+ print(f"Content successfully chunked into {len(chunks)} pieces.")
259
+
260
+ combined_content_for_llm = ""
261
+ if not chunks:
262
+ print("Warning: No chunks were generated from the provided content. The entire original content will be sent to the LLM.")
263
+ combined_content_for_llm = content
264
+ else:
265
+ chunk_embeddings = np.array([self.embed_text(chunk) for chunk in chunks])
266
+ print(f"Generated embeddings for {len(chunks)} chunks.")
267
+
268
+ similar_chunks = self.search_similar_chunks(query, chunks, chunk_embeddings)
269
+ print(f"Retrieved {len(similar_chunks)} similar chunks based on the query.")
270
+
271
+ combined_content_for_llm = "\n\n".join(similar_chunks)
272
+ print(f"Combined content for LLM (truncated): '{combined_content_for_llm[:200]}...'")
273
+
274
+ prompt = self.prompt_template.format(content=combined_content_for_llm, schema=schema.model_json_schema())
275
+ print(f"Sending prompt to LLM (truncated): '{prompt[:500]}...'")
276
+ llm_response = self.llm_client.call_api(prompt)
277
+
278
+ execution_time = (time.time() - start_time) * 1000
279
+ print(f"Extraction process completed in {execution_time:.2f} milliseconds.")
280
+ print(f"LLM's final response: {llm_response}")
281
+ print("=" * 78)
282
+
283
+ return llm_response
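
Note: a minimal sketch (not part of the commit) of wiring `RAGExtractor` in place of the plain `AIExtractor`, mirroring the line that `app.py` currently keeps commented out; it assumes a valid `GEMINI_API_KEY` and the abbreviated prompt template is illustrative.

```python
import os
from pydantic import BaseModel, Field
from web2json.ai_extractor import GeminiLLMClient, RAGExtractor

class Product(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")

llm = GeminiLLMClient(config={"api_key": os.getenv("GEMINI_API_KEY")})
extractor = RAGExtractor(
    llm_client=llm,
    prompt_template="Extract data matching this schema:\n{schema}\n\nContent:\n{content}",
    top_k=3,  # number of header-based chunks sent to the LLM
)

html = "<h1>Sample Product</h1><h2>Details</h2><p>Price: $29.99</p>"
print(extractor.extract(html, Product))  # JSON string produced by the LLM
```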
web2json/pipeline.py ADDED
@@ -0,0 +1,43 @@
1
+ from web2json.ai_extractor import *
2
+ from web2json.postprocessor import *
3
+ from web2json.preprocessor import *
4
+ from pydantic import BaseModel
5
+
6
+ class Pipeline:
7
+ # constructor
8
+ def __init__(self,
9
+ preprocessor: Preprocessor,
10
+ ai_extractor: AIExtractor,
11
+ postprocessor: PostProcessor):
12
+ self.preprocessor = preprocessor
13
+ self.ai_extractor = ai_extractor
14
+ self.postprocessor = postprocessor
15
+
16
+ def run(self, content: str, is_url: bool, schema:BaseModel) -> dict:
17
+ """
18
+ Run the entire pipeline: preprocess, extract, and postprocess.
19
+
20
+ Args:
21
+ content (str): The raw content to process.
22
+ is_url (bool): Whether the content is a URL or raw text.
23
+ schema (BaseModel): The schema defining the structure of the expected output.
24
+
25
+ Returns:
26
+ dict: The final structured data after processing.
27
+ """
28
+ # Step 1: Preprocess the content
29
+ preprocessed_content = self.preprocessor.preprocess(content, is_url)
30
+ print(f"Preprocessed content: {preprocessed_content[:100]}...")
31
+ print('+'*80)
32
+ # Step 2: Extract structured information using AI
33
+ extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
34
+ print(f"Extracted data: {extracted_data[:100]}...")
35
+ print('+'*80)
36
+ # Step 3: Post-process the extracted data
37
+ final_output = self.postprocessor.process(extracted_data)
38
+ print(f"Final output: {final_output}")
39
+ print('+'*80)
40
+
41
+ return final_output
42
+
43
+
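
Note: a minimal sketch of composing the pipeline by hand, mirroring what `webpage_to_json` in `app.py` does per request; the prompt template is abbreviated and a `GEMINI_API_KEY` is assumed.

```python
import os
from pydantic import BaseModel, Field
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import GeminiLLMClient, AIExtractor
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline

class Article(BaseModel):
    title: str = Field(description="Article title")
    summary: str = Field(description="One-sentence summary")

pipeline = Pipeline(
    preprocessor=BasicPreprocessor(config={"keep_tags": False}),
    ai_extractor=AIExtractor(
        llm_client=GeminiLLMClient(config={"api_key": os.getenv("GEMINI_API_KEY")}),
        prompt_template="Content:\n{content}\n\nReturn JSON matching this schema:\n{schema}",
    ),
    postprocessor=PostProcessor(),
)

# is_url=True would make the preprocessor fetch the page first.
result = pipeline.run("<h1>Hello</h1><p>A tiny demo page.</p>", is_url=False, schema=Article)
print(result)  # plain dict produced by PostProcessor
```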
web2json/postprocessor.py ADDED
@@ -0,0 +1,27 @@
1
+ from json_repair import repair_json
2
+ import json
3
+
4
+ class PostProcessor:
5
+
6
+ def process(self, response: str) -> dict:
7
+ json_response = {}
8
+ try:
9
+ # Extract the JSON from the generated text. Handle variations in output format.
10
+ json_string = response
11
+ if "```json" in response:
12
+ json_string = response.split("```json")[1].split("```")[0]
13
+ elif "{" in response and "}" in response:
14
+ # try to grab the json
15
+ start_index = response.find("{")
16
+ end_index = response.rfind("}") + 1
17
+ json_string = response[start_index:end_index]
18
+
19
+ json_response = json.loads(repair_json(json_string)) # Added for robustness
20
+ except Exception as e:
21
+ print(f"Error parsing JSON: {e}")
22
+ print(f"Generated text: {response}")
23
+ json_response = {}
24
+
25
+
26
+ return json_response
27
+
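
Note: a minimal, self-contained sketch of what `PostProcessor.process` does with typical LLM output; no API key is needed, and `json_repair` fixes the deliberately broken trailing comma.

```python
from web2json.postprocessor import PostProcessor

messy = 'Sure! Here is the extracted data: {"title": "Sample Product", "price": 29.99,} Hope this helps.'

# The processor grabs the substring between the first "{" and the last "}",
# repairs it with json_repair, and returns a plain dict.
print(PostProcessor().process(messy))
# -> {'title': 'Sample Product', 'price': 29.99}
```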
web2json/preprocessor.py ADDED
@@ -0,0 +1,145 @@
1
+ import re
2
+ import requests
3
+ from bs4 import BeautifulSoup , Comment
4
+ from abc import ABC, abstractmethod
5
+ from typing import Any, Dict, Optional
6
+
7
+
8
+ class Preprocessor(ABC):
9
+ """
10
+ Abstract base class for preprocessors.
11
+ Defines the interface for transforming raw inputs into structured data.
12
+ """
13
+
14
+ def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
15
+ """
16
+ Initialize the preprocessor with optional configuration.
17
+
18
+ Args:
19
+ config: A dictionary of configuration settings.
20
+ - keep_tags (bool): If True, keeps HTML tags in the output; otherwise, cleans them.
21
+ """
22
+ self.config = config if config is not None else {'keep_tags': False}
23
+
24
+ def _fetch_content(self, url: str) -> str:
25
+ """
26
+ Fetches and parses the text content from a URL.
27
+
28
+ Args:
29
+ url: The URL to fetch content from.
30
+
31
+ Returns:
32
+ The clean, extracted text content from the page.
33
+
34
+ Raises:
35
+ ValueError: If the URL cannot be fetched or processed.
36
+ """
37
+ try:
38
+ # Set a User-Agent header to mimic a browser, which can help avoid
39
+ # being blocked by some websites.
40
+ # Inside _fetch_content method
41
+ headers = headers = {
42
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
43
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
44
+ "Accept-Language": "en-US,en;q=0.6",
45
+ "Cache-Control": "max-age=0",
46
+ "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
47
+ "Sec-Ch-Ua-Mobile": "?0",
48
+ "Sec-Ch-Ua-Platform": "\"Windows\"",
49
+ "Sec-Fetch-Dest": "document",
50
+ "Sec-Fetch-Mode": "navigate",
51
+ "Sec-Fetch-Site": "none",
52
+ "Sec-Fetch-User": "?1",
53
+ "Upgrade-Insecure-Requests": "1",
54
+ }
55
+
56
+ # Make the HTTP GET request with a timeout.
57
+ response = requests.get(url, headers=headers, timeout=15)
58
+
59
+
60
+ return response.text
61
+
62
+ except requests.exceptions.RequestException as e:
63
+ # Catch any network-related errors (DNS, connection, timeout, etc.)
64
+ # and re-raise them as a more user-friendly ValueError.
65
+ raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}")
66
+
67
+
68
+ @abstractmethod
69
+ def preprocess(self, content: str, is_url: bool) -> str:
70
+ """
71
+ Take raw content (HTML, text, etc.) and apply preprocessing steps.
72
+
73
+ Args:
74
+ content: The raw data to preprocess.
+ is_url: Whether content is a URL to fetch or already raw HTML/text.
75
+
76
+ Returns:
77
+ The cleaned content as a string, ready for downstream tasks.
78
+ """
79
+ pass
80
+
81
+ class BasicPreprocessor(Preprocessor):
82
+ """
83
+ Base preprocessor with common functionality.
84
+ Can be extended for specific preprocessing tasks.
85
+ """
86
+ # TODO: Might need to think of how to improve this later
87
+ def _clean_html(self, html_content: str) -> str:
88
+ """
89
+ Cleans up the given HTML content by:
90
+ - Removing <script> and <style> tags and their content.
91
+ - Removing HTML comments.
92
+ - Extracting and returning the visible text with normalized whitespace if keep_tags is False.
93
+
94
+ Args:
95
+ html_content (str): The HTML content to clean.
96
+
97
+ Returns:
98
+ str: The cleaned, visible text from the HTML.
99
+ """
100
+ # Parse the HTML content
101
+ soup = BeautifulSoup(html_content, "html.parser")
102
+
103
+ # Remove script and style elements
104
+ for tag in soup(["script", "style"]):
105
+ tag.decompose()
106
+
107
+ # Remove HTML comments
108
+ for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
109
+ comment.extract()
110
+
111
+ # Extract text and normalize whitespace
112
+ if self.config.get('keep_tags', False):
113
+ # If keep_tags is True, return the raw HTML
114
+ return str(soup)
115
+
116
+ text = soup.get_text(separator=" ", strip=True)
117
+ clean_text = re.sub(r'\s+', ' ', text)
118
+
119
+ return clean_text
120
+
121
+ def preprocess(self, content: str, is_url: bool) -> str:
122
+ """
123
+ Take raw content (HTML, text, etc.) and apply preprocessing steps.
124
+
125
+ Args:
126
+ content: The raw data to preprocess.
+ is_url: Whether content is a URL to fetch or already raw HTML/text.
127
+
128
+ Returns:
129
+ The cleaned text (or HTML, when keep_tags is True) as a string, ready for downstream tasks.
130
+ """
131
+
132
+ html_content = content
133
+ if is_url:
134
+ # Fetch content from the URL
135
+ html_content = self._fetch_content(content)
136
+
137
+
138
+ # Clean the HTML content
139
+ cleaned_content = self._clean_html(html_content)
140
+
141
+ return cleaned_content.strip() # Return the cleaned text content, stripped of leading/trailing whitespace
142
+
143
+
144
+
145
+
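
Note: a minimal sketch of `BasicPreprocessor` on raw HTML, showing that scripts, styles, and comments are stripped and that `keep_tags` switches between plain text and cleaned HTML; the sample markup is illustrative.

```python
from web2json.preprocessor import BasicPreprocessor

html = """
<html><head><style>p {color: red}</style></head>
<body><!-- hidden --><script>alert(1)</script>
<h1>Sample Product</h1><p>Price:   $29.99</p></body></html>
"""

print(BasicPreprocessor(config={"keep_tags": False}).preprocess(html, is_url=False))
# -> roughly "Sample Product Price: $29.99" (whitespace normalized)

print(BasicPreprocessor(config={"keep_tags": True}).preprocess(html, is_url=False)[:80])
# -> cleaned HTML with tags preserved
```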