Commit · 4ed1b4f
1 Parent(s): d69398b
moving our work
Files changed:
- .gitignore +3 -0
- .gradio/certificate.pem +31 -0
- .gradio/flagged/dataset1.csv +4 -0
- README.md +9 -7
- app.py +284 -0
- requirements.txt +13 -0
- web2json/__pycache__/ai_extractor.cpython-311.pyc +0 -0
- web2json/__pycache__/pipeline.cpython-311.pyc +0 -0
- web2json/__pycache__/postprocessor.cpython-311.pyc +0 -0
- web2json/__pycache__/preprocessor.cpython-311.pyc +0 -0
- web2json/ai_extractor.py +283 -0
- web2json/pipeline.py +43 -0
- web2json/postprocessor.py +27 -0
- web2json/preprocessor.py +145 -0

.gitignore
ADDED
@@ -0,0 +1,3 @@
+.env
+test.ipynb
+venv

.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

.gradio/flagged/dataset1.csv
ADDED
@@ -0,0 +1,4 @@
+Content (URL or Raw Text),Content is URL?,Schema Definition,Output JSON,timestamp
+https://www.amazon.sa/-/en/soundcore-Wireless-Bluetooth-Water-Resistant-Customization/dp/B0BTYDLTM3/?_encoding=UTF8&pd_rd_w=CThtI&content-id=amzn1.sym.2d38d13c-20c8-4b9c-a23b-8365256393a2%3Aamzn1.symc.fc11ad14-99c1-406b-aa77-051d0ba1aade&pf_rd_p=2d38d13c-20c8-4b9c-a23b-8365256393a2&pf_rd_r=BQM6QA6QQAM8KDJD4AE1&pd_rd_wg=6dgls&pd_rd_r=46bcc0bb-b6ee-4da5-aab6-17aad8fced65&ref_=pd_hp_d_atf_ci_mcx_mr_ca_hp_atf_d&th=1,true,"{""type"": ""object"", ""properties"": {""title"": {""type"": ""string"", ""description"": ""Name of the product""}, ""price"": {""type"": ""number"", ""description"": ""Price of the product""}, ""description"": {""type"": ""string"", ""description"": ""Detailed description of the product""}, ""availability"": {""type"": ""boolean"", ""description"": ""Whether the product is in stock (true) or not (false)""}}, ""required"": [""title"", ""price""]}",,2025-06-10 23:40:15.418526
+https://www.amazon.sa/-/en/PEJE-Smartwatch-Waterproof-Compatible-Android-%EF%BC%88Silver%EF%BC%89/dp/B0DHFLHFN5/260-6171633-6363068?pd_rd_w=WHovW&content-id=amzn1.sym.e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_p=e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_r=X8PYMCPKW44AGJ98BSBW&pd_rd_wg=b4VZi&pd_rd_r=f10a05cd-dd96-470c-b8ab-af7adf3471f0&pd_rd_i=B0DHFLHFN5&th=1,true,"{""type"": ""object"", ""properties"": {""title"": {""type"": ""string"", ""description"": ""Name of the product""}, ""price"": {""type"": ""number"", ""description"": ""Price of the product""}, ""description"": {""type"": ""string"", ""description"": ""Detailed description of the product""}, ""availability"": {""type"": ""boolean"", ""description"": ""Whether the product is in stock (true) or not (false)""}}, ""required"": [""title"", ""price""]}","{""title"": ""Product description"", ""price"": null, ""description"": ""Product details\n\nProducts related to this item""}",2025-06-10 23:44:58.218043
+https://www.amazon.sa/-/en/PEJE-Smartwatch-Waterproof-Compatible-Android-%EF%BC%88Silver%EF%BC%89/dp/B0DHFLHFN5/260-6171633-6363068?pd_rd_w=WHovW&content-id=amzn1.sym.e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_p=e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_r=X8PYMCPKW44AGJ98BSBW&pd_rd_wg=b4VZi&pd_rd_r=f10a05cd-dd96-470c-b8ab-af7adf3471f0&pd_rd_i=B0DHFLHFN5&th=1,true,"{""type"": ""object"", ""properties"": {""title"": {""type"": ""string"", ""description"": ""Name of the product""}, ""price"": {""type"": ""number"", ""description"": ""Price of the product""}, ""description"": {""type"": ""string"", ""description"": ""Detailed description of the product""}, ""availability"": {""type"": ""boolean"", ""description"": ""Whether the product is in stock (true) or not (false)""}}, ""required"": [""title"", ""price""]}",,2025-06-11 00:19:58.927755

README.md
CHANGED
@@ -1,13 +1,15 @@
 ---
-title: MCP
-emoji:
-colorFrom:
-colorTo:
+title: MCP Server Web2JSON
+emoji: 🖇️
+colorFrom: blue
+colorTo: green
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.33.0
 app_file: app.py
-pinned:
-
+pinned: True
+tags: [mcp-server-track]
 ---
 
+[Video overview of the agent demo](https://youtu.be/wd0kjOVoGn8)
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py
ADDED
@@ -0,0 +1,284 @@
+import json
+import pandas as pd
+import gradio as gr
+from typing import Dict, Any, Type
+from web2json.preprocessor import BasicPreprocessor
+from web2json.ai_extractor import AIExtractor, RAGExtractor, GeminiLLMClient
+from web2json.postprocessor import PostProcessor
+from web2json.pipeline import Pipeline
+from pydantic import BaseModel, Field, create_model
+import os
+import dotenv
+
+dotenv.load_dotenv()
+
+def parse_schema_input(schema_input: str) -> Type[BaseModel]:
+    """
+    Convert user schema input to a Pydantic BaseModel.
+    Supports multiple input formats:
+    1. JSON schema format
+    2. Python class definition
+    3. Simple field definitions
+    """
+    schema_input = schema_input.strip()
+
+    if not schema_input:
+        # Default schema if none provided
+        return create_model('DefaultSchema',
+                            title=(str, Field(description="Title of the content")),
+                            content=(str, Field(description="Main content")))
+
+    try:
+        # Try parsing as JSON schema
+        if schema_input.startswith('{'):
+            schema_dict = json.loads(schema_input)
+            return json_schema_to_basemodel(schema_dict)
+
+        # Try parsing as Python class definition
+        elif 'class ' in schema_input and 'BaseModel' in schema_input:
+            return python_class_to_basemodel(schema_input)
+
+        # Try parsing as simple field definitions
+        else:
+            return simple_fields_to_basemodel(schema_input)
+
+    except Exception as e:
+        raise ValueError(f"Could not parse schema: {str(e)}. Please check your schema format.")
+
+def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
+    """Convert JSON schema to BaseModel"""
+    fields = {}
+    properties = schema_dict.get('properties', {})
+    required = schema_dict.get('required', [])
+
+    for field_name, field_info in properties.items():
+        field_type = get_python_type(field_info.get('type', 'string'))
+        field_description = field_info.get('description', '')
+
+        if field_name in required:
+            fields[field_name] = (field_type, Field(description=field_description))
+        else:
+            fields[field_name] = (field_type, Field(default=None, description=field_description))
+
+    return create_model('DynamicSchema', **fields)
+
+def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
+    """Convert Python class definition to BaseModel"""
+    try:
+        # Execute the class definition in a safe namespace
+        namespace = {'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
+                     'float': float, 'bool': bool, 'list': list, 'dict': dict}
+        exec(class_definition, namespace)
+
+        # Find the class that inherits from BaseModel
+        for name, obj in namespace.items():
+            if (isinstance(obj, type) and
+                issubclass(obj, BaseModel) and
+                obj != BaseModel):
+                return obj
+
+        raise ValueError("No BaseModel class found in definition")
+    except Exception as e:
+        raise ValueError(f"Invalid Python class definition: {str(e)}")
+
+def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
+    """Convert simple field definitions to BaseModel"""
+    fields = {}
+
+    for line in fields_text.strip().split('\n'):
+        line = line.strip()
+        if not line or line.startswith('#'):
+            continue
+
+        # Parse field definition (e.g., "name: str = description")
+        if ':' in line:
+            parts = line.split(':', 1)
+            field_name = parts[0].strip()
+
+            type_and_desc = parts[1].strip()
+            if '=' in type_and_desc:
+                type_part, desc_part = type_and_desc.split('=', 1)
+                field_type = get_python_type(type_part.strip())
+                description = desc_part.strip().strip('"\'')
+            else:
+                field_type = get_python_type(type_and_desc.strip())
+                description = ""
+
+            fields[field_name] = (field_type, Field(description=description))
+        else:
+            # Simple field name only
+            field_name = line.strip()
+            fields[field_name] = (str, Field(description=""))
+
+    if not fields:
+        raise ValueError("No valid fields found in schema definition")
+
+    return create_model('DynamicSchema', **fields)
+
+def get_python_type(type_str: str):
+    """Convert type string to Python type"""
+    type_str = type_str.lower().strip()
+    type_mapping = {
+        'string': str, 'str': str,
+        'integer': int, 'int': int,
+        'number': float, 'float': float,
+        'boolean': bool, 'bool': bool,
+        'array': list, 'list': list,
+        'object': dict, 'dict': dict
+    }
+    return type_mapping.get(type_str, str)
+
+def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
+    """Wrapper function that converts schema input to BaseModel"""
+    try:
+        # Parse the schema input into a BaseModel
+        schema_model = parse_schema_input(schema_input)
+
+        # Call the original function
+        return webpage_to_json(content, is_url, schema_model)
+
+    except Exception as e:
+        return {"error": f"Schema parsing error: {str(e)}"}
+
+def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
+    """
+    Extracts structured JSON information from a given content based on a specified schema.
+    This function sets up a processing pipeline that includes:
+    - Preprocessing the input content.
+    - Utilizing an AI language model to extract information according to the provided schema.
+    - Postprocessing the extracted output to match the exact schema requirements.
+    Parameters:
+        content (str): The input content to be analyzed. This can be direct text or a URL content.
+        is_url (bool): A flag indicating whether the provided content is a URL (True) or raw text (False).
+        schema (BaseModel): A Pydantic BaseModel defining the expected structure and data types for the output.
+    Returns:
+        Dict[str, Any]: A dictionary containing the extracted data matching the schema. In case of errors during initialization
+                        or processing, the dictionary will include an "error" key with a descriptive message.
+    """
+    prompt_template = """Extract the following information from the provided content according to the specified schema.
+
+    Content to analyze:
+    {content}
+
+    Schema requirements:
+    {schema}
+
+    Instructions:
+    - Extract only information that is explicitly present in the content
+    - Follow the exact structure and data types specified in the schema
+    - If a required field cannot be found, indicate this clearly
+    - Preserve the original formatting and context where relevant
+    - Return the extracted data in the format specified by the schema"""
+
+    # Initialize pipeline components
+    # TODO: improve the RAG system and optimize (don't instantiate every time)
+    preprocessor = BasicPreprocessor(config={'keep_tags': False})
+    try:
+        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+    except Exception as e:
+        return {"error": f"Failed to initialize LLM client: {str(e)}"}
+
+    # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
+    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
+    postprocessor = PostProcessor()
+    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
+
+    try:
+        result = pipeline.run(content, is_url, schema)
+        print("-"*80)
+        print(f"Processed result: {result}")
+        return result
+    except Exception as e:
+        return {"error": f"Processing error: {str(e)}"}
+
+# Example schemas for the user
+example_schemas = """
+**Example Schema Formats:**
+
+1. **Simple field definitions:**
+```
+title: str = Page title
+price: float = Product price
+description: str = Product description
+available: bool = Is available
+```
+
+2. **JSON Schema:**
+```json
+{
+  "properties": {
+    "title": {"type": "string", "description": "Page title"},
+    "price": {"type": "number", "description": "Product price"},
+    "description": {"type": "string", "description": "Product description"}
+  },
+  "required": ["title"]
+}
+```
+
+3. **Python Class Definition:**
+```python
+class ProductSchema(BaseModel):
+    title: str = Field(description="Product title")
+    price: float = Field(description="Product price")
+    description: str = Field(description="Product description")
+    available: bool = Field(default=False, description="Availability status")
+```
+"""
+
+# Build Gradio Interface
+demo = gr.Interface(
+    fn=webpage_to_json_wrapper,
+    inputs=[
+        gr.Textbox(
+            label="Content (URL or Raw Text)",
+            lines=10,
+            placeholder="Enter URL or paste raw HTML/text here."
+        ),
+        gr.Checkbox(label="Content is URL?", value=False),
+        gr.Textbox(
+            label="Schema Definition",
+            lines=15,
+            placeholder="Define your extraction schema (see examples below)",
+            info=example_schemas
+        )
+    ],
+    outputs=gr.JSON(label="Output JSON"),
+    title="Webpage to JSON Converter",
+    description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
+    examples=[
+        [
+            "https://example.com",
+            True,
+            "title: str = Page title\nprice: float = Product price\ndescription: str = Description"
+        ],
+        [
+            "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
+            False,
+            '''{
+  "type": "object",
+  "properties": {
+    "title": {
+      "type": "string",
+      "description": "Name of the product"
+    },
+    "price": {
+      "type": "number",
+      "description": "Price of the product"
+    },
+    "description": {
+      "type": "string",
+      "description": "Detailed description of the product"
+    },
+    "availability": {
+      "type": "boolean",
+      "description": "Whether the product is in stock (true) or not (false)"
+    }
+  },
+  "required": ["title", "price"]
+}'''
+        ]
+    ]
+)
+
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)

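For reference, a minimal sketch (not part of this commit) of how the three schema formats described in `parse_schema_input` above resolve to equivalent Pydantic models; it assumes `app.py` is importable from the working directory and note that importing it also builds (but does not launch) the Gradio interface:

```python
# Hypothetical usage sketch of the schema helpers in app.py.
from app import parse_schema_input

# 1. Simple field definitions
simple = parse_schema_input("title: str = Page title\nprice: float = Product price")

# 2. JSON schema
json_schema = parse_schema_input(
    '{"properties": {"title": {"type": "string"}, "price": {"type": "number"}}, "required": ["title"]}'
)

# 3. Python class definition (executed in a restricted namespace by python_class_to_basemodel)
py_class = parse_schema_input(
    "class Product(BaseModel):\n"
    "    title: str = Field(description='Page title')\n"
    "    price: float = Field(description='Product price')\n"
)

# All three are Pydantic BaseModel subclasses with comparable JSON schemas.
for model in (simple, json_schema, py_class):
    print(model.model_json_schema())
```
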
requirements.txt
ADDED
@@ -0,0 +1,13 @@
+pandas
+gradio
+gradio[mcp]
+pydantic
+python-dotenv
+beautifulsoup4
+requests
+google-genai
+json_repair
+numpy
+langchain
+langchain-text-splitters
+sentence-transformers

web2json/__pycache__/ai_extractor.cpython-311.pyc
ADDED
Binary file (16.2 kB).

web2json/__pycache__/pipeline.cpython-311.pyc
ADDED
Binary file (2.49 kB).

web2json/__pycache__/postprocessor.cpython-311.pyc
ADDED
Binary file (1.65 kB).

web2json/__pycache__/preprocessor.cpython-311.pyc
ADDED
Binary file (5.93 kB).

web2json/ai_extractor.py
ADDED
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from abc import ABC, abstractmethod
|
3 |
+
from google import genai
|
4 |
+
from google.genai import types
|
5 |
+
from pydantic import BaseModel
|
6 |
+
import numpy as np
|
7 |
+
from typing import List, Any, Dict, Tuple
|
8 |
+
import time
|
9 |
+
from langchain_text_splitters import HTMLHeaderTextSplitter
|
10 |
+
from sentence_transformers import SentenceTransformer
|
11 |
+
|
12 |
+
class LLMClient(ABC):
|
13 |
+
"""
|
14 |
+
Abstract base class for calling LLM APIs.
|
15 |
+
"""
|
16 |
+
def __init__(self, config: dict = None):
|
17 |
+
"""
|
18 |
+
Initializes the LLMClient with a configuration dictionary.
|
19 |
+
|
20 |
+
Args:
|
21 |
+
config (dict): Configuration settings for the LLM client.
|
22 |
+
"""
|
23 |
+
self.config = config or {}
|
24 |
+
|
25 |
+
@abstractmethod
|
26 |
+
def call_api(self, prompt: str) -> str:
|
27 |
+
"""
|
28 |
+
Call the underlying LLM API with the given prompt.
|
29 |
+
|
30 |
+
Args:
|
31 |
+
prompt (str): The prompt or input text for the LLM.
|
32 |
+
|
33 |
+
Returns:
|
34 |
+
str: The response from the LLM.
|
35 |
+
"""
|
36 |
+
pass
|
37 |
+
|
38 |
+
|
39 |
+
class GeminiLLMClient(LLMClient):
|
40 |
+
"""
|
41 |
+
Concrete implementation of LLMClient for the Gemini API.
|
42 |
+
"""
|
43 |
+
|
44 |
+
def __init__(self, config: dict):
|
45 |
+
"""
|
46 |
+
Initializes the GeminiLLMClient with an API key, model name, and optional generation settings.
|
47 |
+
|
48 |
+
Args:
|
49 |
+
config (dict): Configuration containing:
|
50 |
+
- 'api_key': (optional) API key for Gemini (falls back to GEMINI_API_KEY env var)
|
51 |
+
- 'model_name': (optional) the model to use (default 'gemini-2.0-flash')
|
52 |
+
- 'generation_config': (optional) dict of GenerateContentConfig parameters
|
53 |
+
"""
|
54 |
+
api_key = config.get("api_key") or os.environ.get("GEMINI_API_KEY")
|
55 |
+
if not api_key:
|
56 |
+
raise ValueError(
|
57 |
+
"API key for Gemini must be provided in config['api_key'] or GEMINI_API_KEY env var."
|
58 |
+
)
|
59 |
+
self.client = genai.Client(api_key=api_key)
|
60 |
+
self.model_name = config.get("model_name", "gemini-2.0-flash")
|
61 |
+
# allow custom generation settings, fallback to sensible defaults
|
62 |
+
gen_conf = config.get("generation_config", {})
|
63 |
+
self.generate_config = types.GenerateContentConfig(
|
64 |
+
response_mime_type=gen_conf.get("response_mime_type", "text/plain"),
|
65 |
+
temperature=gen_conf.get("temperature"),
|
66 |
+
max_output_tokens=gen_conf.get("max_output_tokens"),
|
67 |
+
top_p=gen_conf.get("top_p"),
|
68 |
+
top_k=gen_conf.get("top_k"),
|
69 |
+
# add any other fields you want to expose
|
70 |
+
)
|
71 |
+
|
72 |
+
def call_api(self, prompt: str) -> str:
|
73 |
+
"""
|
74 |
+
Call the Gemini API with the given prompt (non-streaming).
|
75 |
+
|
76 |
+
Args:
|
77 |
+
prompt (str): The input text for the API.
|
78 |
+
|
79 |
+
Returns:
|
80 |
+
str: The generated text from the Gemini API.
|
81 |
+
"""
|
82 |
+
contents = [
|
83 |
+
types.Content(
|
84 |
+
role="user",
|
85 |
+
parts=[types.Part.from_text(text=prompt)],
|
86 |
+
)
|
87 |
+
]
|
88 |
+
|
89 |
+
# Non-streaming call returns a full response object
|
90 |
+
response = self.client.models.generate_content(
|
91 |
+
model=self.model_name,
|
92 |
+
contents=contents,
|
93 |
+
config=self.generate_config,
|
94 |
+
)
|
95 |
+
|
96 |
+
# Combine all output parts into a single string
|
97 |
+
return response.text
|
98 |
+
|
99 |
+
|
100 |
+
|
101 |
+
class AIExtractor:
|
102 |
+
def __init__(self, llm_client: LLMClient, prompt_template: str):
|
103 |
+
"""
|
104 |
+
Initializes the AIExtractor with a specific LLM client and configuration.
|
105 |
+
|
106 |
+
Args:
|
107 |
+
llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
|
108 |
+
prompt_template (str): The template to use for generating prompts for the LLM.
|
109 |
+
should contain placeholders for dynamic content.
|
110 |
+
e.g., "Extract the following information: {content} based on schema: {schema}"
|
111 |
+
"""
|
112 |
+
self.llm_client = llm_client
|
113 |
+
self.prompt_template = prompt_template
|
114 |
+
|
115 |
+
def extract(self, content: str, schema: BaseModel) -> str:
|
116 |
+
"""
|
117 |
+
Extracts structured information from the given content based on the provided schema.
|
118 |
+
|
119 |
+
Args:
|
120 |
+
content (str): The raw content to extract information from.
|
121 |
+
schema (BaseModel): A Pydantic model defining the structure of the expected output.
|
122 |
+
|
123 |
+
Returns:
|
124 |
+
str: The structured JSON object as a string.
|
125 |
+
"""
|
126 |
+
prompt = self.prompt_template.format(content=content, schema=schema.model_json_schema())
|
127 |
+
# print(f"Generated prompt: {prompt}")
|
128 |
+
response = self.llm_client.call_api(prompt)
|
129 |
+
return response
|
130 |
+
|
131 |
+
# TODO: RAGExtractor class
|
132 |
+
class RAGExtractor(AIExtractor):
|
133 |
+
"""
|
134 |
+
RAG-enhanced extractor that uses similarity search to find relevant chunks
|
135 |
+
before performing extraction, utilizing HTML header-based chunking and SentenceTransformer embeddings.
|
136 |
+
"""
|
137 |
+
|
138 |
+
def __init__(self,
|
139 |
+
llm_client: LLMClient,
|
140 |
+
prompt_template: str,
|
141 |
+
embedding_model_path: str = "sentence-transformers/all-mpnet-base-v2",
|
142 |
+
top_k: int = 3):
|
143 |
+
"""
|
144 |
+
Initialize RAG extractor with embedding and chunking capabilities.
|
145 |
+
|
146 |
+
Args:
|
147 |
+
llm_client: LLM client for generation.
|
148 |
+
prompt_template: Template for prompts.
|
149 |
+
embedding_model_path: Path/name for the SentenceTransformer embedding model.
|
150 |
+
top_k: Number of top similar chunks to retrieve.
|
151 |
+
"""
|
152 |
+
super().__init__(llm_client, prompt_template)
|
153 |
+
self.embedding_model_path = embedding_model_path
|
154 |
+
# Initialize the SentenceTransformer model for embeddings
|
155 |
+
self.embedding_model_instance = SentenceTransformer(self.embedding_model_path)
|
156 |
+
self.top_k = top_k
|
157 |
+
|
158 |
+
@staticmethod
|
159 |
+
def _langchain_HHTS(text: str) -> List[str]:
|
160 |
+
"""
|
161 |
+
Chunks HTML text using Langchain's HTMLHeaderTextSplitter based on h1 and h2 headers.
|
162 |
+
|
163 |
+
Args:
|
164 |
+
text (str): The HTML content to chunk.
|
165 |
+
|
166 |
+
Returns:
|
167 |
+
List[str]: A list of chunked text strings (extracted from Document objects' page_content).
|
168 |
+
"""
|
169 |
+
headers_to_split_on = [
|
170 |
+
("h1", "Header 1"),
|
171 |
+
("h2", "Header 2"),
|
172 |
+
# ("h3", "Header 3"), # This header was explicitly commented out in the request
|
173 |
+
]
|
174 |
+
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
|
175 |
+
return [doc.page_content for doc in html_splitter.split_text(text)]
|
176 |
+
|
177 |
+
def embed_text(self, text: str) -> np.ndarray:
|
178 |
+
"""
|
179 |
+
Generate embeddings for text using the initialized SentenceTransformer model.
|
180 |
+
|
181 |
+
Args:
|
182 |
+
text: The text string to embed.
|
183 |
+
|
184 |
+
Returns:
|
185 |
+
np.ndarray: The embedding vector for the input text as a NumPy array.
|
186 |
+
"""
|
187 |
+
try:
|
188 |
+
return self.embedding_model_instance.encode(text)
|
189 |
+
except Exception as e:
|
190 |
+
print(f"Warning: Embedding failed for text: '{text[:50]}...', using random embedding: {e}")
|
191 |
+
|
192 |
+
return None
|
193 |
+
|
194 |
+
def search_similar_chunks(self,
|
195 |
+
query: str,
|
196 |
+
chunks: List[str],
|
197 |
+
embeddings: np.ndarray) -> List[str]:
|
198 |
+
"""
|
199 |
+
Find the most similar chunks to the query within the given list of chunks
|
200 |
+
by calculating cosine similarity between their embeddings.
|
201 |
+
|
202 |
+
Args:
|
203 |
+
query (str): The query text whose embedding will be used for similarity comparison.
|
204 |
+
chunks (List[str]): A list of text chunks to search within.
|
205 |
+
embeddings (np.ndarray): Precomputed embeddings for the chunks, corresponding to the 'chunks' list.
|
206 |
+
|
207 |
+
Returns:
|
208 |
+
List[str]: A list of the 'top_k' most similar chunks to the query.
|
209 |
+
"""
|
210 |
+
query_embedding = self.embed_text(query)
|
211 |
+
|
212 |
+
similarities = []
|
213 |
+
|
214 |
+
if query_embedding.ndim > 1:
|
215 |
+
query_embedding = query_embedding.flatten()
|
216 |
+
|
217 |
+
for i, chunk_embedding in enumerate(embeddings):
|
218 |
+
if chunk_embedding.ndim > 1:
|
219 |
+
chunk_embedding = chunk_embedding.flatten()
|
220 |
+
|
221 |
+
norm_query = np.linalg.norm(query_embedding)
|
222 |
+
norm_chunk = np.linalg.norm(chunk_embedding)
|
223 |
+
|
224 |
+
if norm_query == 0 or norm_chunk == 0:
|
225 |
+
similarity = 0.0
|
226 |
+
else:
|
227 |
+
similarity = np.dot(query_embedding, chunk_embedding) / (norm_query * norm_chunk)
|
228 |
+
similarities.append((similarity, i))
|
229 |
+
|
230 |
+
similarities.sort(key=lambda x: x[0], reverse=True)
|
231 |
+
top_indices = [idx for _, idx in similarities[:self.top_k]]
|
232 |
+
|
233 |
+
return [chunks[i] for i in top_indices]
|
234 |
+
|
235 |
+
def extract(self, content: str, schema: BaseModel, query: str = None) -> str:
|
236 |
+
"""
|
237 |
+
Overrides the base AIExtractor's method to implement RAG-enhanced extraction.
|
238 |
+
This function first chunks the input HTML content, then uses a query to find
|
239 |
+
the most relevant chunks via embedding similarity, and finally sends these
|
240 |
+
relevant chunks as context to the LLM for structured information extraction.
|
241 |
+
|
242 |
+
Args:
|
243 |
+
content (str): The raw HTML content from which to extract information.
|
244 |
+
schema (BaseModel): A Pydantic model defining the desired output structure for the LLM.
|
245 |
+
query (str, optional): An optional query string to guide the retrieval of relevant chunks.
|
246 |
+
If not provided, a default query based on the schema will be used.
|
247 |
+
|
248 |
+
Returns:
|
249 |
+
str: The structured JSON object as a string, as generated by the LLM.
|
250 |
+
"""
|
251 |
+
start_time = time.time()
|
252 |
+
|
253 |
+
if not query:
|
254 |
+
query = f"Extract information based on the following JSON schema: {schema.model_json_schema()}"
|
255 |
+
print(f"No explicit query provided for retrieval. Using default: '{query[:100]}...'")
|
256 |
+
|
257 |
+
chunks = self._langchain_HHTS(content)
|
258 |
+
print(f"Content successfully chunked into {len(chunks)} pieces.")
|
259 |
+
|
260 |
+
combined_content_for_llm = ""
|
261 |
+
if not chunks:
|
262 |
+
print("Warning: No chunks were generated from the provided content. The entire original content will be sent to the LLM.")
|
263 |
+
combined_content_for_llm = content
|
264 |
+
else:
|
265 |
+
chunk_embeddings = np.array([self.embed_text(chunk) for chunk in chunks])
|
266 |
+
print(f"Generated embeddings for {len(chunks)} chunks.")
|
267 |
+
|
268 |
+
similar_chunks = self.search_similar_chunks(query, chunks, chunk_embeddings)
|
269 |
+
print(f"Retrieved {len(similar_chunks)} similar chunks based on the query.")
|
270 |
+
|
271 |
+
combined_content_for_llm = "\n\n".join(similar_chunks)
|
272 |
+
print(f"Combined content for LLM (truncated): '{combined_content_for_llm[:200]}...'")
|
273 |
+
|
274 |
+
prompt = self.prompt_template.format(content=combined_content_for_llm, schema=schema.model_json_schema())
|
275 |
+
print(f"Sending prompt to LLM (truncated): '{prompt[:500]}...'")
|
276 |
+
llm_response = self.llm_client.call_api(prompt)
|
277 |
+
|
278 |
+
execution_time = (time.time() - start_time) * 1000
|
279 |
+
print(f"Extraction process completed in {execution_time:.2f} milliseconds.")
|
280 |
+
print(f"LLM's final response: {llm_response}")
|
281 |
+
print("=" * 78)
|
282 |
+
|
283 |
+
return llm_response
|
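For reference, a minimal sketch (not part of this commit) of how the extractor classes above would be wired up directly; it assumes `GEMINI_API_KEY` is set in the environment and that the hypothetical product HTML is enough for the model to work with:

```python
# Hypothetical usage sketch of GeminiLLMClient + AIExtractor.
from pydantic import BaseModel, Field
from web2json.ai_extractor import GeminiLLMClient, AIExtractor

class Product(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")

# Empty config falls back to the GEMINI_API_KEY environment variable.
llm = GeminiLLMClient(config={})
extractor = AIExtractor(
    llm_client=llm,
    prompt_template="Extract the following information: {content} based on schema: {schema}",
)

raw_json_text = extractor.extract("<h1>Anker Soundcore</h1><p>Price: 129 SAR</p>", Product)
print(raw_json_text)  # still a string; PostProcessor turns it into a dict
```
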
web2json/pipeline.py
ADDED
@@ -0,0 +1,43 @@
+from web2json.ai_extractor import *
+from web2json.postprocessor import *
+from web2json.preprocessor import *
+from pydantic import BaseModel
+
+class Pipeline:
+    # constructor
+    def __init__(self,
+                 preprocessor: Preprocessor,
+                 ai_extractor: AIExtractor,
+                 postprocessor: PostProcessor):
+        self.preprocessor = preprocessor
+        self.ai_extractor = ai_extractor
+        self.postprocessor = postprocessor
+
+    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
+        """
+        Run the entire pipeline: preprocess, extract, and postprocess.
+
+        Args:
+            content (str): The raw content to process.
+            is_url (bool): Whether the content is a URL or raw text.
+            schema (BaseModel): The schema defining the structure of the expected output.
+
+        Returns:
+            dict: The final structured data after processing.
+        """
+        # Step 1: Preprocess the content
+        preprocessed_content = self.preprocessor.preprocess(content, is_url)
+        print(f"Preprocessed content: {preprocessed_content[:100]}...")
+        print('+'*80)
+        # Step 2: Extract structured information using AI
+        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
+        print(f"Extracted data: {extracted_data[:100]}...")
+        print('+'*80)
+        # Step 3: Post-process the extracted data
+        final_output = self.postprocessor.process(extracted_data)
+        print(f"Final output: {final_output}")
+        print('+'*80)
+
+        return final_output
+
+

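For reference, a minimal sketch (not part of this commit) of the same wiring `app.py` performs, outside Gradio; it assumes `GEMINI_API_KEY` is set and uses a hypothetical `Article` schema:

```python
# Hypothetical end-to-end run of the Pipeline class above.
from typing import Optional
from pydantic import BaseModel, Field
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import GeminiLLMClient, AIExtractor
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline

class Article(BaseModel):
    title: str = Field(description="Article title")
    author: Optional[str] = Field(default=None, description="Author, if present")

pipeline = Pipeline(
    preprocessor=BasicPreprocessor(config={'keep_tags': False}),
    ai_extractor=AIExtractor(
        llm_client=GeminiLLMClient(config={}),
        prompt_template="Extract {schema} from:\n{content}",
    ),
    postprocessor=PostProcessor(),
)

result = pipeline.run("https://example.com", is_url=True, schema=Article)
print(result)  # dict shaped by the Article schema (or {} if JSON parsing failed)
```
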
web2json/postprocessor.py
ADDED
@@ -0,0 +1,27 @@
+from json_repair import repair_json
+import json
+
+class PostProcessor:
+
+    def process(self, response: str) -> dict:
+        json_response = {}
+        try:
+            # Extract the JSON from the generated text. Handle variations in output format.
+            json_string = response
+            if "```json" in response:
+                json_string = response.split("```json")[1].split("```")[0]
+            elif "{" in response and "}" in response:
+                # try to grab the json
+                start_index = response.find("{")
+                end_index = response.rfind("}") + 1
+                json_string = response[start_index:end_index]
+
+            json_response = json.loads(repair_json(json_string))  # Added for robustness
+        except Exception as e:
+            print(f"Error parsing JSON: {e}")
+            print(f"Generated text: {response}")
+            json_response = {}
+
+
+        return json_response
+

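For reference, a quick illustration (not part of this commit) of the parsing fallbacks above, using hypothetical LLM outputs:

```python
# Hypothetical inputs showing the fenced-block and brace-extraction paths plus json_repair.
from web2json.postprocessor import PostProcessor

pp = PostProcessor()

# Fenced ```json block: the block body is extracted before parsing.
print(pp.process('```json\n{"title": "Anker Soundcore", "price": 129}\n```'))

# Chatty output with a trailing comma: braces are sliced out and json_repair fixes the comma.
print(pp.process('Sure! Here is the data: {"title": "PEJE Smartwatch", "price": 95,}'))
```
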
web2json/preprocessor.py
ADDED
@@ -0,0 +1,145 @@
+import re
+import requests
+from bs4 import BeautifulSoup , Comment
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+
+
+class Preprocessor(ABC):
+    """
+    Abstract base class for preprocessors.
+    Defines the interface for transforming raw inputs into structured data.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
+        """
+        Initialize the preprocessor with optional configuration.
+
+        Args:
+            config: A dictionary of configuration settings.
+                - keep_tags (bool): If True, keeps HTML tags in the output; otherwise, cleans them.
+        """
+        self.config = config if config is not None else {'keep_tags': False}
+
+    def _fetch_content(self, url: str) -> str:
+        """
+        Fetches and parses the text content from a URL.
+
+        Args:
+            url: The URL to fetch content from.
+
+        Returns:
+            The clean, extracted text content from the page.
+
+        Raises:
+            ValueError: If the URL cannot be fetched or processed.
+        """
+        try:
+            # Set a User-Agent header to mimic a browser, which can help avoid
+            # being blocked by some websites.
+            # Inside _fetch_content method
+            headers = headers = {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.6",
+                "Cache-Control": "max-age=0",
+                "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
+                "Sec-Ch-Ua-Mobile": "?0",
+                "Sec-Ch-Ua-Platform": "\"Windows\"",
+                "Sec-Fetch-Dest": "document",
+                "Sec-Fetch-Mode": "navigate",
+                "Sec-Fetch-Site": "none",
+                "Sec-Fetch-User": "?1",
+                "Upgrade-Insecure-Requests": "1",
+            }
+
+            # Make the HTTP GET request with a timeout.
+            response = requests.get(url, headers=headers, timeout=15)
+
+
+            return response.text
+
+        except requests.exceptions.RequestException as e:
+            # Catch any network-related errors (DNS, connection, timeout, etc.)
+            # and re-raise them as a more user-friendly ValueError.
+            raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}")
+
+
+    @abstractmethod
+    def preprocess(self, content: str, is_url: bool) -> str:
+        """
+        Take raw content (HTML, text, etc.) and apply preprocessing steps.
+
+        Args:
+            content: The raw data to preprocess.
+
+        Returns:
+            A dictionary containing structured, cleaned data ready for downstream tasks.
+        """
+        pass
+
+class BasicPreprocessor(Preprocessor):
+    """
+    Base preprocessor with common functionality.
+    Can be extended for specific preprocessing tasks.
+    """
+    # TODO: Might need to think of how to improve this later
+    def _clean_html(self, html_content: str) -> str:
+        """
+        Cleans up the given HTML content by:
+        - Removing <script> and <style> tags and their content.
+        - Removing HTML comments.
+        - Extracting and returning the visible text with normalized whitespace if keep_tags is False.
+
+        Args:
+            html_content (str): The HTML content to clean.
+
+        Returns:
+            str: The cleaned, visible text from the HTML.
+        """
+        # Parse the HTML content
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Remove script and style elements
+        for tag in soup(["script", "style"]):
+            tag.decompose()
+
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        # Extract text and normalize whitespace
+        if self.config.get('keep_tags', False):
+            # If keep_tags is True, return the raw HTML
+            return str(soup)
+
+        text = soup.get_text(separator=" ", strip=True)
+        clean_text = re.sub(r'\s+', ' ', text)
+
+        return clean_text
+
+    def preprocess(self, content: str, is_url: bool) -> str:
+        """
+        Take raw content (HTML, text, etc.) and apply preprocessing steps.
+
+        Args:
+            content: The raw data to preprocess.
+
+        Returns:
+            A dictionary containing structured, cleaned data ready for downstream tasks.
+        """
+
+        html_content = content
+        if is_url:
+            # Fetch content from the URL
+            html_content = self._fetch_content(content)
+
+
+        # Clean the HTML content
+        cleaned_content = self._clean_html(html_content)
+
+        return cleaned_content.strip()  # Return the cleaned text content, stripped of leading/trailing whitespace
+
+
+
+
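For reference, a small check (not part of this commit) of the HTML cleaning path above; no network access is involved when `is_url=False`, and the inline HTML here is a hypothetical example:

```python
# Hypothetical local test of BasicPreprocessor._clean_html via preprocess().
from web2json.preprocessor import BasicPreprocessor

html = """
<html><head><style>p {color: red}</style></head>
<body><!-- promo --><h1>Sample Product</h1><script>track()</script><p>Price: $29.99</p></body></html>
"""

pre = BasicPreprocessor(config={'keep_tags': False})
print(pre.preprocess(html, is_url=False))
# -> "Sample Product Price: $29.99" (scripts, styles, and comments removed, whitespace normalized)

kept = BasicPreprocessor(config={'keep_tags': True}).preprocess(html, is_url=False)
print(kept[:80])  # keeps the tags but still strips scripts, styles, and comments
```
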