abdo-Mansour committed
Commit 4ed1b4f · 1 Parent(s): d69398b

moving our work

.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ .env
2
+ test.ipynb
3
+ venv
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,4 @@
1
+ Content (URL or Raw Text),Content is URL?,Schema Definition,Output JSON,timestamp
2
+ https://www.amazon.sa/-/en/soundcore-Wireless-Bluetooth-Water-Resistant-Customization/dp/B0BTYDLTM3/?_encoding=UTF8&pd_rd_w=CThtI&content-id=amzn1.sym.2d38d13c-20c8-4b9c-a23b-8365256393a2%3Aamzn1.symc.fc11ad14-99c1-406b-aa77-051d0ba1aade&pf_rd_p=2d38d13c-20c8-4b9c-a23b-8365256393a2&pf_rd_r=BQM6QA6QQAM8KDJD4AE1&pd_rd_wg=6dgls&pd_rd_r=46bcc0bb-b6ee-4da5-aab6-17aad8fced65&ref_=pd_hp_d_atf_ci_mcx_mr_ca_hp_atf_d&th=1,true,"{""type"": ""object"", ""properties"": {""title"": {""type"": ""string"", ""description"": ""Name of the product""}, ""price"": {""type"": ""number"", ""description"": ""Price of the product""}, ""description"": {""type"": ""string"", ""description"": ""Detailed description of the product""}, ""availability"": {""type"": ""boolean"", ""description"": ""Whether the product is in stock (true) or not (false)""}}, ""required"": [""title"", ""price""]}",,2025-06-10 23:40:15.418526
3
+ https://www.amazon.sa/-/en/PEJE-Smartwatch-Waterproof-Compatible-Android-%EF%BC%88Silver%EF%BC%89/dp/B0DHFLHFN5/260-6171633-6363068?pd_rd_w=WHovW&content-id=amzn1.sym.e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_p=e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_r=X8PYMCPKW44AGJ98BSBW&pd_rd_wg=b4VZi&pd_rd_r=f10a05cd-dd96-470c-b8ab-af7adf3471f0&pd_rd_i=B0DHFLHFN5&th=1,true,"{""type"": ""object"", ""properties"": {""title"": {""type"": ""string"", ""description"": ""Name of the product""}, ""price"": {""type"": ""number"", ""description"": ""Price of the product""}, ""description"": {""type"": ""string"", ""description"": ""Detailed description of the product""}, ""availability"": {""type"": ""boolean"", ""description"": ""Whether the product is in stock (true) or not (false)""}}, ""required"": [""title"", ""price""]}","{""title"": ""Product description"", ""price"": null, ""description"": ""Product details\n\nProducts related to this item""}",2025-06-10 23:44:58.218043
4
+ https://www.amazon.sa/-/en/PEJE-Smartwatch-Waterproof-Compatible-Android-%EF%BC%88Silver%EF%BC%89/dp/B0DHFLHFN5/260-6171633-6363068?pd_rd_w=WHovW&content-id=amzn1.sym.e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_p=e8909ab6-a385-4cea-bf31-67a954c29d60&pf_rd_r=X8PYMCPKW44AGJ98BSBW&pd_rd_wg=b4VZi&pd_rd_r=f10a05cd-dd96-470c-b8ab-af7adf3471f0&pd_rd_i=B0DHFLHFN5&th=1,true,"{""type"": ""object"", ""properties"": {""title"": {""type"": ""string"", ""description"": ""Name of the product""}, ""price"": {""type"": ""number"", ""description"": ""Price of the product""}, ""description"": {""type"": ""string"", ""description"": ""Detailed description of the product""}, ""availability"": {""type"": ""boolean"", ""description"": ""Whether the product is in stock (true) or not (false)""}}, ""required"": [""title"", ""price""]}",,2025-06-11 00:19:58.927755
README.md CHANGED
@@ -1,13 +1,15 @@
  ---
- title: MCP WEB2JSON
- emoji: 🌖
- colorFrom: gray
- colorTo: red
+ title: MCP Server Web2JSON
+ emoji: 🖇️
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 5.34.2
+ sdk_version: 5.33.0
  app_file: app.py
- pinned: false
- license: mit
+ pinned: True
+ tags: [mcp-server-track]
  ---

+ [Video overview of the agent demo](https://youtu.be/wd0kjOVoGn8)
+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,284 @@
1
+ import json
2
+ import pandas as pd
3
+ import gradio as gr
4
+ from typing import Dict, Any, Type
5
+ from web2json.preprocessor import BasicPreprocessor
6
+ from web2json.ai_extractor import AIExtractor, RAGExtractor, GeminiLLMClient
7
+ from web2json.postprocessor import PostProcessor
8
+ from web2json.pipeline import Pipeline
9
+ from pydantic import BaseModel, Field, create_model
10
+ import os
11
+ import dotenv
12
+
13
+ dotenv.load_dotenv()
14
+
15
+ def parse_schema_input(schema_input: str) -> Type[BaseModel]:
16
+ """
17
+ Convert user schema input to a Pydantic BaseModel.
18
+ Supports multiple input formats:
19
+ 1. JSON schema format
20
+ 2. Python class definition
21
+ 3. Simple field definitions
22
+ """
23
+ schema_input = schema_input.strip()
24
+
25
+ if not schema_input:
26
+ # Default schema if none provided
27
+ return create_model('DefaultSchema',
28
+ title=(str, Field(description="Title of the content")),
29
+ content=(str, Field(description="Main content")))
30
+
31
+ try:
32
+ # Try parsing as JSON schema
33
+ if schema_input.startswith('{'):
34
+ schema_dict = json.loads(schema_input)
35
+ return json_schema_to_basemodel(schema_dict)
36
+
37
+ # Try parsing as Python class definition
38
+ elif 'class ' in schema_input and 'BaseModel' in schema_input:
39
+ return python_class_to_basemodel(schema_input)
40
+
41
+ # Try parsing as simple field definitions
42
+ else:
43
+ return simple_fields_to_basemodel(schema_input)
44
+
45
+ except Exception as e:
46
+ raise ValueError(f"Could not parse schema: {str(e)}. Please check your schema format.")
47
+
48
+ def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
49
+ """Convert JSON schema to BaseModel"""
50
+ fields = {}
51
+ properties = schema_dict.get('properties', {})
52
+ required = schema_dict.get('required', [])
53
+
54
+ for field_name, field_info in properties.items():
55
+ field_type = get_python_type(field_info.get('type', 'string'))
56
+ field_description = field_info.get('description', '')
57
+
58
+ if field_name in required:
59
+ fields[field_name] = (field_type, Field(description=field_description))
60
+ else:
61
+ fields[field_name] = (field_type, Field(default=None, description=field_description))
62
+
63
+ return create_model('DynamicSchema', **fields)
64
+
65
+ def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
66
+ """Convert Python class definition to BaseModel"""
67
+ try:
68
+ # Execute the class definition in a safe namespace
69
+ namespace = {'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
70
+ 'float': float, 'bool': bool, 'list': list, 'dict': dict}
71
+ exec(class_definition, namespace)
72
+
73
+ # Find the class that inherits from BaseModel
74
+ for name, obj in namespace.items():
75
+ if (isinstance(obj, type) and
76
+ issubclass(obj, BaseModel) and
77
+ obj != BaseModel):
78
+ return obj
79
+
80
+ raise ValueError("No BaseModel class found in definition")
81
+ except Exception as e:
82
+ raise ValueError(f"Invalid Python class definition: {str(e)}")
83
+
84
+ def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
85
+ """Convert simple field definitions to BaseModel"""
86
+ fields = {}
87
+
88
+ for line in fields_text.strip().split('\n'):
89
+ line = line.strip()
90
+ if not line or line.startswith('#'):
91
+ continue
92
+
93
+ # Parse field definition (e.g., "name: str = description")
94
+ if ':' in line:
95
+ parts = line.split(':', 1)
96
+ field_name = parts[0].strip()
97
+
98
+ type_and_desc = parts[1].strip()
99
+ if '=' in type_and_desc:
100
+ type_part, desc_part = type_and_desc.split('=', 1)
101
+ field_type = get_python_type(type_part.strip())
102
+ description = desc_part.strip().strip('"\'')
103
+ else:
104
+ field_type = get_python_type(type_and_desc.strip())
105
+ description = ""
106
+
107
+ fields[field_name] = (field_type, Field(description=description))
108
+ else:
109
+ # Simple field name only
110
+ field_name = line.strip()
111
+ fields[field_name] = (str, Field(description=""))
112
+
113
+ if not fields:
114
+ raise ValueError("No valid fields found in schema definition")
115
+
116
+ return create_model('DynamicSchema', **fields)
117
+
118
+ def get_python_type(type_str: str):
119
+ """Convert type string to Python type"""
120
+ type_str = type_str.lower().strip()
121
+ type_mapping = {
122
+ 'string': str, 'str': str,
123
+ 'integer': int, 'int': int,
124
+ 'number': float, 'float': float,
125
+ 'boolean': bool, 'bool': bool,
126
+ 'array': list, 'list': list,
127
+ 'object': dict, 'dict': dict
128
+ }
129
+ return type_mapping.get(type_str, str)
130
+
131
+ def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
132
+ """Wrapper function that converts schema input to BaseModel"""
133
+ try:
134
+ # Parse the schema input into a BaseModel
135
+ schema_model = parse_schema_input(schema_input)
136
+
137
+ # Call the original function
138
+ return webpage_to_json(content, is_url, schema_model)
139
+
140
+ except Exception as e:
141
+ return {"error": f"Schema parsing error: {str(e)}"}
142
+
143
+ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
144
+ """
145
+ Extracts structured JSON information from a given content based on a specified schema.
146
+ This function sets up a processing pipeline that includes:
147
+ - Preprocessing the input content.
148
+ - Utilizing an AI language model to extract information according to the provided schema.
149
+ - Postprocessing the extracted output to match the exact schema requirements.
150
+ Parameters:
151
+ content (str): The input content to be analyzed. This can be direct text or a URL content.
152
+ is_url (bool): A flag indicating whether the provided content is a URL (True) or raw text (False).
153
+ schema (BaseModel): A Pydantic BaseModel defining the expected structure and data types for the output.
154
+ Returns:
155
+ Dict[str, Any]: A dictionary containing the extracted data matching the schema. In case of errors during initialization
156
+ or processing, the dictionary will include an "error" key with a descriptive message.
157
+ """
158
+ prompt_template = """Extract the following information from the provided content according to the specified schema.
159
+
160
+ Content to analyze:
161
+ {content}
162
+
163
+ Schema requirements:
164
+ {schema}
165
+
166
+ Instructions:
167
+ - Extract only information that is explicitly present in the content
168
+ - Follow the exact structure and data types specified in the schema
169
+ - If a required field cannot be found, indicate this clearly
170
+ - Preserve the original formatting and context where relevant
171
+ - Return the extracted data in the format specified by the schema"""
172
+
173
+ # Initialize pipeline components
174
+ # TODO: improve the RAG system and optimize (don't instantiate every time)
175
+ preprocessor = BasicPreprocessor(config={'keep_tags': False})
176
+ try:
177
+ llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
178
+ except Exception as e:
179
+ return {"error": f"Failed to initialize LLM client: {str(e)}"}
180
+
181
+ # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
182
+ ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
183
+ postprocessor = PostProcessor()
184
+ pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
185
+
186
+ try:
187
+ result = pipeline.run(content, is_url, schema)
188
+ print("-"*80)
189
+ print(f"Processed result: {result}")
190
+ return result
191
+ except Exception as e:
192
+ return {"error": f"Processing error: {str(e)}"}
193
+
194
+ # Example schemas for the user
195
+ example_schemas = """
196
+ **Example Schema Formats:**
197
+
198
+ 1. **Simple field definitions:**
199
+ ```
200
+ title: str = Page title
201
+ price: float = Product price
202
+ description: str = Product description
203
+ available: bool = Is available
204
+ ```
205
+
206
+ 2. **JSON Schema:**
207
+ ```json
208
+ {
209
+ "properties": {
210
+ "title": {"type": "string", "description": "Page title"},
211
+ "price": {"type": "number", "description": "Product price"},
212
+ "description": {"type": "string", "description": "Product description"}
213
+ },
214
+ "required": ["title"]
215
+ }
216
+ ```
217
+
218
+ 3. **Python Class Definition:**
219
+ ```python
220
+ class ProductSchema(BaseModel):
221
+ title: str = Field(description="Product title")
222
+ price: float = Field(description="Product price")
223
+ description: str = Field(description="Product description")
224
+ available: bool = Field(default=False, description="Availability status")
225
+ ```
226
+ """
227
+
228
+ # Build Gradio Interface
229
+ demo = gr.Interface(
230
+ fn=webpage_to_json_wrapper,
231
+ inputs=[
232
+ gr.Textbox(
233
+ label="Content (URL or Raw Text)",
234
+ lines=10,
235
+ placeholder="Enter URL or paste raw HTML/text here."
236
+ ),
237
+ gr.Checkbox(label="Content is URL?", value=False),
238
+ gr.Textbox(
239
+ label="Schema Definition",
240
+ lines=15,
241
+ placeholder="Define your extraction schema (see examples below)",
242
+ info=example_schemas
243
+ )
244
+ ],
245
+ outputs=gr.JSON(label="Output JSON"),
246
+ title="Webpage to JSON Converter",
247
+ description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
248
+ examples=[
249
+ [
250
+ "https://example.com",
251
+ True,
252
+ "title: str = Page title\nprice: float = Product price\ndescription: str = Description"
253
+ ],
254
+ [
255
+ "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
256
+ False,
257
+ '''{
258
+ "type": "object",
259
+ "properties": {
260
+ "title": {
261
+ "type": "string",
262
+ "description": "Name of the product"
263
+ },
264
+ "price": {
265
+ "type": "number",
266
+ "description": "Price of the product"
267
+ },
268
+ "description": {
269
+ "type": "string",
270
+ "description": "Detailed description of the product"
271
+ },
272
+ "availability": {
273
+ "type": "boolean",
274
+ "description": "Whether the product is in stock (true) or not (false)"
275
+ }
276
+ },
277
+ "required": ["title", "price"]
278
+ }'''
279
+ ]
280
+ ]
281
+ )
282
+
283
+ if __name__ == "__main__":
284
+ demo.launch(mcp_server=True)
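
Note: a minimal, hypothetical sketch of exercising the schema helpers above outside the Gradio UI (it assumes the Space's dependencies are installed and `app.py` is importable); the field names are illustrative only.

```python
# Hypothetical local check of parse_schema_input from app.py.
from app import parse_schema_input

# Simple field-definition syntax: "name: type = description", one field per line.
Schema = parse_schema_input(
    "title: str = Page title\n"
    "price: float = Product price\n"
    "available: bool = Is available"
)

# The helper returns a dynamically created Pydantic model,
# so the usual BaseModel API applies.
print(Schema.model_json_schema())
print(Schema(title="Demo", price=9.99, available=True))
```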
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ pandas
2
+ gradio
3
+ gradio[mcp]
4
+ pydantic
5
+ python-dotenv
6
+ beautifulsoup4
7
+ requests
8
+ google-genai
9
+ json_repair
10
+ numpy
11
+ langchain
12
+ langchain-text-splitters
13
+ sentence-transformers
web2json/__pycache__/ai_extractor.cpython-311.pyc ADDED
Binary file (16.2 kB)
web2json/__pycache__/pipeline.cpython-311.pyc ADDED
Binary file (2.49 kB)
web2json/__pycache__/postprocessor.cpython-311.pyc ADDED
Binary file (1.65 kB)
web2json/__pycache__/preprocessor.cpython-311.pyc ADDED
Binary file (5.93 kB)
web2json/ai_extractor.py ADDED
@@ -0,0 +1,283 @@
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from google import genai
4
+ from google.genai import types
5
+ from pydantic import BaseModel
6
+ import numpy as np
7
+ from typing import List, Any, Dict, Tuple
8
+ import time
9
+ from langchain_text_splitters import HTMLHeaderTextSplitter
10
+ from sentence_transformers import SentenceTransformer
11
+
12
+ class LLMClient(ABC):
13
+ """
14
+ Abstract base class for calling LLM APIs.
15
+ """
16
+ def __init__(self, config: dict = None):
17
+ """
18
+ Initializes the LLMClient with a configuration dictionary.
19
+
20
+ Args:
21
+ config (dict): Configuration settings for the LLM client.
22
+ """
23
+ self.config = config or {}
24
+
25
+ @abstractmethod
26
+ def call_api(self, prompt: str) -> str:
27
+ """
28
+ Call the underlying LLM API with the given prompt.
29
+
30
+ Args:
31
+ prompt (str): The prompt or input text for the LLM.
32
+
33
+ Returns:
34
+ str: The response from the LLM.
35
+ """
36
+ pass
37
+
38
+
39
+ class GeminiLLMClient(LLMClient):
40
+ """
41
+ Concrete implementation of LLMClient for the Gemini API.
42
+ """
43
+
44
+ def __init__(self, config: dict):
45
+ """
46
+ Initializes the GeminiLLMClient with an API key, model name, and optional generation settings.
47
+
48
+ Args:
49
+ config (dict): Configuration containing:
50
+ - 'api_key': (optional) API key for Gemini (falls back to GEMINI_API_KEY env var)
51
+ - 'model_name': (optional) the model to use (default 'gemini-2.0-flash')
52
+ - 'generation_config': (optional) dict of GenerateContentConfig parameters
53
+ """
54
+ api_key = config.get("api_key") or os.environ.get("GEMINI_API_KEY")
55
+ if not api_key:
56
+ raise ValueError(
57
+ "API key for Gemini must be provided in config['api_key'] or GEMINI_API_KEY env var."
58
+ )
59
+ self.client = genai.Client(api_key=api_key)
60
+ self.model_name = config.get("model_name", "gemini-2.0-flash")
61
+ # allow custom generation settings, fallback to sensible defaults
62
+ gen_conf = config.get("generation_config", {})
63
+ self.generate_config = types.GenerateContentConfig(
64
+ response_mime_type=gen_conf.get("response_mime_type", "text/plain"),
65
+ temperature=gen_conf.get("temperature"),
66
+ max_output_tokens=gen_conf.get("max_output_tokens"),
67
+ top_p=gen_conf.get("top_p"),
68
+ top_k=gen_conf.get("top_k"),
69
+ # add any other fields you want to expose
70
+ )
71
+
72
+ def call_api(self, prompt: str) -> str:
73
+ """
74
+ Call the Gemini API with the given prompt (non-streaming).
75
+
76
+ Args:
77
+ prompt (str): The input text for the API.
78
+
79
+ Returns:
80
+ str: The generated text from the Gemini API.
81
+ """
82
+ contents = [
83
+ types.Content(
84
+ role="user",
85
+ parts=[types.Part.from_text(text=prompt)],
86
+ )
87
+ ]
88
+
89
+ # Non-streaming call returns a full response object
90
+ response = self.client.models.generate_content(
91
+ model=self.model_name,
92
+ contents=contents,
93
+ config=self.generate_config,
94
+ )
95
+
96
+ # Combine all output parts into a single string
97
+ return response.text
98
+
99
+
100
+
101
+ class AIExtractor:
102
+ def __init__(self, llm_client: LLMClient, prompt_template: str):
103
+ """
104
+ Initializes the AIExtractor with a specific LLM client and configuration.
105
+
106
+ Args:
107
+ llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
108
+ prompt_template (str): The template to use for generating prompts for the LLM.
109
+ should contain placeholders for dynamic content.
110
+ e.g., "Extract the following information: {content} based on schema: {schema}"
111
+ """
112
+ self.llm_client = llm_client
113
+ self.prompt_template = prompt_template
114
+
115
+ def extract(self, content: str, schema: BaseModel) -> str:
116
+ """
117
+ Extracts structured information from the given content based on the provided schema.
118
+
119
+ Args:
120
+ content (str): The raw content to extract information from.
121
+ schema (BaseModel): A Pydantic model defining the structure of the expected output.
122
+
123
+ Returns:
124
+ str: The structured JSON object as a string.
125
+ """
126
+ prompt = self.prompt_template.format(content=content, schema=schema.model_json_schema())
127
+ # print(f"Generated prompt: {prompt}")
128
+ response = self.llm_client.call_api(prompt)
129
+ return response
130
+
131
+ # TODO: RAGExtractor class
132
+ class RAGExtractor(AIExtractor):
133
+ """
134
+ RAG-enhanced extractor that uses similarity search to find relevant chunks
135
+ before performing extraction, utilizing HTML header-based chunking and SentenceTransformer embeddings.
136
+ """
137
+
138
+ def __init__(self,
139
+ llm_client: LLMClient,
140
+ prompt_template: str,
141
+ embedding_model_path: str = "sentence-transformers/all-mpnet-base-v2",
142
+ top_k: int = 3):
143
+ """
144
+ Initialize RAG extractor with embedding and chunking capabilities.
145
+
146
+ Args:
147
+ llm_client: LLM client for generation.
148
+ prompt_template: Template for prompts.
149
+ embedding_model_path: Path/name for the SentenceTransformer embedding model.
150
+ top_k: Number of top similar chunks to retrieve.
151
+ """
152
+ super().__init__(llm_client, prompt_template)
153
+ self.embedding_model_path = embedding_model_path
154
+ # Initialize the SentenceTransformer model for embeddings
155
+ self.embedding_model_instance = SentenceTransformer(self.embedding_model_path)
156
+ self.top_k = top_k
157
+
158
+ @staticmethod
159
+ def _langchain_HHTS(text: str) -> List[str]:
160
+ """
161
+ Chunks HTML text using Langchain's HTMLHeaderTextSplitter based on h1 and h2 headers.
162
+
163
+ Args:
164
+ text (str): The HTML content to chunk.
165
+
166
+ Returns:
167
+ List[str]: A list of chunked text strings (extracted from Document objects' page_content).
168
+ """
169
+ headers_to_split_on = [
170
+ ("h1", "Header 1"),
171
+ ("h2", "Header 2"),
172
+ # ("h3", "Header 3"), # This header was explicitly commented out in the request
173
+ ]
174
+ html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
175
+ return [doc.page_content for doc in html_splitter.split_text(text)]
176
+
177
+ def embed_text(self, text: str) -> np.ndarray:
178
+ """
179
+ Generate embeddings for text using the initialized SentenceTransformer model.
180
+
181
+ Args:
182
+ text: The text string to embed.
183
+
184
+ Returns:
185
+ np.ndarray: The embedding vector for the input text as a NumPy array.
186
+ """
187
+ try:
188
+ return self.embedding_model_instance.encode(text)
189
+ except Exception as e:
190
+ print(f"Warning: Embedding failed for text: '{text[:50]}...', using random embedding: {e}")
191
+
192
+ return None
193
+
194
+ def search_similar_chunks(self,
195
+ query: str,
196
+ chunks: List[str],
197
+ embeddings: np.ndarray) -> List[str]:
198
+ """
199
+ Find the most similar chunks to the query within the given list of chunks
200
+ by calculating cosine similarity between their embeddings.
201
+
202
+ Args:
203
+ query (str): The query text whose embedding will be used for similarity comparison.
204
+ chunks (List[str]): A list of text chunks to search within.
205
+ embeddings (np.ndarray): Precomputed embeddings for the chunks, corresponding to the 'chunks' list.
206
+
207
+ Returns:
208
+ List[str]: A list of the 'top_k' most similar chunks to the query.
209
+ """
210
+ query_embedding = self.embed_text(query)
211
+
212
+ similarities = []
213
+
214
+ if query_embedding.ndim > 1:
215
+ query_embedding = query_embedding.flatten()
216
+
217
+ for i, chunk_embedding in enumerate(embeddings):
218
+ if chunk_embedding.ndim > 1:
219
+ chunk_embedding = chunk_embedding.flatten()
220
+
221
+ norm_query = np.linalg.norm(query_embedding)
222
+ norm_chunk = np.linalg.norm(chunk_embedding)
223
+
224
+ if norm_query == 0 or norm_chunk == 0:
225
+ similarity = 0.0
226
+ else:
227
+ similarity = np.dot(query_embedding, chunk_embedding) / (norm_query * norm_chunk)
228
+ similarities.append((similarity, i))
229
+
230
+ similarities.sort(key=lambda x: x[0], reverse=True)
231
+ top_indices = [idx for _, idx in similarities[:self.top_k]]
232
+
233
+ return [chunks[i] for i in top_indices]
234
+
235
+ def extract(self, content: str, schema: BaseModel, query: str = None) -> str:
236
+ """
237
+ Overrides the base AIExtractor's method to implement RAG-enhanced extraction.
238
+ This function first chunks the input HTML content, then uses a query to find
239
+ the most relevant chunks via embedding similarity, and finally sends these
240
+ relevant chunks as context to the LLM for structured information extraction.
241
+
242
+ Args:
243
+ content (str): The raw HTML content from which to extract information.
244
+ schema (BaseModel): A Pydantic model defining the desired output structure for the LLM.
245
+ query (str, optional): An optional query string to guide the retrieval of relevant chunks.
246
+ If not provided, a default query based on the schema will be used.
247
+
248
+ Returns:
249
+ str: The structured JSON object as a string, as generated by the LLM.
250
+ """
251
+ start_time = time.time()
252
+
253
+ if not query:
254
+ query = f"Extract information based on the following JSON schema: {schema.model_json_schema()}"
255
+ print(f"No explicit query provided for retrieval. Using default: '{query[:100]}...'")
256
+
257
+ chunks = self._langchain_HHTS(content)
258
+ print(f"Content successfully chunked into {len(chunks)} pieces.")
259
+
260
+ combined_content_for_llm = ""
261
+ if not chunks:
262
+ print("Warning: No chunks were generated from the provided content. The entire original content will be sent to the LLM.")
263
+ combined_content_for_llm = content
264
+ else:
265
+ chunk_embeddings = np.array([self.embed_text(chunk) for chunk in chunks])
266
+ print(f"Generated embeddings for {len(chunks)} chunks.")
267
+
268
+ similar_chunks = self.search_similar_chunks(query, chunks, chunk_embeddings)
269
+ print(f"Retrieved {len(similar_chunks)} similar chunks based on the query.")
270
+
271
+ combined_content_for_llm = "\n\n".join(similar_chunks)
272
+ print(f"Combined content for LLM (truncated): '{combined_content_for_llm[:200]}...'")
273
+
274
+ prompt = self.prompt_template.format(content=combined_content_for_llm, schema=schema.model_json_schema())
275
+ print(f"Sending prompt to LLM (truncated): '{prompt[:500]}...'")
276
+ llm_response = self.llm_client.call_api(prompt)
277
+
278
+ execution_time = (time.time() - start_time) * 1000
279
+ print(f"Extraction process completed in {execution_time:.2f} milliseconds.")
280
+ print(f"LLM's final response: {llm_response}")
281
+ print("=" * 78)
282
+
283
+ return llm_response
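
Note: a minimal sketch (not part of the commit) of wiring `RAGExtractor` in place of the plain `AIExtractor`, mirroring the line that `app.py` currently keeps commented out; it assumes a valid `GEMINI_API_KEY` and the abbreviated prompt template is illustrative.

```python
import os
from pydantic import BaseModel, Field
from web2json.ai_extractor import GeminiLLMClient, RAGExtractor

class Product(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")

llm = GeminiLLMClient(config={"api_key": os.getenv("GEMINI_API_KEY")})
extractor = RAGExtractor(
    llm_client=llm,
    prompt_template="Extract data matching this schema:\n{schema}\n\nContent:\n{content}",
    top_k=3,  # number of header-based chunks sent to the LLM
)

html = "<h1>Sample Product</h1><h2>Details</h2><p>Price: $29.99</p>"
print(extractor.extract(html, Product))  # JSON string produced by the LLM
```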
web2json/pipeline.py ADDED
@@ -0,0 +1,43 @@
1
+ from web2json.ai_extractor import *
2
+ from web2json.postprocessor import *
3
+ from web2json.preprocessor import *
4
+ from pydantic import BaseModel
5
+
6
+ class Pipeline:
7
+ # constructor
8
+ def __init__(self,
9
+ preprocessor: Preprocessor,
10
+ ai_extractor: AIExtractor,
11
+ postprocessor: PostProcessor):
12
+ self.preprocessor = preprocessor
13
+ self.ai_extractor = ai_extractor
14
+ self.postprocessor = postprocessor
15
+
16
+ def run(self, content: str, is_url: bool, schema:BaseModel) -> dict:
17
+ """
18
+ Run the entire pipeline: preprocess, extract, and postprocess.
19
+
20
+ Args:
21
+ content (str): The raw content to process.
22
+ is_url (bool): Whether the content is a URL or raw text.
23
+ schema (BaseModel): The schema defining the structure of the expected output.
24
+
25
+ Returns:
26
+ dict: The final structured data after processing.
27
+ """
28
+ # Step 1: Preprocess the content
29
+ preprocessed_content = self.preprocessor.preprocess(content, is_url)
30
+ print(f"Preprocessed content: {preprocessed_content[:100]}...")
31
+ print('+'*80)
32
+ # Step 2: Extract structured information using AI
33
+ extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
34
+ print(f"Extracted data: {extracted_data[:100]}...")
35
+ print('+'*80)
36
+ # Step 3: Post-process the extracted data
37
+ final_output = self.postprocessor.process(extracted_data)
38
+ print(f"Final output: {final_output}")
39
+ print('+'*80)
40
+
41
+ return final_output
42
+
43
+
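
Note: a minimal sketch of composing the pipeline by hand, mirroring what `webpage_to_json` in `app.py` does per request; the prompt template is abbreviated and a `GEMINI_API_KEY` is assumed.

```python
import os
from pydantic import BaseModel, Field
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import GeminiLLMClient, AIExtractor
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline

class Article(BaseModel):
    title: str = Field(description="Article title")
    summary: str = Field(description="One-sentence summary")

pipeline = Pipeline(
    preprocessor=BasicPreprocessor(config={"keep_tags": False}),
    ai_extractor=AIExtractor(
        llm_client=GeminiLLMClient(config={"api_key": os.getenv("GEMINI_API_KEY")}),
        prompt_template="Content:\n{content}\n\nReturn JSON matching this schema:\n{schema}",
    ),
    postprocessor=PostProcessor(),
)

# is_url=True would make the preprocessor fetch the page first.
result = pipeline.run("<h1>Hello</h1><p>A tiny demo page.</p>", is_url=False, schema=Article)
print(result)  # plain dict produced by PostProcessor
```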
web2json/postprocessor.py ADDED
@@ -0,0 +1,27 @@
1
+ from json_repair import repair_json
2
+ import json
3
+
4
+ class PostProcessor:
5
+
6
+ def process(self, response: str) -> dict:
7
+ json_response = {}
8
+ try:
9
+ # Extract the JSON from the generated text. Handle variations in output format.
10
+ json_string = response
11
+ if "```json" in response:
12
+ json_string = response.split("```json")[1].split("```")[0]
13
+ elif "{" in response and "}" in response:
14
+ # try to grab the json
15
+ start_index = response.find("{")
16
+ end_index = response.rfind("}") + 1
17
+ json_string = response[start_index:end_index]
18
+
19
+ json_response = json.loads(repair_json(json_string)) # Added for robustness
20
+ except Exception as e:
21
+ print(f"Error parsing JSON: {e}")
22
+ print(f"Generated text: {response}")
23
+ json_response = {}
24
+
25
+
26
+ return json_response
27
+
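
Note: a minimal, self-contained sketch of what `PostProcessor.process` does with typical LLM output; no API key is needed, and `json_repair` fixes the deliberately broken trailing comma.

```python
from web2json.postprocessor import PostProcessor

messy = 'Sure! Here is the extracted data: {"title": "Sample Product", "price": 29.99,} Hope this helps.'

# The processor grabs the substring between the first "{" and the last "}",
# repairs it with json_repair, and returns a plain dict.
print(PostProcessor().process(messy))
# -> {'title': 'Sample Product', 'price': 29.99}
```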
web2json/preprocessor.py ADDED
@@ -0,0 +1,145 @@
1
+ import re
2
+ import requests
3
+ from bs4 import BeautifulSoup , Comment
4
+ from abc import ABC, abstractmethod
5
+ from typing import Any, Dict, Optional
6
+
7
+
8
+ class Preprocessor(ABC):
9
+ """
10
+ Abstract base class for preprocessors.
11
+ Defines the interface for transforming raw inputs into structured data.
12
+ """
13
+
14
+ def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
15
+ """
16
+ Initialize the preprocessor with optional configuration.
17
+
18
+ Args:
19
+ config: A dictionary of configuration settings.
20
+ - keep_tags (bool): If True, keeps HTML tags in the output; otherwise, cleans them.
21
+ """
22
+ self.config = config if config is not None else {'keep_tags': False}
23
+
24
+ def _fetch_content(self, url: str) -> str:
25
+ """
26
+ Fetches and parses the text content from a URL.
27
+
28
+ Args:
29
+ url: The URL to fetch content from.
30
+
31
+ Returns:
32
+ The clean, extracted text content from the page.
33
+
34
+ Raises:
35
+ ValueError: If the URL cannot be fetched or processed.
36
+ """
37
+ try:
38
+ # Set a User-Agent header to mimic a browser, which can help avoid
39
+ # being blocked by some websites.
40
+ # Inside _fetch_content method
41
+ headers = headers = {
42
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
43
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
44
+ "Accept-Language": "en-US,en;q=0.6",
45
+ "Cache-Control": "max-age=0",
46
+ "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
47
+ "Sec-Ch-Ua-Mobile": "?0",
48
+ "Sec-Ch-Ua-Platform": "\"Windows\"",
49
+ "Sec-Fetch-Dest": "document",
50
+ "Sec-Fetch-Mode": "navigate",
51
+ "Sec-Fetch-Site": "none",
52
+ "Sec-Fetch-User": "?1",
53
+ "Upgrade-Insecure-Requests": "1",
54
+ }
55
+
56
+ # Make the HTTP GET request with a timeout.
57
+ response = requests.get(url, headers=headers, timeout=15)
58
+
59
+
60
+ return response.text
61
+
62
+ except requests.exceptions.RequestException as e:
63
+ # Catch any network-related errors (DNS, connection, timeout, etc.)
64
+ # and re-raise them as a more user-friendly ValueError.
65
+ raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}")
66
+
67
+
68
+ @abstractmethod
69
+ def preprocess(self, content: str, is_url: bool) -> str:
70
+ """
71
+ Take raw content (HTML, text, etc.) and apply preprocessing steps.
72
+
73
+ Args:
74
+ content: The raw data to preprocess.
+ is_url: Whether content is a URL to fetch or already raw HTML/text.
75
+
76
+ Returns:
77
+ The cleaned content as a string, ready for downstream tasks.
78
+ """
79
+ pass
80
+
81
+ class BasicPreprocessor(Preprocessor):
82
+ """
83
+ Base preprocessor with common functionality.
84
+ Can be extended for specific preprocessing tasks.
85
+ """
86
+ # TODO: Might need to think of how to improve this later
87
+ def _clean_html(self, html_content: str) -> str:
88
+ """
89
+ Cleans up the given HTML content by:
90
+ - Removing <script> and <style> tags and their content.
91
+ - Removing HTML comments.
92
+ - Extracting and returning the visible text with normalized whitespace if keep_tags is False.
93
+
94
+ Args:
95
+ html_content (str): The HTML content to clean.
96
+
97
+ Returns:
98
+ str: The cleaned, visible text from the HTML.
99
+ """
100
+ # Parse the HTML content
101
+ soup = BeautifulSoup(html_content, "html.parser")
102
+
103
+ # Remove script and style elements
104
+ for tag in soup(["script", "style"]):
105
+ tag.decompose()
106
+
107
+ # Remove HTML comments
108
+ for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
109
+ comment.extract()
110
+
111
+ # Extract text and normalize whitespace
112
+ if self.config.get('keep_tags', False):
113
+ # If keep_tags is True, return the raw HTML
114
+ return str(soup)
115
+
116
+ text = soup.get_text(separator=" ", strip=True)
117
+ clean_text = re.sub(r'\s+', ' ', text)
118
+
119
+ return clean_text
120
+
121
+ def preprocess(self, content: str, is_url: bool) -> str:
122
+ """
123
+ Take raw content (HTML, text, etc.) and apply preprocessing steps.
124
+
125
+ Args:
126
+ content: The raw data to preprocess.
+ is_url: Whether content is a URL to fetch or already raw HTML/text.
127
+
128
+ Returns:
129
+ The cleaned text (or HTML, when keep_tags is True) as a string, ready for downstream tasks.
130
+ """
131
+
132
+ html_content = content
133
+ if is_url:
134
+ # Fetch content from the URL
135
+ html_content = self._fetch_content(content)
136
+
137
+
138
+ # Clean the HTML content
139
+ cleaned_content = self._clean_html(html_content)
140
+
141
+ return cleaned_content.strip() # Return the cleaned text content, stripped of leading/trailing whitespace
142
+
143
+
144
+
145
+
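
Note: a minimal sketch of `BasicPreprocessor` on raw HTML, showing that scripts, styles, and comments are stripped and that `keep_tags` switches between plain text and cleaned HTML; the sample markup is illustrative.

```python
from web2json.preprocessor import BasicPreprocessor

html = """
<html><head><style>p {color: red}</style></head>
<body><!-- hidden --><script>alert(1)</script>
<h1>Sample Product</h1><p>Price:   $29.99</p></body></html>
"""

print(BasicPreprocessor(config={"keep_tags": False}).preprocess(html, is_url=False))
# -> roughly "Sample Product Price: $29.99" (whitespace normalized)

print(BasicPreprocessor(config={"keep_tags": True}).preprocess(html, is_url=False)[:80])
# -> cleaned HTML with tags preserved
```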