Commit: c67f04e
Parent: 4ed1b4f

done with version 2

Files changed:
- app.py (+54 -4)
- requirements.txt (+3 -1)
- web2json/__pycache__/ai_extractor.cpython-311.pyc (binary)
- web2json/__pycache__/pipeline.cpython-311.pyc (binary)
- web2json/__pycache__/preprocessor.cpython-311.pyc (binary)
- web2json/ai_extractor.py (+240 -6)
- web2json/pipeline.py (+1 -1)
- web2json/preprocessor.py (+78 -3)
app.py CHANGED

@@ -3,7 +3,7 @@ import pandas as pd
 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor, GeminiLLMClient
+from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
@@ -170,16 +170,66 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
     - Preserve the original formatting and context where relevant
     - Return the extracted data in the format specified by the schema"""
 
+    classification_prompt_template = """
+# HTML Chunk Relevance Classification Prompt
+
+You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.
+
+## Instructions:
+1. Carefully examine the provided HTML chunk
+2. Compare it against the given schema/criteria
+3. Determine if the HTML chunk contains content that matches or is relevant to the schema
+4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)
+
+## Input Format:
+**Schema/Criteria:**
+{schema}
+
+**HTML Chunk:**
+```html
+{content}
+```
+
+## Output Format:
+Your response must be ONLY a valid JSON object with no additional text:
+
+```json
+{{
+    "relevant": 1
+}}
+```
+
+OR
+
+```json
+{{
+    "relevant": 0
+}}
+```
+
+## Classification Rules:
+- Output 1 if the HTML chunk contains content that matches the schema criteria
+- Output 0 if the HTML chunk does not contain relevant content
+- Consider semantic meaning, not just exact keyword matches
+- Look at text content, attributes, structure, and context
+- Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
+- Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
+- Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
+- The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema
+
+CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
+"""
     # Initialize pipeline components
     # TODO: improve the RAG system and optimize (don't instantiate every time)
-    preprocessor = BasicPreprocessor(config={'keep_tags': True})
+    preprocessor = BasicPreprocessor(config={'keep_tags': True})
     try:
-        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}
 
     # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
-    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
+    ai_extractor = LLMClassifierExtractor(llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
     postprocessor = PostProcessor()
     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
 
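Taken together, this hunk swaps the Gemini client for the NVIDIA one and routes extraction through the new classifier. A minimal sketch of the same wiring outside Gradio; the schema, both prompt strings, and the `run` entry point are illustrative assumptions, not code from this commit:

```python
import os
from pydantic import create_model
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import LLMClassifierExtractor, NvidiaLLMClient
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline

# Hypothetical schema; app.py builds these dynamically with create_model.
ProductSchema = create_model('ProductSchema', name=(str, ...), price=(float, ...))

# Stand-in templates: both must expose {content} and {schema} placeholders,
# since the extractor calls .format(content=..., schema=...) on each.
extraction_prompt = "Extract JSON matching {schema} from:\n{content}"
classifier_prompt = "Schema: {schema}\nHTML chunk: {content}\nReply with a JSON object {{\"relevant\": 0 or 1}}"

llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),
                              'model_name': 'qwen/qwen2.5-7b-instruct'})
pipeline = Pipeline(
    BasicPreprocessor(config={'keep_tags': True}),
    LLMClassifierExtractor(llm_client=llm,
                           prompt_template=extraction_prompt,
                           classifier_prompt=classifier_prompt),
    PostProcessor(),
)
# `run` stands in for the pipeline entry point whose body appears in pipeline.py.
result = pipeline.run("https://example.com/product", True, ProductSchema)
```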
requirements.txt CHANGED

@@ -10,4 +10,6 @@ json_repair
 numpy
 langchain
 langchain-text-splitters
-sentence-transformers
+sentence-transformers
+openai
+html_chunking
web2json/__pycache__/ai_extractor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ

web2json/__pycache__/pipeline.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ

web2json/__pycache__/preprocessor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ
web2json/ai_extractor.py CHANGED

@@ -1,14 +1,22 @@
 import os
-
+import time
+import numpy as np
 from google import genai
+from openai import OpenAI
+import time
+import random
+from openai import RateLimitError
+from functools import wraps
 from google.genai import types
 from pydantic import BaseModel
-
-from abc import ABC, abstractmethod
-import numpy as np
+from concurrent.futures import ThreadPoolExecutor
+from html_chunking import get_html_chunks
+from abc import ABC, abstractmethod
+from typing import List, Any, Dict, Tuple, Optional
+import re
+import json
 from langchain_text_splitters import HTMLHeaderTextSplitter
 from sentence_transformers import SentenceTransformer
-
 class LLMClient(ABC):
     """
     Abstract base class for calling LLM APIs.
@@ -96,7 +104,130 @@ class GeminiLLMClient(LLMClient):
         # Combine all output parts into a single string
         return response.text
 
-
+def extract_markdown_json(text: str) -> Optional[Dict[str, Any]]:
+    """
+    Find the first Markdown ```json ...``` block in `text`,
+    parse it as JSON, and return the resulting dict.
+    Returns None if no valid JSON block is found.
+    """
+    # 1) Look specifically for a ```json code fence
+    fence_match = re.search(
+        r"```json\s*(\{.*?\})\s*```",
+        text,
+        re.DOTALL | re.IGNORECASE
+    )
+    if not fence_match:
+        return None
+
+    json_str = fence_match.group(1)
+    try:
+        return json.loads(json_str)
+    except json.JSONDecodeError:
+        return None
+
+def retry_on_ratelimit(max_retries=5, base_delay=1.0, max_delay=10.0):
+    def deco(fn):
+        @wraps(fn)
+        def wrapped(*args, **kwargs):
+            delay = base_delay
+            for attempt in range(max_retries):
+                try:
+                    return fn(*args, **kwargs)
+                except RateLimitError:
+                    if attempt == max_retries - 1:
+                        # give up
+                        raise
+                    # back off + jitter
+                    sleep = min(max_delay, delay) + random.uniform(0, delay)
+                    time.sleep(sleep)
+                    delay *= 2
+            # unreachable
+        return wrapped
+    return deco
+class NvidiaLLMClient(LLMClient):
+    """
+    Concrete implementation of LLMClient for the NVIDIA API (non-streaming).
+    """
+
+    def __init__(self, config: dict):
+        """
+        Initializes the NvidiaLLMClient with an API key, model name, and optional generation settings.
+
+        Args:
+            config (dict): Configuration containing:
+                - 'api_key': (optional) API key for NVIDIA (falls back to NVIDIA_API_KEY env var)
+                - 'model_name': (optional) the model to use (default 'google/gemma-3-1b-it')
+                - 'generation_config': (optional) dict of generation parameters like temperature, top_p, etc.
+        """
+        api_key = config.get("api_key") or os.environ.get("NVIDIA_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "API key for NVIDIA must be provided in config['api_key'] or NVIDIA_API_KEY env var."
+            )
+
+        self.client = OpenAI(
+            base_url="https://integrate.api.nvidia.com/v1",
+            api_key=api_key
+        )
+        self.model_name = config.get("model_name", "google/gemma-3-1b-it")
+
+        # Store generation settings with sensible defaults
+        gen_conf = config.get("generation_config", {})
+        self.temperature = gen_conf.get("temperature", 0.1)
+        self.top_p = gen_conf.get("top_p", 0.7)
+        self.max_tokens = gen_conf.get("max_tokens", 512)
+
+    def set_model(self, model_name: str):
+        """
+        Set the model name for the NVIDIA API client.
+
+        Args:
+            model_name (str): The name of the model to use.
+        """
+        self.model_name = model_name
+
+    @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
+    def call_api(self, prompt: str) -> str:
+        """
+        Call the NVIDIA API with the given prompt (non-streaming).
+
+        Args:
+            prompt (str): The input text for the API.
+
+        Returns:
+            str: The generated text from the NVIDIA API.
+        """
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            top_p=self.top_p,
+            max_tokens=self.max_tokens
+            # stream is omitted (defaults to False)
+        )
+        # print("DONE")
+        # For the standard (non-streaming) response:
+        # choices[0].message.content holds the generated text
+        return response.choices[0].message.content
+
+    def call_batch(self, prompts, max_workers=8):
+        """
+        Parallel batch with isolated errors: each prompt that still
+        fails after retries will raise, but others succeed.
+        """
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+        results = [None] * len(prompts)
+        with ThreadPoolExecutor(max_workers=max_workers) as ex:
+            futures = {ex.submit(self.call_api, p): i for i, p in enumerate(prompts)}
+            for fut in as_completed(futures):
+                idx = futures[fut]
+                try:
+                    results[idx] = fut.result()
+                except RateLimitError:
+                    # You could set results[idx] = None or a default string
+                    results[idx] = f"<failed after retries>"
+        return results
+
 
 class AIExtractor:
     def __init__(self, llm_client: LLMClient, prompt_template: str):
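For orientation, a small usage sketch of what this hunk adds. Only `NvidiaLLMClient`, `call_api`, `call_batch`, and `extract_markdown_json` come from the commit; the key handling, model choice, and prompts are assumptions:

```python
import os
from web2json.ai_extractor import NvidiaLLMClient, extract_markdown_json

llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),
                              'model_name': 'qwen/qwen2.5-7b-instruct'})

# call_api is wrapped by @retry_on_ratelimit, so transient RateLimitErrors are
# retried with exponential backoff plus jitter before surfacing.
answer = llm.call_api("Reply with ```json {\"relevant\": 1} ```")

# call_batch fans prompts out over a thread pool; a prompt that still fails
# after retries yields a placeholder string instead of aborting the batch.
answers = llm.call_batch(["prompt A", "prompt B"], max_workers=4)

# extract_markdown_json pulls the first ```json fenced block out of a reply.
parsed = extract_markdown_json(answer)  # -> {'relevant': 1}, or None if absent
```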
@@ -127,6 +258,109 @@ class AIExtractor:
         # print(f"Generated prompt: {prompt}")
         response = self.llm_client.call_api(prompt)
         return response
+
+class LLMClassifierExtractor(AIExtractor):
+    """
+    Extractor that uses an LLM to classify and extract structured information from text content.
+    This class is designed to handle classification tasks where the LLM generates structured output based on a provided schema.
+    """
+    def __init__(self, llm_client: LLMClient, prompt_template: str, classifier_prompt: str):
+        """
+        Initializes the LLMClassifierExtractor with an LLM client and a prompt template.
+
+        Args:
+            llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
+            prompt_template (str): The template to use for generating prompts for the LLM.
+        """
+        super().__init__(llm_client, prompt_template)
+        self.classifier_prompt = classifier_prompt
+
+    def chunk_content(self, content: str, max_tokens: int = 500, is_clean: bool = True) -> List[str]:
+        """
+        Splits the content into manageable chunks for processing.
+
+        Args:
+            content (str): The raw content to be chunked.
+
+        Returns:
+            List[str]: A list of text chunks.
+        """
+        # Use the get_html_chunks function to split the content into chunks
+        return get_html_chunks(html=content, max_tokens=max_tokens, is_clean_html=is_clean, attr_cutoff_len=5)
+
+    def classify_chunks(self, chunks: List[str], schema: BaseModel) -> List[Dict[str, Any]]:
+        """
+        Classifies each chunk using the LLM based on the provided schema.
+
+        Args:
+            chunks (List[str]): A list of text chunks to classify.
+            schema (BaseModel): A Pydantic model defining the structure of the expected output.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries containing classified information.
+        """
+        prompts = [self.classifier_prompt.format(content=chunk, schema=schema.model_json_schema()) for chunk in chunks]
+        classified_chunks = []
+        responses = self.llm_client.call_batch(prompts)
+        for response in responses:
+            # extract the json from the response
+            json_data = extract_markdown_json(response)
+            if json_data:
+                classified_chunks.append(json_data)
+            else:
+                classified_chunks.append({
+                    "error": "Failed to extract JSON from response",
+                    "relevant": 1,
+                })
+        return classified_chunks
+
+    def extract(self, content: str, schema: BaseModel) -> str:
+        """
+        Extracts structured information from the given content based on the provided schema.
+
+        Args:
+            content (str): The raw content to extract information from.
+            schema (BaseModel): A Pydantic model defining the structure of the expected output.
+
+        Returns:
+            str: The structured JSON object as a string.
+        """
+        # Chunk the HTML
+        chunks = self.chunk_content(content, max_tokens=1500)
+        print(f"Content successfully chunked into {len(chunks)} pieces.")
+        # Classify each chunk using the LLM
+        classified_chunks = self.classify_chunks(chunks, schema)
+        # Concatenate the positive classified chunks into a single string
+        print(f"Classified {classified_chunks} chunks.")
+        positive_chunks = []
+        for i, chunk in enumerate(classified_chunks):
+            if chunk.get("relevant", 0) > 0:
+                positive_chunks.append(chunks[i])
+        if len(positive_chunks) == 0:
+            positive_chunks = chunks
+        filtered_content = "\n\n".join(positive_chunks)
+        print(f"Filtered content for extraction: {filtered_content}")
+        if not filtered_content:
+            print("Warning: No relevant chunks found. Returning empty response.")
+            return "{}"
+        # Generate the final prompt for extraction
+        prompt = self.prompt_template.format(content=filtered_content, schema=schema.model_json_schema())
+        print(f"Generated prompt for extraction: {prompt[:500]}...")
+        # Call the LLM to extract structured information
+        llm_response = self.llm_client.call_api(prompt)
+        print(f"LLM response: {llm_response[:500]}...")
+        # Return the structured response
+        if not llm_response:
+            print("Warning: LLM response is empty. Returning empty response.")
+            return "{}"
+
+        # json_response = extract_markdown_json(llm_response)
+        # if json_response is None:
+        #     print("Warning: Failed to extract JSON from LLM response. Returning empty response.")
+        #     return "{}"
+
+        return llm_response
 
 # TODO: RAGExtractor class
 class RAGExtractor(AIExtractor):
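To make the classify-then-extract flow concrete, a minimal sketch; the schema, sample HTML, and both templates are stand-ins invented for illustration:

```python
import os
from pydantic import BaseModel
from web2json.ai_extractor import LLMClassifierExtractor, NvidiaLLMClient

class Product(BaseModel):  # hypothetical schema for illustration
    name: str
    price: float

llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY')})
extractor = LLMClassifierExtractor(
    llm_client=llm,
    prompt_template="Extract JSON matching {schema} from:\n{content}",
    classifier_prompt="Schema: {schema}\nChunk: {content}\nReply {{\"relevant\": 0 or 1}} as JSON",
)

html = "<html><body><h1>Widget</h1><span>$9.99</span></body></html>"
# extract() chunks the HTML with get_html_chunks, classifies every chunk in a
# parallel batch, keeps the chunks marked relevant (falling back to all chunks
# if none pass), then runs the extraction prompt over the filtered content.
raw_json = extractor.extract(html, Product)
```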
web2json/pipeline.py CHANGED

@@ -27,7 +27,7 @@ class Pipeline:
         """
         # Step 1: Preprocess the content
         preprocessed_content = self.preprocessor.preprocess(content, is_url)
-        print(f"Preprocessed content: {preprocessed_content
+        print(f"Preprocessed content: {preprocessed_content}...")
         print('+'*80)
         # Step 2: Extract structured information using AI
         extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
web2json/preprocessor.py CHANGED

@@ -4,6 +4,74 @@ from bs4 import BeautifulSoup , Comment
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Optional
 
+class HTMLCleaner:
+    DEFAULT_REMOVE_TAGS = [
+        "script", "style"
+    ]
+
+    def __init__(self, config: dict = None):
+        self.config = config or {}
+        # allow custom tags to remove
+        self.remove_tags = set(self.DEFAULT_REMOVE_TAGS) | set(self.config.get("extra_remove_tags", []))
+
+    def _clean_html(self, html_content: str) -> str:
+        """
+        Cleans up the given HTML content by:
+        - Removing specified tags and their content.
+        - Stripping HTML comments.
+        - Optionally stripping out all attributes.
+        - Optionally flattening hyperlinks.
+        - Removing empty tags.
+        - Extracting and returning cleaned HTML or visible text.
+
+        Args:
+            html_content (str): The HTML content to clean.
+
+        Returns:
+            str: The cleaned HTML (if keep_tags=True) or normalized text.
+        """
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Remove unwanted tags entirely
+        for tag_name in self.remove_tags:
+            for tag in soup.find_all(tag_name):
+                tag.decompose()
+
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        # Strip attributes if requested
+        if self.config.get("strip_attrs", False):
+            for tag in soup.find_all(True):
+                tag.attrs = {}
+
+        # Flatten hyperlinks if requested
+        if self.config.get("strip_links", False):
+            for a in soup.find_all('a'):
+                a.replace_with(a.get_text())
+
+        # Remove empty tags (no text and no non-empty children)
+        for tag in soup.find_all(True):
+            if not tag.get_text(strip=True):
+                tag.decompose()
+
+        # Convert soup to HTML string if preserving tags
+        if self.config.get('keep_tags', False):
+            html_str = str(soup)
+            # Remove any empty lines
+            html_str = re.sub(r'(?m)^[ \t]*\n', '', html_str)
+            return html_str.strip()
+
+        # Extract visible text
+        text = soup.get_text(separator="\n", strip=True)
+        # Remove empty lines
+        lines = [line for line in text.splitlines() if line.strip()]
+        clean_text = "\n".join(lines)
+        # Normalize whitespace within lines
+        clean_text = re.sub(r'\s+', ' ', clean_text)
+
+        return clean_text.strip()
 
 class Preprocessor(ABC):
     """
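A quick sketch of how the new cleaner behaves in its two output modes; the sample HTML and the expected outputs are illustrative, not from the commit:

```python
from web2json.preprocessor import HTMLCleaner

html = """
<div class="wrap"><!-- promo -->
  <script>track()</script>
  <a href="/buy">Buy the Widget</a><span></span>
</div>
"""

# keep_tags=True returns cleaned HTML: script and comment removed, the empty
# span decomposed, attributes stripped, and the link flattened to its text.
cleaner = HTMLCleaner({'keep_tags': True, 'strip_attrs': True, 'strip_links': True})
print(cleaner._clean_html(html))       # e.g. <div>Buy the Widget</div>

# Without keep_tags the cleaner returns normalized visible text instead.
print(HTMLCleaner({})._clean_html(html))  # e.g. Buy the Widget
```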
@@ -136,9 +204,16 @@ class BasicPreprocessor(Preprocessor):
 
 
         # Clean the HTML content
-        cleaned_content = self._clean_html(html_content)
-
-        return cleaned_content.strip()  # Return the cleaned text content, stripped of leading/trailing whitespace
+        # cleaned_content = self._clean_html(html_content)
+        cleaner = HTMLCleaner({
+            'keep_tags': True if self.config.get('keep_tags', False) else False,
+            'strip_attrs': True,
+            'strip_links': True,
+            'extra_remove_tags': ['header', 'footer']
+        })
+        clean = cleaner._clean_html(html_content=html_content)
+
+        return clean.strip()  # Return the cleaned text content, stripped of leading/trailing whitespace
 
 
 
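With this hunk, BasicPreprocessor delegates cleaning to HTMLCleaner with attribute stripping, link flattening, and header/footer removal baked in. A sketch of the resulting behavior, assuming is_url=False means the string is cleaned directly rather than fetched (that path is not shown in this diff):

```python
from web2json.preprocessor import BasicPreprocessor

pre = BasicPreprocessor(config={'keep_tags': True})

html = "<header>nav</header><article><h1>Widget</h1><p>$9.99</p></article>"
cleaned = pre.preprocess(html, False)
# header/footer are now in extra_remove_tags, so the nav text disappears,
# while the article markup survives because keep_tags=True is forwarded.
print(cleaned)  # e.g. <article><h1>Widget</h1><p>$9.99</p></article>
```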