abdo-Mansour committed
Commit c67f04e · 1 Parent(s): 4ed1b4f

done with version 2

app.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor, RAGExtractor, GeminiLLMClient
+from web2json.ai_extractor import AIExtractor, LLMClassifierExtractor, NvidiaLLMClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
@@ -170,16 +170,66 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str,
 - Preserve the original formatting and context where relevant
 - Return the extracted data in the format specified by the schema"""
 
+    classification_prompt_template = """
+# HTML Chunk Relevance Classification Prompt
+
+You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.
+
+## Instructions:
+1. Carefully examine the provided HTML chunk
+2. Compare it against the given schema/criteria
+3. Determine if the HTML chunk contains content that matches or is relevant to the schema
+4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)
+
+## Input Format:
+**Schema/Criteria:**
+{schema}
+
+**HTML Chunk:**
+```html
+{content}
+```
+
+## Output Format:
+Your response must be ONLY a valid JSON object with no additional text:
+
+```json
+{{
+    "relevant": 1
+}}
+```
+
+OR
+
+```json
+{{
+    "relevant": 0
+}}
+```
+
+## Classification Rules:
+- Output 1 if the HTML chunk contains content that matches the schema criteria
+- Output 0 if the HTML chunk does not contain relevant content
+- Consider semantic meaning, not just exact keyword matches
+- Look at text content, attributes, structure, and context
+- Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
+- Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
+- Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
+- The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema
+
+CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
+"""
     # Initialize pipeline components
     # TODO: improve the RAG system and optimize (don't instantiate every time)
-    preprocessor = BasicPreprocessor(config={'keep_tags': False})
+    preprocessor = BasicPreprocessor(config={'keep_tags': True})
     try:
-        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'qwen/qwen2.5-7b-instruct'})
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}
 
     # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
-    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
+    ai_extractor = LLMClassifierExtractor(llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
     postprocessor = PostProcessor()
     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
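
For orientation, the new classifier prompt is filled once per HTML chunk, with the schema serialized via Pydantic's `model_json_schema()` (mirroring how `classify_chunks` builds its prompts). A minimal sketch of how the placeholders resolve; the `Product` model here is a hypothetical example, not part of the commit:

```python
from pydantic import BaseModel, Field

class Product(BaseModel):
    name: str = Field(description="Product name")
    price: float = Field(description="Listed price")

chunk = '<div><h2>Acme Widget</h2><span>$19.99</span></div>'

# Only {schema} and {content} are substituted; the doubled {{ }} around the
# JSON examples in the template survive str.format() as literal braces.
prompt = classification_prompt_template.format(
    schema=Product.model_json_schema(),
    content=chunk,
)
```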
 
requirements.txt CHANGED
@@ -10,4 +10,6 @@ json_repair
 numpy
 langchain
 langchain-text-splitters
-sentence-transformers
+sentence-transformers
+openai
+html_chunking
web2json/__pycache__/ai_extractor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ
 
web2json/__pycache__/pipeline.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ
 
web2json/__pycache__/preprocessor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ
 
web2json/ai_extractor.py CHANGED
@@ -1,14 +1,22 @@
 import os
-from abc import ABC, abstractmethod
+import time
+import numpy as np
 from google import genai
+from openai import OpenAI
+import time
+import random
+from openai import RateLimitError
+from functools import wraps
 from google.genai import types
 from pydantic import BaseModel
+from concurrent.futures import ThreadPoolExecutor
+from html_chunking import get_html_chunks
+from abc import ABC, abstractmethod
+from typing import List, Any, Dict, Tuple, Optional
+import re
+import json
 from langchain_text_splitters import HTMLHeaderTextSplitter
 from sentence_transformers import SentenceTransformer
-
 class LLMClient(ABC):
     """
     Abstract base class for calling LLM APIs.
@@ -96,7 +104,130 @@ class GeminiLLMClient(LLMClient):
         # Combine all output parts into a single string
         return response.text
 
-
+def extract_markdown_json(text: str) -> Optional[Dict[str, Any]]:
+    """
+    Find the first Markdown ```json ...``` block in `text`,
+    parse it as JSON, and return the resulting dict.
+    Returns None if no valid JSON block is found.
+    """
+    # 1) Look specifically for a ```json code fence
+    fence_match = re.search(
+        r"```json\s*(\{.*?\})\s*```",
+        text,
+        re.DOTALL | re.IGNORECASE
+    )
+    if not fence_match:
+        return None
+
+    json_str = fence_match.group(1)
+    try:
+        return json.loads(json_str)
+    except json.JSONDecodeError:
+        return None
+
+def retry_on_ratelimit(max_retries=5, base_delay=1.0, max_delay=10.0):
+    def deco(fn):
+        @wraps(fn)
+        def wrapped(*args, **kwargs):
+            delay = base_delay
+            for attempt in range(max_retries):
+                try:
+                    return fn(*args, **kwargs)
+                except RateLimitError:
+                    if attempt == max_retries - 1:
+                        # give up
+                        raise
+                    # back off + jitter
+                    sleep = min(max_delay, delay) + random.uniform(0, delay)
+                    time.sleep(sleep)
+                    delay *= 2
+            # unreachable
+        return wrapped
+    return deco
+
+class NvidiaLLMClient(LLMClient):
+    """
+    Concrete implementation of LLMClient for the NVIDIA API (non-streaming).
+    """
+
+    def __init__(self, config: dict):
+        """
+        Initializes the NvidiaLLMClient with an API key, model name, and optional generation settings.
+
+        Args:
+            config (dict): Configuration containing:
+                - 'api_key': (optional) API key for NVIDIA (falls back to NVIDIA_API_KEY env var)
+                - 'model_name': (optional) the model to use (default 'google/gemma-3-1b-it')
+                - 'generation_config': (optional) dict of generation parameters like temperature, top_p, etc.
+        """
+        api_key = config.get("api_key") or os.environ.get("NVIDIA_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "API key for NVIDIA must be provided in config['api_key'] or NVIDIA_API_KEY env var."
+            )
+
+        self.client = OpenAI(
+            base_url="https://integrate.api.nvidia.com/v1",
+            api_key=api_key
+        )
+        self.model_name = config.get("model_name", "google/gemma-3-1b-it")
+
+        # Store generation settings with sensible defaults
+        gen_conf = config.get("generation_config", {})
+        self.temperature = gen_conf.get("temperature", 0.1)
+        self.top_p = gen_conf.get("top_p", 0.7)
+        self.max_tokens = gen_conf.get("max_tokens", 512)
+
+    def set_model(self, model_name: str):
+        """
+        Set the model name for the NVIDIA API client.
+
+        Args:
+            model_name (str): The name of the model to use.
+        """
+        self.model_name = model_name
+
+    @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
+    def call_api(self, prompt: str) -> str:
+        """
+        Call the NVIDIA API with the given prompt (non-streaming).
+
+        Args:
+            prompt (str): The input text for the API.
+
+        Returns:
+            str: The generated text from the NVIDIA API.
+        """
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            top_p=self.top_p,
+            max_tokens=self.max_tokens
+            # stream is omitted (defaults to False)
+        )
+        # For the standard (non-streaming) response,
+        # choices[0].message.content holds the generated text
+        return response.choices[0].message.content
+
+    def call_batch(self, prompts, max_workers=8):
+        """
+        Parallel batch with isolated errors: a prompt that still fails
+        after retries gets a placeholder result, while the others succeed.
+        """
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+        results = [None] * len(prompts)
+        with ThreadPoolExecutor(max_workers=max_workers) as ex:
+            futures = {ex.submit(self.call_api, p): i for i, p in enumerate(prompts)}
+            for fut in as_completed(futures):
+                idx = futures[fut]
+                try:
+                    results[idx] = fut.result()
+                except RateLimitError:
+                    # You could set results[idx] = None or a default string
+                    results[idx] = "<failed after retries>"
+        return results
+
 
 class AIExtractor:
     def __init__(self, llm_client: LLMClient, prompt_template: str):
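
As an aside, the new client can be exercised on its own. A minimal sketch, assuming `NVIDIA_API_KEY` is set and the `openai` package is installed (the prompts are made up):

```python
import os
from web2json.ai_extractor import NvidiaLLMClient

llm = NvidiaLLMClient(config={
    "api_key": os.getenv("NVIDIA_API_KEY"),
    "model_name": "qwen/qwen2.5-7b-instruct",  # same model app.py now uses
})
print(llm.call_api("Reply with exactly one word: ready"))

# call_batch fans call_api out over a thread pool; each call retries on
# RateLimitError with exponential backoff plus jitter, and a prompt that
# still fails is recorded as "<failed after retries>" instead of raising.
print(llm.call_batch(["What is 2+2?", "What is 3+3?"], max_workers=2))
```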
 
@@ -127,6 +258,109 @@ class AIExtractor:
         # print(f"Generated prompt: {prompt}")
         response = self.llm_client.call_api(prompt)
         return response
+
+class LLMClassifierExtractor(AIExtractor):
+    """
+    Extractor that first classifies chunks of the content for relevance and then
+    extracts structured information from the chunks that match the schema.
+    """
+    def __init__(self, llm_client: LLMClient, prompt_template: str, classifier_prompt: str):
+        """
+        Initializes the LLMClassifierExtractor with an LLM client and prompt templates.
+
+        Args:
+            llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
+            prompt_template (str): The template used to generate the final extraction prompt.
+            classifier_prompt (str): The template used to classify each chunk as relevant or not.
+        """
+        super().__init__(llm_client, prompt_template)
+        self.classifier_prompt = classifier_prompt
+
+    def chunk_content(self, content: str, max_tokens: int = 500, is_clean: bool = True) -> List[str]:
+        """
+        Splits the content into manageable chunks for processing.
+
+        Args:
+            content (str): The raw content to be chunked.
+
+        Returns:
+            List[str]: A list of text chunks.
+        """
+        # Use the get_html_chunks function to split the content into chunks
+        return get_html_chunks(html=content, max_tokens=max_tokens, is_clean_html=is_clean, attr_cutoff_len=5)
+
+    def classify_chunks(self, chunks: List[str], schema: BaseModel) -> List[Dict[str, Any]]:
+        """
+        Classifies each chunk using the LLM based on the provided schema.
+
+        Args:
+            chunks (List[str]): A list of text chunks to classify.
+            schema (BaseModel): A Pydantic model defining the structure of the expected output.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries containing classification results.
+        """
+        prompts = [self.classifier_prompt.format(content=chunk, schema=schema.model_json_schema()) for chunk in chunks]
+        classified_chunks = []
+        responses = self.llm_client.call_batch(prompts)
+        for response in responses:
+            # Extract the JSON verdict from the response
+            json_data = extract_markdown_json(response)
+            if json_data:
+                classified_chunks.append(json_data)
+            else:
+                # Fail open: treat an unparseable response as relevant
+                classified_chunks.append({
+                    "error": "Failed to extract JSON from response",
+                    "relevant": 1,
+                })
+        return classified_chunks
+
+    def extract(self, content: str, schema: BaseModel) -> str:
+        """
+        Extracts structured information from the given content based on the provided schema.
+
+        Args:
+            content (str): The raw content to extract information from.
+            schema (BaseModel): A Pydantic model defining the structure of the expected output.
+
+        Returns:
+            str: The structured JSON object as a string.
+        """
+        # Chunk the HTML
+        chunks = self.chunk_content(content, max_tokens=1500)
+        print(f"Content successfully chunked into {len(chunks)} pieces.")
+        # Classify each chunk using the LLM
+        classified_chunks = self.classify_chunks(chunks, schema)
+        print(f"Classification results: {classified_chunks}")
+        # Concatenate the positively classified chunks into a single string
+        positive_chunks = []
+        for i, chunk in enumerate(classified_chunks):
+            if chunk.get("relevant", 0) > 0:
+                positive_chunks.append(chunks[i])
+        # Fall back to all chunks if none were classified as relevant
+        if len(positive_chunks) == 0:
+            positive_chunks = chunks
+        filtered_content = "\n\n".join(positive_chunks)
+        print(f"Filtered content for extraction: {filtered_content}")
+        if not filtered_content:
+            print("Warning: No relevant chunks found. Returning empty response.")
+            return "{}"
+        # Generate the final prompt for extraction
+        prompt = self.prompt_template.format(content=filtered_content, schema=schema.model_json_schema())
+        print(f"Generated prompt for extraction: {prompt[:500]}...")
+        # Call the LLM to extract structured information
+        llm_response = self.llm_client.call_api(prompt)
+        print(f"LLM response: {llm_response[:500]}...")
+        # Return the structured response
+        if not llm_response:
+            print("Warning: LLM response is empty. Returning empty response.")
+            return "{}"
+
+        # json_response = extract_markdown_json(llm_response)
+        # if json_response is None:
+        #     print("Warning: Failed to extract JSON from LLM response. Returning empty response.")
+        #     return "{}"
+
+        return llm_response
 
 # TODO: RAGExtractor class
 class RAGExtractor(AIExtractor):
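
Note that the classifier path depends on `extract_markdown_json` finding a fenced JSON object in the model's reply; an unfenced reply parses as None, and the fallback then marks the chunk relevant so it is kept rather than dropped. A quick illustration:

```python
from web2json.ai_extractor import extract_markdown_json

fence = "`" * 3  # three backticks, spelled out to keep this snippet fence-safe
reply = f'Here you go:\n{fence}json\n{{"relevant": 1}}\n{fence}\n'
print(extract_markdown_json(reply))              # -> {'relevant': 1}
print(extract_markdown_json('{"relevant": 0}'))  # -> None (no fenced block)
```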
web2json/pipeline.py CHANGED
@@ -27,7 +27,7 @@ class Pipeline:
         """
         # Step 1: Preprocess the content
         preprocessed_content = self.preprocessor.preprocess(content, is_url)
-        print(f"Preprocessed content: {preprocessed_content[:100]}...")
+        print(f"Preprocessed content: {preprocessed_content}...")
         print('+'*80)
         # Step 2: Extract structured information using AI
         extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
web2json/preprocessor.py CHANGED
@@ -4,6 +4,74 @@ from bs4 import BeautifulSoup , Comment
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Optional
 
+class HTMLCleaner:
+    DEFAULT_REMOVE_TAGS = [
+        "script", "style"
+    ]
+
+    def __init__(self, config: dict = None):
+        self.config = config or {}
+        # allow custom tags to remove
+        self.remove_tags = set(self.DEFAULT_REMOVE_TAGS) | set(self.config.get("extra_remove_tags", []))
+
+    def _clean_html(self, html_content: str) -> str:
+        """
+        Cleans up the given HTML content by:
+        - Removing specified tags and their content.
+        - Stripping HTML comments.
+        - Optionally stripping out all attributes.
+        - Optionally flattening hyperlinks.
+        - Removing empty tags.
+        - Extracting and returning cleaned HTML or visible text.
+
+        Args:
+            html_content (str): The HTML content to clean.
+
+        Returns:
+            str: The cleaned HTML (if keep_tags=True) or normalized text.
+        """
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Remove unwanted tags entirely
+        for tag_name in self.remove_tags:
+            for tag in soup.find_all(tag_name):
+                tag.decompose()
+
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        # Strip attributes if requested
+        if self.config.get("strip_attrs", False):
+            for tag in soup.find_all(True):
+                tag.attrs = {}
+
+        # Flatten hyperlinks if requested
+        if self.config.get("strip_links", False):
+            for a in soup.find_all('a'):
+                a.replace_with(a.get_text())
+
+        # Remove empty tags (no text and no non-empty children)
+        for tag in soup.find_all(True):
+            if not tag.get_text(strip=True):
+                tag.decompose()
+
+        # Convert soup to HTML string if preserving tags
+        if self.config.get('keep_tags', False):
+            html_str = str(soup)
+            # Remove any empty lines
+            html_str = re.sub(r'(?m)^[ \t]*\n', '', html_str)
+            return html_str.strip()
+
+        # Extract visible text
+        text = soup.get_text(separator="\n", strip=True)
+        # Remove empty lines
+        lines = [line for line in text.splitlines() if line.strip()]
+        clean_text = "\n".join(lines)
+        # Normalize whitespace within lines
+        clean_text = re.sub(r'\s+', ' ', clean_text)
+
+        return clean_text.strip()
 
 class Preprocessor(ABC):
     """
@@ -136,9 +204,16 @@ class BasicPreprocessor(Preprocessor):
 
 
         # Clean the HTML content
-        cleaned_content = self._clean_html(html_content)
-
-        return cleaned_content.strip() # Return the cleaned text content, stripped of leading/trailing whitespace
+        # cleaned_content = self._clean_html(html_content)
+        cleaner = HTMLCleaner({
+            'keep_tags': True if self.config.get('keep_tags', False) else False,
+            'strip_attrs': True,
+            'strip_links': True,
+            'extra_remove_tags': ['header', 'footer']
+        })
+        clean = cleaner._clean_html(html_content=html_content)
+
+        return clean.strip()  # Return the cleaned text content, stripped of leading/trailing whitespace
 
 
 
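
The new `HTMLCleaner` can be sanity-checked standalone; a minimal sketch using the same options `BasicPreprocessor` now passes (the sample HTML is made up):

```python
from web2json.preprocessor import HTMLCleaner

html = (
    '<html><head><script>track()</script></head><body>'
    '<header>Menu</header>'
    '<div class="x"><a href="/p">Acme Widget</a> <span>$19.99</span></div>'
    '<footer>(c) 2024</footer></body></html>'
)
cleaner = HTMLCleaner({
    "keep_tags": True,                # return cleaned HTML, not plain text
    "strip_attrs": True,              # drop class=, href=, ...
    "strip_links": True,              # replace <a> tags with their text
    "extra_remove_tags": ["header", "footer"],
})
print(cleaner._clean_html(html))
# -> roughly: <html><body><div>Acme Widget <span>$19.99</span></div></body></html>
```

Since app.py now constructs `BasicPreprocessor` with `keep_tags=True`, the classifier's `chunk_content` receives cleaned HTML rather than flattened text, which is what `get_html_chunks` operates on.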