minar09 committed on
Commit
29de06b
·
verified ·
1 Parent(s): b153f50

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +65 -69
main.py CHANGED
@@ -8,7 +8,7 @@ from dataclasses import dataclass
8
  from fastapi.encoders import jsonable_encoder
9
  import fitz # PyMuPDF
10
  from sentence_transformers import SentenceTransformer
11
- from llama_cpp import Llama
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
@@ -29,46 +29,38 @@ class ProductSpec:
29
  class PDFProcessor:
30
  def __init__(self):
31
  self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
32
- # Choose the appropriate model filename below; adjust if needed.
33
- # self.llm = self._initialize_llm("deepseek-llm-7b-base.Q2_K.gguf")
34
- self.llm = self._initialize_llm("llama-2-7b.Q2_K.gguf")
35
  self.output_dir = Path("./output")
36
  self.output_dir.mkdir(exist_ok=True)
37
 
38
  def _initialize_emb_model(self, model_name):
39
  try:
40
- # Use SentenceTransformer if available
41
- return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
42
  except Exception as e:
43
- logger.warning(f"SentenceTransformer failed: {e}. Falling back to transformers model.")
44
  from transformers import AutoTokenizer, AutoModel
45
- tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/" + model_name)
46
- model = AutoModel.from_pretrained("sentence-transformers/" + model_name)
47
  return model
48
 
49
- def _initialize_llm(self, model_name):
50
- """Initialize LLM with automatic download if needed"""
51
- # Here we use from_pretrained so that if the model is missing locally it downloads it.
52
- model_path = os.path.join("models/", model_name)
53
- if os.path.exists(model_path):
54
- return Llama(
55
- model_path=model_path,
56
- n_ctx=1024,
57
- n_gpu_layers=-1,
58
- n_threads=os.cpu_count() - 1,
59
- verbose=False
60
- )
61
- else:
62
- return Llama.from_pretrained(
63
- repo_id="Tien203/llama.cpp",
64
- filename="Llama-2-7b-hf-q4_0.gguf",
65
  )
 
 
 
66
 
67
  def process_pdf(self, pdf_path: str) -> Dict:
68
- """Process PDF using PyMuPDF"""
69
  start_time = time.time()
70
 
71
- # Open PDF
72
  try:
73
  doc = fitz.open(pdf_path)
74
  except Exception as e:
@@ -78,37 +70,63 @@ class PDFProcessor:
78
  text_blocks = []
79
  tables = []
80
 
81
- # Extract text and tables from each page
82
  for page_num, page in enumerate(doc):
83
- # Extract text blocks from page and filter out very short blocks (noise)
84
  blocks = self._extract_text_blocks(page)
85
- filtered = [block for block in blocks if len(block.strip()) >= 10]
86
- logger.debug(f"Page {page_num + 1}: Extracted {len(blocks)} blocks, {len(filtered)} kept after filtering.")
87
- text_blocks.extend(filtered)
88
-
89
- # Extract tables (if any)
90
  tables.extend(self._extract_tables(page, page_num))
91
 
92
- # Process text blocks with LLM to extract product information
93
  products = []
94
  for idx, block in enumerate(text_blocks):
95
- # Log the text block for debugging
96
- logger.debug(f"Processing text block {idx}: {block[:100]}...")
97
  product = self._process_text_block(block)
98
- if product:
99
  product.tables = tables
100
- # Only add if at least one key (like name) is non-empty
101
- if product.name or product.description or product.price or (
102
- product.attributes and len(product.attributes) > 0):
103
- products.append(product.to_dict())
104
- else:
105
- logger.debug(f"LLM returned empty product for block {idx}.")
106
- else:
107
- logger.debug(f"No product extracted from block {idx}.")
108
 
109
  logger.info(f"Processed {len(products)} products in {time.time() - start_time:.2f}s")
110
  return {"products": products, "tables": tables}
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  def _extract_text_blocks(self, page) -> List[str]:
113
  """Extract text blocks from a PDF page using PyMuPDF's blocks method."""
114
  blocks = []
@@ -138,27 +156,6 @@ class PDFProcessor:
138
  logger.warning(f"Error extracting tables from page {page_num + 1}: {e}")
139
  return tables
140
 
141
- def _process_text_block(self, text: str) -> Optional[ProductSpec]:
142
- """Process a text block with LLM to extract product specifications."""
143
- prompt = self._generate_query_prompt(text)
144
- logger.debug(f"Generated prompt: {prompt[:200]}...")
145
- try:
146
- response = self.llm.create_chat_completion(
147
- messages=[{"role": "user", "content": prompt}],
148
- temperature=0.1,
149
- max_tokens=512
150
- )
151
- # Debug: log raw response
152
- logger.debug(f"LLM raw response: {response}")
153
- return self._parse_response(response['choices'][0]['message']['content'])
154
- except Exception as e:
155
- logger.warning(f"Error processing text block: {e}")
156
- return None
157
-
158
- def _generate_query_prompt(self, text: str) -> str:
159
- """Generate a prompt instructing the LLM to extract product information."""
160
- return f"""Extract product specifications from the following text. If no product is found, return an empty JSON object with keys.\n\nText:\n{text}\n\nReturn JSON format exactly as:\n{{\n \"name\": \"product name\",\n \"description\": \"product description\",\n \"price\": numeric_price,\n \"attributes\": {{ \"key\": \"value\" }}\n}}"""
161
-
162
  def _parse_response(self, response: str) -> Optional[ProductSpec]:
163
  """Parse the LLM's response to extract a product specification."""
164
  try:
@@ -193,7 +190,6 @@ def process_pdf_catalog(pdf_path: str):
193
 
194
 
195
  if __name__ == "__main__":
196
- # Example usage: change this if you call process_pdf_catalog elsewhere
197
  pdf_path = "path/to/your/pdf_file.pdf"
198
  result, message = process_pdf_catalog(pdf_path)
199
- print(result, message)
 
8
  from fastapi.encoders import jsonable_encoder
9
  import fitz # PyMuPDF
10
  from sentence_transformers import SentenceTransformer
11
+ from mlc_llm import MLCEngine
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
 
29
  class PDFProcessor:
30
  def __init__(self):
31
  self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
32
+ self.llm = self._initialize_llm()
 
 
33
  self.output_dir = Path("./output")
34
  self.output_dir.mkdir(exist_ok=True)
35
 
36
  def _initialize_emb_model(self, model_name):
37
  try:
38
+ return SentenceTransformer(f'sentence-transformers/{model_name}')
 
39
  except Exception as e:
40
+ logger.warning(f"SentenceTransformer failed: {e}")
41
  from transformers import AutoTokenizer, AutoModel
42
+ tokenizer = AutoTokenizer.from_pretrained(f"sentence-transformers/{model_name}")
43
+ model = AutoModel.from_pretrained(f"sentence-transformers/{model_name}")
44
  return model
45
 
46
+ def _initialize_llm(self):
47
+ """Initialize MLC LLM engine with optimized settings"""
48
+ try:
49
+ return MLCEngine(
50
+ model="HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC",
51
+ mode="server",
52
+ device="cuda" if os.getenv("USE_CUDA", "0") == "1" else "auto",
53
+ temperature=0.1,
54
+ max_tokens=512
 
 
 
 
 
 
 
55
  )
56
+ except Exception as e:
57
+ logger.error(f"Failed to initialize MLC Engine: {e}")
58
+ raise
59
 
60
  def process_pdf(self, pdf_path: str) -> Dict:
61
+ """Main PDF processing pipeline"""
62
  start_time = time.time()
63
 
 
64
  try:
65
  doc = fitz.open(pdf_path)
66
  except Exception as e:
 
70
  text_blocks = []
71
  tables = []
72
 
 
73
  for page_num, page in enumerate(doc):
 
74
  blocks = self._extract_text_blocks(page)
75
+ text_blocks.extend([b for b in blocks if len(b.strip()) >= 10])
 
 
 
 
76
  tables.extend(self._extract_tables(page, page_num))
77
 
 
78
  products = []
79
  for idx, block in enumerate(text_blocks):
 
 
80
  product = self._process_text_block(block)
81
+ if product and self._is_valid_product(product):
82
  product.tables = tables
83
+ products.append(product.to_dict())
 
 
 
 
 
 
 
84
 
85
  logger.info(f"Processed {len(products)} products in {time.time() - start_time:.2f}s")
86
  return {"products": products, "tables": tables}
87
 
88
+ def _process_text_block(self, text: str) -> Optional[ProductSpec]:
89
+ """Process text with MLC LLM using optimized prompt"""
90
+ try:
91
+ prompt = self._generate_query_prompt(text)
92
+ response = self.llm.chat.completions.create(
93
+ messages=[{"role": "user", "content": prompt}],
94
+ stream=False
95
+ )
96
+ return self._parse_response(response.choices[0].message.content)
97
+ except Exception as e:
98
+ logger.warning(f"Error processing text block: {e}")
99
+ return None
100
+
101
+ def _generate_query_prompt(self, text: str) -> str:
102
+ """Generate structured prompt for better JSON response"""
103
+ return f"""Extract product specifications as JSON from this text:
104
+
105
+ Text: {text}
106
+
107
+ Return valid JSON with exactly these keys:
108
+ - name (string)
109
+ - description (string, optional)
110
+ - price (number, optional)
111
+ - attributes (object with key-value pairs, optional)
112
+
113
+ Example:
114
+ {{
115
+ "name": "Example Product",
116
+ "description": "High-quality example item",
117
+ "price": 99.99,
118
+ "attributes": {{"color": "red", "size": "XL"}}
119
+ }}"""
120
+
121
+ def _is_valid_product(self, product: ProductSpec) -> bool:
122
+ """Validate extracted product data"""
123
+ return any([
124
+ product.name,
125
+ product.description,
126
+ product.price,
127
+ product.attributes
128
+ ])
129
+
130
  def _extract_text_blocks(self, page) -> List[str]:
131
  """Extract text blocks from a PDF page using PyMuPDF's blocks method."""
132
  blocks = []
 
156
  logger.warning(f"Error extracting tables from page {page_num + 1}: {e}")
157
  return tables
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  def _parse_response(self, response: str) -> Optional[ProductSpec]:
160
  """Parse the LLM's response to extract a product specification."""
161
  try:
 
190
 
191
 
192
  if __name__ == "__main__":
 
193
  pdf_path = "path/to/your/pdf_file.pdf"
194
  result, message = process_pdf_catalog(pdf_path)
195
+ print(json.dumps(result, indent=2), message)