minar09 commited on
Commit
17345fb
·
verified ·
1 Parent(s): 0735672

Upload 6 files

Browse files
Files changed (3) hide show
  1. app.py +21 -13
  2. main.py +85 -118
  3. requirements.txt +4 -0
app.py CHANGED
@@ -1,29 +1,37 @@
1
  import os
2
  import gradio as gr
3
  import main
 
4
 
5
 
6
  def predict_from_pdf(pdf_file):
 
7
  upload_dir = "./catalogue/"
8
  os.makedirs(upload_dir, exist_ok=True)
9
 
 
 
 
10
  try:
11
- # Save the uploaded file to a temporary location
12
- dest_path = os.path.join(upload_dir, pdf_file.name)
13
- with open(dest_path, "wb") as f:
14
- with open(pdf_file.name, "rb") as uploaded_file:
15
- f.write(uploaded_file.read())
16
-
17
- # Process the PDF
18
- df, response = main.process_pdf_catalog(dest_path)
 
19
  return df, response
 
20
  except Exception as e:
21
- return None, f"Error: {str(e)}"
22
 
23
 
 
24
  pdf_examples = [
25
- ["examples/flexpocket.pdf"],
26
- ["examples/ASICS_Catalog.pdf"],
27
  ]
28
 
29
  demo = gr.Interface(
@@ -32,8 +40,8 @@ demo = gr.Interface(
32
  outputs=["json", "text"],
33
  examples=pdf_examples,
34
  title="Open Source PDF Catalog Parser",
35
- description="Efficient PDF catalog processing using MinerU and OpenLLM",
36
- article="Uses MinerU for layout analysis and DeepSeek-7B for structured extraction"
37
  )
38
 
39
  if __name__ == "__main__":
 
1
  import os
2
  import gradio as gr
3
  import main
4
+ import shutil
5
 
6
 
7
def predict_from_pdf(pdf_file):
    """Gradio handler: stage the uploaded PDF under ./catalogue/ and parse it.

    Args:
        pdf_file: Gradio upload — either a tempfile-like object exposing
            ``.name`` (its on-disk path) or a plain path string.

    Returns:
        ``(df, response)`` from ``main.process_pdf_catalog`` on success,
        ``(None, error_message)`` on failure.
    """
    # Create a temporary directory for file uploads
    upload_dir = "./catalogue/"
    os.makedirs(upload_dir, exist_ok=True)

    # Bug fix: the original passed the file OBJECT to shutil.copy while
    # using pdf_file.name elsewhere; shutil.copy needs a path. Resolve the
    # source path once and use it consistently.
    src_path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
    dest_file_path = os.path.join(upload_dir, os.path.basename(src_path))

    try:
        # Save the uploaded file into our working directory.
        shutil.copy(src_path, dest_file_path)

        # Check if the file was saved successfully
        if not os.path.exists(dest_file_path):
            return None, f"Error: The file {dest_file_path} could not be found or opened."

        # Process the PDF and retrieve the product info
        df, response = main.process_pdf_catalog(dest_file_path)
        return df, response

    except Exception as e:
        return None, f"Error processing PDF: {str(e)}"
29
 
30
 
31
+ # Define example PDFs
32
  pdf_examples = [
33
+ ["catalogue/flexpocket.pdf"],
34
+ ["catalogue/ASICS_Catalog.pdf"],
35
  ]
36
 
37
  demo = gr.Interface(
 
40
  outputs=["json", "text"],
41
  examples=pdf_examples,
42
  title="Open Source PDF Catalog Parser",
43
+ description="Efficient PDF catalog processing using fitz and OpenLLM",
44
+ article="Uses MinerU for layout analysis and Llama-CPP for structured extraction"
45
  )
46
 
47
  if __name__ == "__main__":
main.py CHANGED
@@ -5,24 +5,15 @@ import logging
5
  from pathlib import Path
6
  from typing import List, Dict, Optional
7
  from dataclasses import dataclass
 
 
 
8
  from fastapi.encoders import jsonable_encoder
9
- # from sentence_transformers import SentenceTransformer
10
- # from llama_cpp import Llama
11
-
12
- # Fix: Dynamically adjust the module path if magic_pdf is in a non-standard location
13
- try:
14
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
15
- from magic_pdf.data.dataset import PymuDocDataset
16
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
17
- from magic_pdf.config.enums import SupportedPdfParseMethod
18
- except ModuleNotFoundError as e:
19
- logging.error(f"Failed to import magic_pdf modules: {e}")
20
- logging.info("Ensure that the magic_pdf package is installed and accessible in your Python environment.")
21
- raise e
22
 
23
  logging.basicConfig(level=logging.INFO)
24
  logger = logging.getLogger(__name__)
25
 
 
26
  @dataclass
27
  class ProductSpec:
28
  name: str
@@ -34,127 +25,101 @@ class ProductSpec:
34
  def to_dict(self):
35
  return jsonable_encoder(self)
36
 
 
37
  class PDFProcessor:
38
  def __init__(self):
39
  self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
40
- self.llm = self._initialize_llm("deepseek-llm-7b-base.Q5_K_M.gguf")
 
41
  self.output_dir = Path("./output")
42
  self.output_dir.mkdir(exist_ok=True)
43
 
44
  def _initialize_emb_model(self, model_name):
45
- # try:
46
- # model = SentenceTransformer("sentence-transformers/" + model_name)
47
- # model.save('models/'+ model_name)
48
- # return model
49
- # except:
50
- # Load model directly
51
- from transformers import AutoTokenizer, AutoModel
52
-
53
- tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
54
- model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
55
- return model
56
 
57
  def _initialize_llm(self, model_name):
58
  """Initialize LLM with automatic download if needed"""
59
- """
60
- model_path = os.path.join("models/", model_name)
61
- if os.path.exists(model_path):
62
- return Llama(
63
- model_path=model_path,
64
- n_ctx=2048,
65
- n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
66
- n_threads=os.cpu_count() - 1,
67
- verbose=False
68
- )
69
- else:
70
- return Llama.from_pretrained(
71
- repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
72
- filename=model_name,
73
- n_ctx=2048,
74
- n_threads=os.cpu_count() - 1,
75
- n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
76
- verbose=False
77
- )
78
- """
79
- # Load model directly
80
- from transformers import AutoModel
81
- model = AutoModel.from_pretrained("TheBloke/deepseek-llm-7B-base-GGUF")
82
- return model
83
-
84
  def process_pdf(self, pdf_path: str) -> Dict:
85
- """Process PDF using MinerU pipeline"""
86
  start_time = time.time()
87
-
88
- # Initialize MinerU components
89
- local_image_dir = self.output_dir / "images"
90
- local_md_dir = self.output_dir
91
- image_dir = str(local_image_dir.name)
92
-
93
- os.makedirs(local_image_dir, exist_ok=True)
94
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  try:
96
- image_writer = FileBasedDataWriter(str(local_image_dir))
97
- md_writer = FileBasedDataWriter(str(local_md_dir))
98
-
99
- # Read PDF
100
- reader = FileBasedDataReader("")
101
- pdf_bytes = reader.read(pdf_path)
102
-
103
- # Create dataset and process
104
- ds = PymuDocDataset(pdf_bytes)
105
-
106
- if ds.classify() == SupportedPdfParseMethod.OCR:
107
- infer_result = ds.apply(doc_analyze, ocr=True)
108
- pipe_result = infer_result.pipe_ocr_mode(image_writer)
109
- else:
110
- infer_result = ds.apply(doc_analyze, ocr=False)
111
- pipe_result = infer_result.pipe_txt_mode(image_writer)
112
-
113
- # Get structured content
114
- middle_json = pipe_result.get_middle_json()
115
- tables = self._extract_tables(middle_json)
116
- text_blocks = self._extract_text_blocks(middle_json)
117
-
118
- # Process text blocks with LLM
119
- products = []
120
- for block in text_blocks:
121
- product = self._process_text_block(block)
122
- if product:
123
- product.tables = tables
124
- products.append(product.to_dict())
125
-
126
- logger.info(f"Processed {len(products)} products in {time.time()-start_time:.2f}s")
127
- return {"products": products, "tables": tables}
128
  except Exception as e:
129
- logger.error(f"Error during PDF processing: {e}")
130
- raise RuntimeError("PDF processing failed.") from e
131
-
132
- def _extract_tables(self, middle_json: Dict) -> List[Dict]:
133
- """Extract tables from MinerU's middle JSON"""
134
- tables = []
135
- for page in middle_json.get('pages', []):
136
- for table in page.get('tables', []):
137
- tables.append({
138
- "page": page.get('page_number'),
139
- "cells": table.get('cells', []),
140
- "header": table.get('header', []),
141
- "content": table.get('content', [])
142
- })
143
  return tables
144
-
145
- def _extract_text_blocks(self, middle_json: Dict) -> List[str]:
146
- """Extract text blocks from MinerU's middle JSON"""
147
- text_blocks = []
148
- for page in middle_json.get('pages', []):
149
- for block in page.get('blocks', []):
150
- if block.get('type') == 'text':
151
- text_blocks.append(block.get('text', ''))
152
- return text_blocks
153
-
154
  def _process_text_block(self, text: str) -> Optional[ProductSpec]:
155
  """Process text block with LLM"""
156
  prompt = self._generate_query_prompt(text)
157
-
158
  try:
159
  response = self.llm.create_chat_completion(
160
  messages=[{"role": "user", "content": prompt}],
@@ -165,11 +130,12 @@ class PDFProcessor:
165
  except Exception as e:
166
  logger.warning(f"Error processing text block: {e}")
167
  return None
168
-
169
  def _generate_query_prompt(self, text: str) -> str:
170
  """Generate extraction prompt"""
171
  return f"""Extract product specifications from this text:
172
  {text}
 
173
  Return JSON format:
174
  {{
175
  "name": "product name",
@@ -177,7 +143,7 @@ Return JSON format:
177
  "price": numeric_price,
178
  "attributes": {{ "key": "value" }}
179
  }}"""
180
-
181
  def _parse_response(self, response: str) -> Optional[ProductSpec]:
182
  """Parse LLM response"""
183
  try:
@@ -194,6 +160,7 @@ Return JSON format:
194
  logger.warning(f"Parse error: {e}")
195
  return None
196
 
 
197
  def process_pdf_catalog(pdf_path: str):
198
  processor = PDFProcessor()
199
  try:
 
5
  from pathlib import Path
6
  from typing import List, Dict, Optional
7
  from dataclasses import dataclass
8
+ import fitz # PyMuPDF
9
+ from sentence_transformers import SentenceTransformer
10
+ from llama_cpp import Llama
11
  from fastapi.encoders import jsonable_encoder
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
+
17
  @dataclass
18
  class ProductSpec:
19
  name: str
 
25
  def to_dict(self):
26
  return jsonable_encoder(self)
27
 
28
+
29
  class PDFProcessor:
30
def __init__(self):
    """Prepare the output directory and load the embedding + LLM models."""
    # Where processed artifacts are written.
    self.output_dir = Path("./output")
    self.output_dir.mkdir(exist_ok=True)
    # Sentence-embedding model plus a GGUF chat model for extraction.
    self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
    self.llm = self._initialize_llm("deepseek-llm-7b-base.Q2_K.gguf")
36
 
37
def _initialize_emb_model(self, model_name):
    """Load the sentence-embedding model.

    Tries sentence-transformers first and falls back to a bare
    ``transformers.AutoModel`` when that package is unavailable.

    Args:
        model_name: model id under the "sentence-transformers/" namespace,
            e.g. "all-MiniLM-L6-v2".
    """
    repo_id = "sentence-transformers/" + model_name
    try:
        from sentence_transformers import SentenceTransformer
        # Bug fix: honor model_name instead of a hard-coded model id.
        return SentenceTransformer(repo_id)
    except Exception:  # narrowed from a bare except so KeyboardInterrupt etc. propagate
        # Fallback: load the raw transformer directly (no pooling head).
        # NOTE(review): the original also built an unused AutoTokenizer;
        # dropped as dead code.
        from transformers import AutoModel
        return AutoModel.from_pretrained(repo_id)
 
48
 
49
def _initialize_llm(self, model_name):
    """Initialize LLM with automatic download if needed.

    Fetches the GGUF weights from the Hugging Face hub (cached after the
    first call) and returns a llama-cpp-python model handle.

    Args:
        model_name: GGUF filename within the repo,
            e.g. "deepseek-llm-7b-base.Q2_K.gguf".
    """
    # Dead commented-out local-path loading code removed; from_pretrained
    # already caches downloads locally.
    return Llama.from_pretrained(
        repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
        filename=model_name,
    )
64
+
 
 
 
 
 
 
 
 
 
 
 
65
def process_pdf(self, pdf_path: str) -> Dict:
    """Process PDF using PyMuPDF.

    Extracts text blocks and tables from every page, then runs each text
    block through the LLM to pull structured product specs.

    Args:
        pdf_path: path to the PDF file on disk.

    Returns:
        ``{"products": [...], "tables": [...]}``.
    """
    start_time = time.time()

    text_blocks = []
    tables = []

    # Bug fix: close the document when done — the original leaked the
    # fitz handle on every call.
    doc = fitz.open(pdf_path)
    try:
        # Extract text and tables page by page.
        for page_num, page in enumerate(doc):
            text_blocks.extend(self._extract_text_blocks(page))
            tables.extend(self._extract_tables(page, page_num))
    finally:
        doc.close()

    # Process text blocks with LLM; every product carries the doc's tables.
    products = []
    for block in text_blocks:
        product = self._process_text_block(block)
        if product:
            product.tables = tables
            products.append(product.to_dict())

    logger.info(f"Processed {len(products)} products in {time.time() - start_time:.2f}s")
    return {"products": products, "tables": tables}
92
+
93
+ def _extract_text_blocks(self, page) -> List[str]:
94
+ """Extract text blocks from a PDF page"""
95
+ blocks = []
96
+ for block in page.get_text("blocks"):
97
+ blocks.append(block[4]) # The text content is at index 4
98
+ return blocks
99
+
100
+ def _extract_tables(self, page, page_num: int) -> List[Dict]:
101
+ """Extract tables from a PDF page"""
102
+ tables = []
103
  try:
104
+ tab = page.find_tables()
105
+ if tab.tables:
106
+ for table_idx, table in enumerate(tab.tables):
107
+ table_data = table.extract()
108
+ if table_data:
109
+ tables.append({
110
+ "page": page_num + 1,
111
+ "cells": table_data,
112
+ "header": table.header.names if table.header else [],
113
+ "content": table_data
114
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  except Exception as e:
116
+ logger.warning(f"Error extracting tables from page {page_num}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  return tables
118
+
 
 
 
 
 
 
 
 
 
119
  def _process_text_block(self, text: str) -> Optional[ProductSpec]:
120
  """Process text block with LLM"""
121
  prompt = self._generate_query_prompt(text)
122
+
123
  try:
124
  response = self.llm.create_chat_completion(
125
  messages=[{"role": "user", "content": prompt}],
 
130
  except Exception as e:
131
  logger.warning(f"Error processing text block: {e}")
132
  return None
133
+
134
  def _generate_query_prompt(self, text: str) -> str:
135
  """Generate extraction prompt"""
136
  return f"""Extract product specifications from this text:
137
  {text}
138
+
139
  Return JSON format:
140
  {{
141
  "name": "product name",
 
143
  "price": numeric_price,
144
  "attributes": {{ "key": "value" }}
145
  }}"""
146
+
147
  def _parse_response(self, response: str) -> Optional[ProductSpec]:
148
  """Parse LLM response"""
149
  try:
 
160
  logger.warning(f"Parse error: {e}")
161
  return None
162
 
163
+
164
  def process_pdf_catalog(pdf_path: str):
165
  processor = PDFProcessor()
166
  try:
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ sentence-transformers
2
+ gradio
3
+ llama-cpp-python
4
+ PyMuPDF