minar09 commited on
Commit
17345fb
·
verified ·
1 Parent(s): 0735672

Upload 6 files

Browse files
Files changed (3) hide show
  1. app.py +21 -13
  2. main.py +85 -118
  3. requirements.txt +4 -0
app.py CHANGED
@@ -1,29 +1,37 @@
1
  import os
2
  import gradio as gr
3
  import main
 
4
 
5
 
6
  def predict_from_pdf(pdf_file):
 
7
  upload_dir = "./catalogue/"
8
  os.makedirs(upload_dir, exist_ok=True)
9
 
 
 
 
10
  try:
11
- # Save the uploaded file to a temporary location
12
- dest_path = os.path.join(upload_dir, pdf_file.name)
13
- with open(dest_path, "wb") as f:
14
- with open(pdf_file.name, "rb") as uploaded_file:
15
- f.write(uploaded_file.read())
16
-
17
- # Process the PDF
18
- df, response = main.process_pdf_catalog(dest_path)
 
19
  return df, response
 
20
  except Exception as e:
21
- return None, f"Error: {str(e)}"
22
 
23
 
 
24
  pdf_examples = [
25
- ["examples/flexpocket.pdf"],
26
- ["examples/ASICS_Catalog.pdf"],
27
  ]
28
 
29
  demo = gr.Interface(
@@ -32,8 +40,8 @@ demo = gr.Interface(
32
  outputs=["json", "text"],
33
  examples=pdf_examples,
34
  title="Open Source PDF Catalog Parser",
35
- description="Efficient PDF catalog processing using MinerU and OpenLLM",
36
- article="Uses MinerU for layout analysis and DeepSeek-7B for structured extraction"
37
  )
38
 
39
  if __name__ == "__main__":
 
1
  import os
2
  import gradio as gr
3
  import main
4
+ import shutil
5
 
6
 
7
def predict_from_pdf(pdf_file):
    """Gradio handler: stage the uploaded PDF under ./catalogue/ and parse it.

    Args:
        pdf_file: Gradio upload — either a tempfile-like object exposing
            ``.name`` (its on-disk path) or a plain path string.

    Returns:
        ``(df, response)`` from ``main.process_pdf_catalog`` on success,
        ``(None, error_message)`` on failure.
    """
    # Create a temporary directory for file uploads
    upload_dir = "./catalogue/"
    os.makedirs(upload_dir, exist_ok=True)

    # Bug fix: the original passed the file OBJECT to shutil.copy while
    # using pdf_file.name elsewhere; shutil.copy needs a path. Resolve the
    # source path once and use it consistently.
    src_path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
    dest_file_path = os.path.join(upload_dir, os.path.basename(src_path))

    try:
        # Save the uploaded file into our working directory.
        shutil.copy(src_path, dest_file_path)

        # Check if the file was saved successfully
        if not os.path.exists(dest_file_path):
            return None, f"Error: The file {dest_file_path} could not be found or opened."

        # Process the PDF and retrieve the product info
        df, response = main.process_pdf_catalog(dest_file_path)
        return df, response

    except Exception as e:
        return None, f"Error processing PDF: {str(e)}"
29
 
30
 
31
+ # Define example PDFs
32
  pdf_examples = [
33
+ ["catalogue/flexpocket.pdf"],
34
+ ["catalogue/ASICS_Catalog.pdf"],
35
  ]
36
 
37
  demo = gr.Interface(
 
40
  outputs=["json", "text"],
41
  examples=pdf_examples,
42
  title="Open Source PDF Catalog Parser",
43
+ description="Efficient PDF catalog processing using fitz and OpenLLM",
44
+ article="Uses MinerU for layout analysis and Llama-CPP for structured extraction"
45
  )
46
 
47
  if __name__ == "__main__":
main.py CHANGED
@@ -5,24 +5,15 @@ import logging
5
  from pathlib import Path
6
  from typing import List, Dict, Optional
7
  from dataclasses import dataclass
 
 
 
8
  from fastapi.encoders import jsonable_encoder
9
- # from sentence_transformers import SentenceTransformer
10
- # from llama_cpp import Llama
11
-
12
- # Fix: Dynamically adjust the module path if magic_pdf is in a non-standard location
13
- try:
14
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
15
- from magic_pdf.data.dataset import PymuDocDataset
16
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
17
- from magic_pdf.config.enums import SupportedPdfParseMethod
18
- except ModuleNotFoundError as e:
19
- logging.error(f"Failed to import magic_pdf modules: {e}")
20
- logging.info("Ensure that the magic_pdf package is installed and accessible in your Python environment.")
21
- raise e
22
 
23
  logging.basicConfig(level=logging.INFO)
24
  logger = logging.getLogger(__name__)
25
 
 
26
  @dataclass
27
  class ProductSpec:
28
  name: str
@@ -34,127 +25,101 @@ class ProductSpec:
34
  def to_dict(self):
35
  return jsonable_encoder(self)
36
 
 
37
  class PDFProcessor:
38
  def __init__(self):
39
  self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
40
- self.llm = self._initialize_llm("deepseek-llm-7b-base.Q5_K_M.gguf")
 
41
  self.output_dir = Path("./output")
42
  self.output_dir.mkdir(exist_ok=True)
43
 
44
  def _initialize_emb_model(self, model_name):
45
- # try:
46
- # model = SentenceTransformer("sentence-transformers/" + model_name)
47
- # model.save('models/'+ model_name)
48
- # return model
49
- # except:
50
- # Load model directly
51
- from transformers import AutoTokenizer, AutoModel
52
-
53
- tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
54
- model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
55
- return model
56
 
57
  def _initialize_llm(self, model_name):
58
  """Initialize LLM with automatic download if needed"""
59
- """
60
- model_path = os.path.join("models/", model_name)
61
- if os.path.exists(model_path):
62
- return Llama(
63
- model_path=model_path,
64
- n_ctx=2048,
65
- n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
66
- n_threads=os.cpu_count() - 1,
67
- verbose=False
68
- )
69
- else:
70
- return Llama.from_pretrained(
71
- repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
72
- filename=model_name,
73
- n_ctx=2048,
74
- n_threads=os.cpu_count() - 1,
75
- n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
76
- verbose=False
77
- )
78
- """
79
- # Load model directly
80
- from transformers import AutoModel
81
- model = AutoModel.from_pretrained("TheBloke/deepseek-llm-7B-base-GGUF")
82
- return model
83
-
84
  def process_pdf(self, pdf_path: str) -> Dict:
85
- """Process PDF using MinerU pipeline"""
86
  start_time = time.time()
87
-
88
- # Initialize MinerU components
89
- local_image_dir = self.output_dir / "images"
90
- local_md_dir = self.output_dir
91
- image_dir = str(local_image_dir.name)
92
-
93
- os.makedirs(local_image_dir, exist_ok=True)
94
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  try:
96
- image_writer = FileBasedDataWriter(str(local_image_dir))
97
- md_writer = FileBasedDataWriter(str(local_md_dir))
98
-
99
- # Read PDF
100
- reader = FileBasedDataReader("")
101
- pdf_bytes = reader.read(pdf_path)
102
-
103
- # Create dataset and process
104
- ds = PymuDocDataset(pdf_bytes)
105
-
106
- if ds.classify() == SupportedPdfParseMethod.OCR:
107
- infer_result = ds.apply(doc_analyze, ocr=True)
108
- pipe_result = infer_result.pipe_ocr_mode(image_writer)
109
- else:
110
- infer_result = ds.apply(doc_analyze, ocr=False)
111
- pipe_result = infer_result.pipe_txt_mode(image_writer)
112
-
113
- # Get structured content
114
- middle_json = pipe_result.get_middle_json()
115
- tables = self._extract_tables(middle_json)
116
- text_blocks = self._extract_text_blocks(middle_json)
117
-
118
- # Process text blocks with LLM
119
- products = []
120
- for block in text_blocks:
121
- product = self._process_text_block(block)
122
- if product:
123
- product.tables = tables
124
- products.append(product.to_dict())
125
-
126
- logger.info(f"Processed {len(products)} products in {time.time()-start_time:.2f}s")
127
- return {"products": products, "tables": tables}
128
  except Exception as e:
129
- logger.error(f"Error during PDF processing: {e}")
130
- raise RuntimeError("PDF processing failed.") from e
131
-
132
- def _extract_tables(self, middle_json: Dict) -> List[Dict]:
133
- """Extract tables from MinerU's middle JSON"""
134
- tables = []
135
- for page in middle_json.get('pages', []):
136
- for table in page.get('tables', []):
137
- tables.append({
138
- "page": page.get('page_number'),
139
- "cells": table.get('cells', []),
140
- "header": table.get('header', []),
141
- "content": table.get('content', [])
142
- })
143
  return tables
144
-
145
- def _extract_text_blocks(self, middle_json: Dict) -> List[str]:
146
- """Extract text blocks from MinerU's middle JSON"""
147
- text_blocks = []
148
- for page in middle_json.get('pages', []):
149
- for block in page.get('blocks', []):
150
- if block.get('type') == 'text':
151
- text_blocks.append(block.get('text', ''))
152
- return text_blocks
153
-
154
  def _process_text_block(self, text: str) -> Optional[ProductSpec]:
155
  """Process text block with LLM"""
156
  prompt = self._generate_query_prompt(text)
157
-
158
  try:
159
  response = self.llm.create_chat_completion(
160
  messages=[{"role": "user", "content": prompt}],
@@ -165,11 +130,12 @@ class PDFProcessor:
165
  except Exception as e:
166
  logger.warning(f"Error processing text block: {e}")
167
  return None
168
-
169
  def _generate_query_prompt(self, text: str) -> str:
170
  """Generate extraction prompt"""
171
  return f"""Extract product specifications from this text:
172
  {text}
 
173
  Return JSON format:
174
  {{
175
  "name": "product name",
@@ -177,7 +143,7 @@ Return JSON format:
177
  "price": numeric_price,
178
  "attributes": {{ "key": "value" }}
179
  }}"""
180
-
181
  def _parse_response(self, response: str) -> Optional[ProductSpec]:
182
  """Parse LLM response"""
183
  try:
@@ -194,6 +160,7 @@ Return JSON format:
194
  logger.warning(f"Parse error: {e}")
195
  return None
196
 
 
197
  def process_pdf_catalog(pdf_path: str):
198
  processor = PDFProcessor()
199
  try:
 
5
  from pathlib import Path
6
  from typing import List, Dict, Optional
7
  from dataclasses import dataclass
8
+ import fitz # PyMuPDF
9
+ from sentence_transformers import SentenceTransformer
10
+ from llama_cpp import Llama
11
  from fastapi.encoders import jsonable_encoder
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
+
17
  @dataclass
18
  class ProductSpec:
19
  name: str
 
25
  def to_dict(self):
26
  return jsonable_encoder(self)
27
 
28
+
29
  class PDFProcessor:
30
def __init__(self):
    """Prepare the output directory and load the embedding + LLM models."""
    # Where processed artifacts are written.
    self.output_dir = Path("./output")
    self.output_dir.mkdir(exist_ok=True)
    # Sentence-embedding model plus a GGUF chat model for extraction.
    self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
    self.llm = self._initialize_llm("deepseek-llm-7b-base.Q2_K.gguf")
36
 
37
def _initialize_emb_model(self, model_name):
    """Load the sentence-embedding model.

    Tries sentence-transformers first and falls back to a bare
    ``transformers.AutoModel`` when that package is unavailable.

    Args:
        model_name: model id under the "sentence-transformers/" namespace,
            e.g. "all-MiniLM-L6-v2".
    """
    repo_id = "sentence-transformers/" + model_name
    try:
        from sentence_transformers import SentenceTransformer
        # Bug fix: honor model_name instead of a hard-coded model id.
        return SentenceTransformer(repo_id)
    except Exception:  # narrowed from a bare except so KeyboardInterrupt etc. propagate
        # Fallback: load the raw transformer directly (no pooling head).
        # NOTE(review): the original also built an unused AutoTokenizer;
        # dropped as dead code.
        from transformers import AutoModel
        return AutoModel.from_pretrained(repo_id)
 
48
 
49
def _initialize_llm(self, model_name):
    """Initialize LLM with automatic download if needed.

    Fetches the GGUF weights from the Hugging Face hub (cached after the
    first call) and returns a llama-cpp-python model handle.

    Args:
        model_name: GGUF filename within the repo,
            e.g. "deepseek-llm-7b-base.Q2_K.gguf".
    """
    # Dead commented-out local-path loading code removed; from_pretrained
    # already caches downloads locally.
    return Llama.from_pretrained(
        repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
        filename=model_name,
    )
64
+
 
 
 
 
 
 
 
 
 
 
 
65
def process_pdf(self, pdf_path: str) -> Dict:
    """Process PDF using PyMuPDF.

    Extracts text blocks and tables from every page, then runs each text
    block through the LLM to pull structured product specs.

    Args:
        pdf_path: path to the PDF file on disk.

    Returns:
        ``{"products": [...], "tables": [...]}``.
    """
    start_time = time.time()

    text_blocks = []
    tables = []

    # Bug fix: close the document when done — the original leaked the
    # fitz handle on every call.
    doc = fitz.open(pdf_path)
    try:
        # Extract text and tables page by page.
        for page_num, page in enumerate(doc):
            text_blocks.extend(self._extract_text_blocks(page))
            tables.extend(self._extract_tables(page, page_num))
    finally:
        doc.close()

    # Process text blocks with LLM; every product carries the doc's tables.
    products = []
    for block in text_blocks:
        product = self._process_text_block(block)
        if product:
            product.tables = tables
            products.append(product.to_dict())

    logger.info(f"Processed {len(products)} products in {time.time() - start_time:.2f}s")
    return {"products": products, "tables": tables}
92
+
93
+ def _extract_text_blocks(self, page) -> List[str]:
94
+ """Extract text blocks from a PDF page"""
95
+ blocks = []
96
+ for block in page.get_text("blocks"):
97
+ blocks.append(block[4]) # The text content is at index 4
98
+ return blocks
99
+
100
+ def _extract_tables(self, page, page_num: int) -> List[Dict]:
101
+ """Extract tables from a PDF page"""
102
+ tables = []
103
  try:
104
+ tab = page.find_tables()
105
+ if tab.tables:
106
+ for table_idx, table in enumerate(tab.tables):
107
+ table_data = table.extract()
108
+ if table_data:
109
+ tables.append({
110
+ "page": page_num + 1,
111
+ "cells": table_data,
112
+ "header": table.header.names if table.header else [],
113
+ "content": table_data
114
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  except Exception as e:
116
+ logger.warning(f"Error extracting tables from page {page_num}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  return tables
118
+
 
 
 
 
 
 
 
 
 
119
  def _process_text_block(self, text: str) -> Optional[ProductSpec]:
120
  """Process text block with LLM"""
121
  prompt = self._generate_query_prompt(text)
122
+
123
  try:
124
  response = self.llm.create_chat_completion(
125
  messages=[{"role": "user", "content": prompt}],
 
130
  except Exception as e:
131
  logger.warning(f"Error processing text block: {e}")
132
  return None
133
+
134
  def _generate_query_prompt(self, text: str) -> str:
135
  """Generate extraction prompt"""
136
  return f"""Extract product specifications from this text:
137
  {text}
138
+
139
  Return JSON format:
140
  {{
141
  "name": "product name",
 
143
  "price": numeric_price,
144
  "attributes": {{ "key": "value" }}
145
  }}"""
146
+
147
  def _parse_response(self, response: str) -> Optional[ProductSpec]:
148
  """Parse LLM response"""
149
  try:
 
160
  logger.warning(f"Parse error: {e}")
161
  return None
162
 
163
+
164
  def process_pdf_catalog(pdf_path: str):
165
  processor = PDFProcessor()
166
  try:
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ sentence-transformers
2
+ gradio
3
+ llama-cpp-python
4
+ PyMuPDF