minar09 commited on
Commit
f8daace
·
verified ·
1 Parent(s): a5b6e19

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +37 -137
  2. main.py +199 -0
app.py CHANGED
@@ -1,148 +1,48 @@
1
  import os
2
  import gradio as gr
3
- import fitz # PyMuPDF
4
  import shutil
5
- import json
6
- import torch
7
- from PIL import Image
8
- import re
9
 
10
- # Import multimodal and Qwen2-VL models and processor from your dependencies.
11
- from byaldi import RAGMultiModalModel
12
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
13
- from qwen_vl_utils import process_vision_info
14
 
15
- # --- Model Initialization ---
 
 
 
16
 
17
- def initialize_models():
18
- """
19
- Loads and returns the RAG multimodal and Qwen2-VL models along with the processor.
20
- """
21
- multimodal_rag = RAGMultiModalModel.from_pretrained("vidore/colpali")
22
- qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
23
- "Qwen/Qwen2-VL-2B-Instruct",
24
- trust_remote_code=True,
25
- torch_dtype=torch.float32
26
- )
27
- qwen_processor = AutoProcessor.from_pretrained(
28
- "Qwen/Qwen2-VL-2B-Instruct",
29
- trust_remote_code=True
30
- )
31
- return multimodal_rag, qwen_model, qwen_processor
32
 
33
- multimodal_rag, qwen_model, qwen_processor = initialize_models()
34
-
35
- # --- OCR Function ---
36
- def perform_ocr(image: Image.Image) -> str:
37
- """
38
- Extracts text from an image using the Qwen2-VL model.
39
- """
40
- query = "Extract text from the image in its original language."
41
- user_input = [
42
- {
43
- "role": "user",
44
- "content": [
45
- {"type": "image", "image": image},
46
- {"type": "text", "text": query}
47
- ]
48
- }
49
- ]
50
- input_text = qwen_processor.apply_chat_template(user_input, tokenize=False, add_generation_prompt=True)
51
- image_inputs, video_inputs = process_vision_info(user_input)
52
- model_inputs = qwen_processor(
53
- text=[input_text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
54
- ).to("cpu") # Use CPU for inference
55
- with torch.no_grad():
56
- generated_ids = qwen_model.generate(**model_inputs, max_new_tokens=2000)
57
- # Remove the prompt tokens from the generated output
58
- trimmed_ids = [output[len(model_inputs.input_ids):] for model_inputs.input_ids, output in zip(model_inputs.input_ids, generated_ids)]
59
- ocr_result = qwen_processor.batch_decode(trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
60
- return ocr_result
61
-
62
- # --- Product Parsing Function ---
63
- def parse_product_info(text: str) -> dict:
64
- """
65
- Parses the combined OCR text into structured product information using Qwen2-VL.
66
- """
67
- prompt = f"""Extract product specifications from the following text. If no product information is found, return an empty JSON object with keys.
68
-
69
- Text:
70
- {text}
71
-
72
- Return JSON format exactly as:
73
- {{
74
- "name": "product name",
75
- "description": "product description",
76
- "price": numeric_price,
77
- "attributes": {{"key": "value"}}
78
- }}"""
79
- user_input = [{"role": "user", "content": prompt}]
80
- input_text = qwen_processor.apply_chat_template(user_input, tokenize=False, add_generation_prompt=True)
81
- image_inputs, video_inputs = process_vision_info(user_input)
82
- model_inputs = qwen_processor(
83
- text=[input_text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
84
- ).to("cpu")
85
- with torch.no_grad():
86
- generated_ids = qwen_model.generate(**model_inputs, max_new_tokens=512)
87
- trimmed_ids = [output[len(model_inputs.input_ids):] for model_inputs.input_ids, output in zip(model_inputs.input_ids, generated_ids)]
88
- parsed_result = qwen_processor.batch_decode(trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
89
  try:
90
- json_start = parsed_result.find('{')
91
- json_end = parsed_result.rfind('}') + 1
92
- data = json.loads(parsed_result[json_start:json_end])
93
- except Exception as e:
94
- data = {}
95
- return data
96
 
97
- # --- PDF Processing Function ---
98
- def process_pdf(pdf_file) -> dict:
99
- """
100
- Processes a PDF file by converting each page to an image,
101
- performing OCR on each page, and then parsing the combined
102
- text into structured product information.
103
- """
104
- # Create a temporary directory for the PDF file
105
- temp_dir = "./temp_pdf/"
106
- os.makedirs(temp_dir, exist_ok=True)
107
- pdf_path = os.path.join(temp_dir, pdf_file.name)
108
- with open(pdf_path, "wb") as f:
109
- if hasattr(pdf_file, "file"):
110
- shutil.copyfileobj(pdf_file.file, f)
111
- elif hasattr(pdf_file, "name"):
112
- # In case pdf_file is a path string (unlikely in Gradio, but safe-guard)
113
- shutil.copy(pdf_file.name, pdf_path)
114
- else:
115
- raise TypeError("Invalid file input type.")
116
-
117
- # Open the PDF file using PyMuPDF
118
- try:
119
- doc = fitz.open(pdf_path)
120
- except Exception as e:
121
- raise RuntimeError(f"Cannot open PDF file: {e}")
122
-
123
- combined_text = ""
124
- # Iterate over each page and extract text via OCR
125
- for page in doc:
126
- try:
127
- # Render page as image; adjust dpi as needed for quality/speed balance
128
- pix = page.get_pixmap(dpi=150)
129
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
130
- page_text = perform_ocr(img)
131
- combined_text += page_text + "\n"
132
- except Exception as e:
133
- print(f"Warning: Failed to process page {page.number + 1}: {e}")
134
-
135
- # Parse the combined OCR text into structured product info
136
- product_info = parse_product_info(combined_text)
137
- return product_info
138
 
139
- # --- Gradio Interface ---
140
- with gr.Blocks() as interface:
141
- gr.Markdown("<h1 style='text-align: center;'>PDF Product Info Extractor</h1>")
142
- with gr.Row():
143
- pdf_input = gr.File(label="Upload PDF File", file_count="single")
144
- extract_btn = gr.Button("Extract Product Info")
145
- output_box = gr.JSON(label="Extracted Product Info")
146
- extract_btn.click(process_pdf, inputs=pdf_input, outputs=output_box)
147
 
148
- interface.launch(debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import gradio as gr
3
+ import main
4
  import shutil
 
 
 
 
5
 
 
 
 
 
6
 
7
def predict_from_pdf(pdf_file):
    """Persist an uploaded PDF into ./catalogue/ and run the catalog extractor.

    Parameters
    ----------
    pdf_file : gradio file object or str
        Upload from ``gr.File``. Gradio may hand over a tempfile wrapper
        exposing the path via ``.name`` or a plain path string.

    Returns
    -------
    tuple
        ``(product_info_dict, status_message)`` on success,
        ``(None, error_message)`` on failure.
    """
    # Guard: Gradio passes None when the user clicks without uploading.
    if pdf_file is None:
        return None, "Error: no PDF file was provided."

    # Create a persistent directory for file uploads.
    upload_dir = "./catalogue/"
    os.makedirs(upload_dir, exist_ok=True)

    # Resolve the source path whether we got a file object or a raw path.
    src_path = getattr(pdf_file, "name", pdf_file)
    dest_file_path = os.path.join(upload_dir, os.path.basename(src_path))

    try:
        # Copy only when source and destination differ; shutil.copy raises
        # SameFileError when they are the same file.
        if os.path.abspath(src_path) != os.path.abspath(dest_file_path):
            shutil.copy(src_path, dest_file_path)

        # Check that the file landed where we expect before processing.
        if not os.path.exists(dest_file_path):
            return None, f"Error: The file {dest_file_path} could not be found or opened."

        # Process the PDF and retrieve the structured product info.
        df, response = main.process_pdf_catalog(dest_file_path)
        return df, response

    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return None, f"Error processing PDF: {str(e)}"
29
+
30
+
31
# Example catalogs offered in the UI (paths relative to the app root).
pdf_examples = [
    [path]
    for path in ("catalogue/flexpocket.pdf", "catalogue/ASICS_Catalog.pdf")
]

# Single-upload, dual-output (JSON + status text) front end for the parser.
demo = gr.Interface(
    fn=predict_from_pdf,
    inputs=gr.File(label="Upload PDF Catalog"),
    outputs=["json", "text"],
    examples=pdf_examples,
    title="Open Source PDF Catalog Parser",
    description="Efficient PDF catalog processing using fitz and OpenLLM",
    article="Uses PyMuPDF for layout analysis and Llama-CPP for structured extraction",
)

if __name__ == "__main__":
    # Bind on all interfaces so the app is reachable from inside a container.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=True)
main.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import List, Dict, Optional
7
+ from dataclasses import dataclass
8
+ from fastapi.encoders import jsonable_encoder
9
+ import fitz # PyMuPDF
10
+ from sentence_transformers import SentenceTransformer
11
+ from llama_cpp import Llama
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
@dataclass
class ProductSpec:
    """Structured product record extracted from one catalog text block.

    Attributes:
        name: Product display name; empty string when unknown.
        description: Free-text description, if the LLM produced one.
        price: Numeric price, if found.
        attributes: Key/value spec pairs (e.g. color, size).
        tables: Table payloads attached after extraction (page, cells, ...).
    """

    name: str
    description: Optional[str] = None
    price: Optional[float] = None
    # default_factory gives each instance its own container instead of the
    # `= None` placeholder the previous version used (and avoids the shared
    # mutable-default pitfall). Callers that passed explicit values are
    # unaffected.
    attributes: Dict[str, str] = field(default_factory=dict)
    tables: List[Dict] = field(default_factory=list)

    def to_dict(self):
        """Return a JSON-serializable dict view of this record."""
        return jsonable_encoder(self)
27
+
28
+
29
class PDFProcessor:
    """Extracts structured product data from PDF catalogs.

    Pipeline: PyMuPDF (fitz) pulls text blocks and tables per page, then a
    local llama-cpp model turns each text block into a ProductSpec via a
    JSON-extraction prompt.
    """

    def __init__(self):
        # NOTE(review): emb_model is initialized but not referenced by any
        # method in this file — confirm it is used elsewhere or intended.
        self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
        # Choose the appropriate model filename below; adjust if needed.
        # self.llm = self._initialize_llm("deepseek-llm-7b-base.Q2_K.gguf")
        self.llm = self._initialize_llm("llama-2-7b.Q2_K.gguf")
        # All derived artifacts are written under ./output.
        self.output_dir = Path("./output")
        self.output_dir.mkdir(exist_ok=True)

    def _initialize_emb_model(self, model_name):
        """Load the sentence-embedding model, falling back to raw transformers.

        Returns a SentenceTransformer when available; otherwise a bare
        transformers AutoModel (note: the fallback returns only the model,
        not the tokenizer).
        """
        try:
            # Use SentenceTransformer if available
            return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        except Exception as e:
            logger.warning(f"SentenceTransformer failed: {e}. Falling back to transformers model.")
            from transformers import AutoTokenizer, AutoModel
            # NOTE(review): tokenizer is created but discarded — the caller
            # only receives the model; confirm that is sufficient downstream.
            tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/" + model_name)
            model = AutoModel.from_pretrained("sentence-transformers/" + model_name)
            return model

    def _initialize_llm(self, model_name):
        """Initialize LLM with automatic download if needed"""
        # Here we use from_pretrained so that if the model is missing locally it downloads it.
        model_path = os.path.join("models/", model_name)
        if os.path.exists(model_path):
            return Llama(
                model_path=model_path,
                n_ctx=1024,
                n_gpu_layers=-1,  # offload all layers to GPU when built with GPU support
                # NOTE(review): os.cpu_count() may return None on some
                # platforms, which would make this subtraction raise — confirm.
                n_threads=os.cpu_count() - 1,
                verbose=False
            )
        else:
            # No local copy: pull a hosted GGUF from the Hugging Face Hub.
            # NOTE(review): the downloaded filename differs from model_name —
            # confirm this substitution is intentional.
            return Llama.from_pretrained(
                repo_id="Tien203/llama.cpp",
                filename="Llama-2-7b-hf-q4_0.gguf",
            )

    def process_pdf(self, pdf_path: str) -> Dict:
        """Process PDF using PyMuPDF.

        Returns a dict with keys "products" (list of ProductSpec dicts) and
        "tables" (list of per-page table payloads). Raises RuntimeError when
        the PDF cannot be opened.
        """
        start_time = time.time()

        # Open PDF
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            logger.error(f"Failed to open PDF: {e}")
            raise RuntimeError("Cannot open PDF file.") from e

        text_blocks = []
        tables = []

        # Extract text and tables from each page
        for page_num, page in enumerate(doc):
            # Extract text blocks from page and filter out very short blocks (noise)
            blocks = self._extract_text_blocks(page)
            filtered = [block for block in blocks if len(block.strip()) >= 10]
            logger.debug(f"Page {page_num + 1}: Extracted {len(blocks)} blocks, {len(filtered)} kept after filtering.")
            text_blocks.extend(filtered)

            # Extract tables (if any)
            tables.extend(self._extract_tables(page, page_num))

        # Process text blocks with LLM to extract product information
        products = []
        for idx, block in enumerate(text_blocks):
            # Log the text block for debugging
            logger.debug(f"Processing text block {idx}: {block[:100]}...")
            product = self._process_text_block(block)
            if product:
                # NOTE(review): every product receives the full document-wide
                # table list, not just its own page's tables — confirm intended.
                product.tables = tables
                # Only add if at least one key (like name) is non-empty
                if product.name or product.description or product.price or (
                        product.attributes and len(product.attributes) > 0):
                    products.append(product.to_dict())
                else:
                    logger.debug(f"LLM returned empty product for block {idx}.")
            else:
                logger.debug(f"No product extracted from block {idx}.")

        logger.info(f"Processed {len(products)} products in {time.time() - start_time:.2f}s")
        return {"products": products, "tables": tables}

    def _extract_text_blocks(self, page) -> List[str]:
        """Extract text blocks from a PDF page using PyMuPDF's blocks method."""
        blocks = []
        for block in page.get_text("blocks"):
            # block[4] contains the text content
            text = block[4].strip()
            if text:
                blocks.append(text)
        return blocks

    def _extract_tables(self, page, page_num: int) -> List[Dict]:
        """Extract tables from a PDF page using PyMuPDF's table extraction (if available).

        Failures are logged and swallowed so one bad page cannot abort the
        document; returns a possibly-empty list.
        """
        tables = []
        try:
            tab = page.find_tables()
            if tab and hasattr(tab, 'tables') and tab.tables:
                for table in tab.tables:
                    table_data = table.extract()
                    if table_data:
                        tables.append({
                            "page": page_num + 1,  # 1-based page number for display
                            # NOTE(review): "cells" and "content" carry the
                            # same payload — confirm both keys are consumed.
                            "cells": table_data,
                            "header": table.header.names if table.header else [],
                            "content": table_data
                        })
        except Exception as e:
            logger.warning(f"Error extracting tables from page {page_num + 1}: {e}")
        return tables

    def _process_text_block(self, text: str) -> Optional[ProductSpec]:
        """Process a text block with LLM to extract product specifications.

        Returns None on any LLM or parsing failure (best-effort per block).
        """
        prompt = self._generate_query_prompt(text)
        logger.debug(f"Generated prompt: {prompt[:200]}...")
        try:
            response = self.llm.create_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,  # near-deterministic output for JSON extraction
                max_tokens=512
            )
            # Debug: log raw response
            logger.debug(f"LLM raw response: {response}")
            return self._parse_response(response['choices'][0]['message']['content'])
        except Exception as e:
            logger.warning(f"Error processing text block: {e}")
            return None

    def _generate_query_prompt(self, text: str) -> str:
        """Generate a prompt instructing the LLM to extract product information."""
        return f"""Extract product specifications from the following text. If no product is found, return an empty JSON object with keys.\n\nText:\n{text}\n\nReturn JSON format exactly as:\n{{\n \"name\": \"product name\",\n \"description\": \"product description\",\n \"price\": numeric_price,\n \"attributes\": {{ \"key\": \"value\" }}\n}}"""

    def _parse_response(self, response: str) -> Optional[ProductSpec]:
        """Parse the LLM's response to extract a product specification.

        Slices out the outermost {...} span, decodes it, and builds a
        ProductSpec; returns None when no usable JSON (or an all-empty
        object) is found.
        """
        try:
            json_start = response.find('{')
            json_end = response.rfind('}') + 1
            json_str = response[json_start:json_end].strip()
            if not json_str:
                raise ValueError("No JSON content found in response.")
            data = json.loads(json_str)
            # If the returned JSON is essentially empty, return None
            if all(not data.get(key) for key in ['name', 'description', 'price', 'attributes']):
                return None
            return ProductSpec(
                name=data.get('name', ''),
                description=data.get('description'),
                price=data.get('price'),
                attributes=data.get('attributes', {})
            )
        except (json.JSONDecodeError, KeyError, ValueError) as e:
            logger.warning(f"Parse error: {e} in response: {response}")
            return None
183
+
184
+
185
def process_pdf_catalog(pdf_path: str):
    """Run the full catalog-extraction pipeline on a single PDF.

    Args:
        pdf_path: Filesystem path to the PDF catalog.

    Returns:
        tuple: ``(result_dict, status_message)``. On success the dict holds
        "products" and "tables"; on failure an empty dict is returned with a
        message that now includes the underlying error (previously the
        detail was logged but dropped from the UI-facing message).
    """
    try:
        # Construct inside the try so model-initialization failures follow
        # the same (result, message) contract instead of propagating.
        processor = PDFProcessor()
        result = processor.process_pdf(pdf_path)
        return result, "Processing completed successfully!"
    except Exception as e:
        logger.error(f"Processing failed: {e}")
        return {}, f"Error processing PDF: {e}"
193
+
194
+
195
if __name__ == "__main__":
    # Smoke-test entry point: replace the placeholder with a real PDF path
    # (or call process_pdf_catalog directly from other code).
    sample_pdf = "path/to/your/pdf_file.pdf"
    extraction, status = process_pdf_catalog(sample_pdf)
    print(extraction, status)