File size: 9,206 Bytes

f0f2280

"""
Hugging Face compatible wrapper for B2B Ecommerce NER Model
"""

import spacy
import json
import os
from typing import List, Dict, Any, Optional
from pathlib import Path
import pandas as pd
from fuzzywuzzy import fuzz, process
import pickle
import numpy as np


class B2BEcommerceNER:
    """
    Hugging Face compatible B2B Ecommerce Named Entity Recognition model.

    Wraps a spaCy pipeline and groups the entities it finds by label:
    - PRODUCT: Product names and descriptions
    - QUANTITY: Order quantities
    - SIZE: Product sizes and measurements
    - UNIT: Units of measurement

    When a product catalog is loaded, every PRODUCT entity is additionally
    fuzzy-matched against the catalog's ``Product`` column.
    """

    # Columns the fuzzy matcher reads from the catalog.
    _CATALOG_COLUMNS = ('Brand', 'Product', 'SKU')

    # Entity label -> key of the result group the entity is filed under.
    _LABEL_TO_GROUP = {
        'PRODUCT': 'products',
        'QUANTITY': 'quantities',
        'SIZE': 'sizes',
        'UNIT': 'units',
    }

    def __init__(self, model_path: Optional[str] = None, catalog_path: Optional[str] = None):
        """
        Initialize the B2B Ecommerce NER model.

        The model and the catalog are loaded immediately, but only if their
        paths exist; otherwise ``nlp`` / ``catalog_df`` stay ``None``.

        Args:
            model_path: Path to the spaCy model directory
                (defaults to ``"spacy_model"``)
            catalog_path: Path to the product catalog CSV file
                (defaults to ``"product_catalog.csv"``)
        """
        self.model_path = model_path or "spacy_model"
        self.catalog_path = catalog_path or "product_catalog.csv"
        self.nlp = None
        self.catalog_df = None
        self.entity_labels = ['PRODUCT', 'QUANTITY', 'SIZE', 'UNIT']

        # Load model and catalog if available
        if os.path.exists(self.model_path):
            self.load_model()
        if os.path.exists(self.catalog_path):
            self.load_catalog()

    def load_model(self):
        """
        Load the spaCy NER model from ``self.model_path``.

        Raises:
            Exception: whatever ``spacy.load`` raised; the error is printed
                and then re-raised unchanged.
        """
        try:
            self.nlp = spacy.load(self.model_path)
            print(f"Loaded spaCy model from {self.model_path}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def load_catalog(self):
        """
        Load the product catalog used for fuzzy matching.

        Loading is best-effort: on any failure (unreadable file, missing
        required column) the error is printed and ``catalog_df`` is set to
        ``None``, which disables catalog matching instead of failing later.
        """
        try:
            catalog = pd.read_csv(self.catalog_path)
            # Reject an unusable catalog now rather than hitting a KeyError
            # in the middle of a prediction.
            missing = [col for col in self._CATALOG_COLUMNS if col not in catalog.columns]
            if missing:
                raise ValueError(f"catalog is missing required columns: {missing}")
            self.catalog_df = catalog
            print(f"Loaded product catalog with {len(self.catalog_df)} products")
        except Exception as e:
            print(f"Error loading catalog: {e}")
            self.catalog_df = None

    def predict(self, texts: List[str]) -> List[Dict[str, Any]]:
        """
        Predict entities for a list of texts.

        Args:
            texts: List of text strings to process. A single string is
                accepted too and treated as a one-element list.

        Returns:
            List of predictions with entities and catalog matches, one per
            input text and in input order.

        Raises:
            ValueError: if no spaCy model has been loaded.
        """
        if self.nlp is None:
            raise ValueError("Model not loaded. Please call load_model() first.")

        # A bare string is iterable; without this guard it would be processed
        # one character at a time.
        if isinstance(texts, str):
            texts = [texts]

        return [self._extract_entities(text) for text in texts]

    def _extract_entities(self, text: str) -> Dict[str, Any]:
        """
        Extract entities from a single text.

        Returns:
            Dict with the input ``text``, the grouped ``entities`` (including
            ``catalog_matches``) and ``total_entities``, which counts every
            entity spaCy returned, including labels that are not grouped.
        """
        doc = self.nlp(text)

        entities = {
            'products': [],
            'quantities': [],
            'sizes': [],
            'units': [],
            'catalog_matches': []
        }

        for ent in doc.ents:
            group = self._LABEL_TO_GROUP.get(ent.label_)
            if group is None:
                # Labels outside the four known types are not reported.
                continue

            entities[group].append({
                'text': ent.text,
                'label': ent.label_,
                'start': ent.start_char,
                'end': ent.end_char,
                'confidence': 1.0  # spaCy doesn't provide confidence by default
            })

            # Add catalog matching if available
            if ent.label_ == 'PRODUCT' and self.catalog_df is not None:
                entities['catalog_matches'].extend(self._fuzzy_match_product(ent.text))

        return {
            'text': text,
            'entities': entities,
            'total_entities': len(doc.ents)
        }

    def _fuzzy_match_product(self, product_text: str, threshold: int = 60, top_n: int = 3) -> List[Dict]:
        """
        Perform fuzzy matching against the product catalog.

        Args:
            product_text: Product mention to look up
            threshold: Minimum match score for a match to be kept
            top_n: Maximum number of candidates requested from the matcher

        Returns:
            List of match dicts (``brand``, ``product``, ``sku``,
            ``match_score``, ``original_query``); empty when no catalog is
            loaded or nothing reaches the threshold.
        """
        if self.catalog_df is None:
            return []

        # Key each usable product name by its row position. The matcher hands
        # the key back, so the row is fetched directly instead of being
        # re-found by name (which fails for blank names and always picks the
        # first of several rows sharing a name).
        choices = {
            position: str(name)
            for position, name in enumerate(self.catalog_df['Product'].tolist())
            if pd.notna(name)
        }
        if not choices:
            return []

        # With a dict of choices fuzzywuzzy yields (choice, score, key) triples.
        matches = process.extract(product_text, choices, limit=top_n, scorer=fuzz.token_sort_ratio)

        results = []
        for _name, score, position in matches:
            if score < threshold:
                continue
            catalog_row = self.catalog_df.iloc[position]
            results.append({
                'brand': catalog_row['Brand'],
                'product': catalog_row['Product'],
                'sku': catalog_row['SKU'],
                'match_score': score,
                'original_query': product_text
            })

        return results

    @staticmethod
    def _same_path(first: str, second: str) -> bool:
        """Return True when both paths exist and refer to the same file or directory."""
        if os.path.exists(first) and os.path.exists(second):
            return os.path.samefile(first, second)
        return False

    def save_pretrained(self, save_directory: str):
        """
        Save the model in Hugging Face format.

        Writes ``config.json`` and copies the spaCy model directory (as
        ``spacy_model``) and the catalog (as ``product_catalog.csv``) into
        ``save_directory`` when they exist.

        Args:
            save_directory: Directory to save the model
        """
        import shutil

        os.makedirs(save_directory, exist_ok=True)

        # Save model configuration
        config = {
            "model_type": "b2b_ecommerce_ner",
            "entity_labels": self.entity_labels,
            "spacy_model_path": self.model_path,
            "catalog_path": self.catalog_path,
            "framework": "spacy",
            "task": "token-classification",
            "language": "en"
        }

        with open(os.path.join(save_directory, "config.json"), "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        # Copy spaCy model files if they exist
        if os.path.exists(self.model_path):
            target_model_path = os.path.join(save_directory, "spacy_model")
            # Saving back into the directory the model was loaded from makes
            # source and target identical; removing the target would then
            # delete the model itself before it could be copied.
            if not self._same_path(self.model_path, target_model_path):
                if os.path.exists(target_model_path):
                    shutil.rmtree(target_model_path)
                shutil.copytree(self.model_path, target_model_path)

        # Copy catalog file if it exists
        if os.path.exists(self.catalog_path):
            target_catalog_path = os.path.join(save_directory, "product_catalog.csv")
            # shutil.copy raises SameFileError when asked to copy a file onto itself.
            if not self._same_path(self.catalog_path, target_catalog_path):
                shutil.copy(self.catalog_path, target_catalog_path)

        print(f"Model saved to {save_directory}")

    @classmethod
    def from_pretrained(cls, model_path: str):
        """
        Load a model from a saved directory.

        Args:
            model_path: Path to the saved model directory

        Returns:
            B2BEcommerceNER instance. ``entity_labels`` is restored from
            ``config.json`` when the config provides a non-empty list.

        Raises:
            ValueError: if ``model_path`` contains no ``config.json``.
        """
        config_path = os.path.join(model_path, "config.json")
        if not os.path.exists(config_path):
            raise ValueError(f"No config.json found in {model_path}")

        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        spacy_model_path = os.path.join(model_path, "spacy_model")
        catalog_path = os.path.join(model_path, "product_catalog.csv")

        # Passing None makes the constructor fall back to its default paths.
        model = cls(
            model_path=spacy_model_path if os.path.exists(spacy_model_path) else None,
            catalog_path=catalog_path if os.path.exists(catalog_path) else None
        )

        saved_labels = config.get("entity_labels") if isinstance(config, dict) else None
        if saved_labels:
            model.entity_labels = list(saved_labels)

        return model

    def pipeline(self, text: str) -> List[Dict[str, Any]]:
        """
        Process a single text through the complete pipeline.
        This method makes the model compatible with Hugging Face pipeline interface.

        Args:
            text: Text to process

        Returns:
            List of entity dicts with the keys ``entity``, ``score``,
            ``index``, ``word``, ``start`` and ``end``. Entities are grouped
            by type (products, quantities, sizes, units); catalog matches are
            not included.

        Raises:
            ValueError: if no spaCy model has been loaded.
        """
        if self.nlp is None:
            raise ValueError("Model not loaded. Please call load_model() first.")

        result = self._extract_entities(text)

        # Format for Hugging Face pipeline compatibility
        formatted_entities = []
        for entity_type, entity_list in result['entities'].items():
            if entity_type == 'catalog_matches':
                continue
            for entity in entity_list:
                formatted_entities.append({
                    'entity': entity['label'],
                    'score': entity['confidence'],
                    'index': None,  # Token index not available in spaCy
                    'word': entity['text'],
                    'start': entity['start'],
                    'end': entity['end']
                })

        return formatted_entities


# Convenience functions for Hugging Face compatibility
def load_model(model_path: str = "b2b-ecommerce-ner"):
    """
    Build a B2BEcommerceNER from a saved model directory.

    Args:
        model_path: Directory handed to ``B2BEcommerceNER.from_pretrained``

    Returns:
        The B2BEcommerceNER instance produced by ``from_pretrained``
    """
    ner_model = B2BEcommerceNER.from_pretrained(model_path)
    return ner_model


def pipeline(task: str = "ner", model: str = "b2b-ecommerce-ner"):
    """
    Build a callable that runs the B2B Ecommerce NER model on one text.

    Args:
        task: Task name; ``"ner"`` is the only accepted value
        model: Saved model directory passed on to ``load_model``

    Returns:
        A function taking a text and returning the result of the loaded
        model's ``pipeline`` method for it.

    Raises:
        ValueError: if ``task`` is anything other than ``"ner"``.
    """
    if task == "ner":
        ner_model = load_model(model)

        def _pipeline(text: str):
            # Delegate to the model instance captured above.
            return ner_model.pipeline(text)

        return _pipeline

    raise ValueError("Only 'ner' task is supported")