""" Hugging Face compatible wrapper for B2B Ecommerce NER Model """ import spacy import json import os from typing import List, Dict, Any, Optional from pathlib import Path import pandas as pd from fuzzywuzzy import fuzz, process import pickle import numpy as np class B2BEcommerceNER: """ Hugging Face compatible B2B Ecommerce Named Entity Recognition model. This model extracts structured information from B2B ecommerce orders including: - PRODUCT: Product names and descriptions - QUANTITY: Order quantities - SIZE: Product sizes and measurements - UNIT: Units of measurement The model also includes fuzzy matching against a product catalog for enhanced accuracy. """ def __init__(self, model_path: Optional[str] = None, catalog_path: Optional[str] = None): """ Initialize the B2B Ecommerce NER model. Args: model_path: Path to the spaCy model directory catalog_path: Path to the product catalog CSV file """ self.model_path = model_path or "spacy_model" self.catalog_path = catalog_path or "product_catalog.csv" self.nlp = None self.catalog_df = None self.entity_labels = ['PRODUCT', 'QUANTITY', 'SIZE', 'UNIT'] # Load model and catalog if available if os.path.exists(self.model_path): self.load_model() if os.path.exists(self.catalog_path): self.load_catalog() def load_model(self): """Load the spaCy NER model""" try: self.nlp = spacy.load(self.model_path) print(f"Loaded spaCy model from {self.model_path}") except Exception as e: print(f"Error loading model: {e}") raise def load_catalog(self): """Load the product catalog for fuzzy matching""" try: self.catalog_df = pd.read_csv(self.catalog_path) print(f"Loaded product catalog with {len(self.catalog_df)} products") except Exception as e: print(f"Error loading catalog: {e}") self.catalog_df = None def predict(self, texts: List[str]) -> List[Dict[str, Any]]: """ Predict entities for a list of texts. Args: texts: List of text strings to process Returns: List of predictions with entities and catalog matches """ if self.nlp is None: raise ValueError("Model not loaded. Please call load_model() first.") results = [] for text in texts: result = self._extract_entities(text) results.append(result) return results def _extract_entities(self, text: str) -> Dict[str, Any]: """Extract entities from a single text""" doc = self.nlp(text) entities = { 'products': [], 'quantities': [], 'sizes': [], 'units': [], 'catalog_matches': [] } # Extract entities by type for ent in doc.ents: entity_info = { 'text': ent.text, 'label': ent.label_, 'start': ent.start_char, 'end': ent.end_char, 'confidence': 1.0 # spaCy doesn't provide confidence by default } if ent.label_ == 'PRODUCT': entities['products'].append(entity_info) # Add catalog matching if available if self.catalog_df is not None: matches = self._fuzzy_match_product(ent.text) entities['catalog_matches'].extend(matches) elif ent.label_ == 'QUANTITY': entities['quantities'].append(entity_info) elif ent.label_ == 'SIZE': entities['sizes'].append(entity_info) elif ent.label_ == 'UNIT': entities['units'].append(entity_info) return { 'text': text, 'entities': entities, 'total_entities': len(doc.ents) } def _fuzzy_match_product(self, product_text: str, threshold: int = 60, top_n: int = 3) -> List[Dict]: """Perform fuzzy matching against product catalog""" if self.catalog_df is None: return [] # Prepare product names for matching product_names = self.catalog_df['Product'].fillna('').tolist() # Use fuzzywuzzy to find matches matches = process.extract(product_text, product_names, limit=top_n, scorer=fuzz.token_sort_ratio) results = [] for match_text, score in matches: if score >= threshold: # Find the corresponding row in catalog catalog_row = self.catalog_df[self.catalog_df['Product'] == match_text].iloc[0] match_info = { 'brand': catalog_row['Brand'], 'product': catalog_row['Product'], 'sku': catalog_row['SKU'], 'match_score': score, 'original_query': product_text } results.append(match_info) return results def save_pretrained(self, save_directory: str): """ Save the model in Hugging Face format. Args: save_directory: Directory to save the model """ os.makedirs(save_directory, exist_ok=True) # Save model configuration config = { "model_type": "b2b_ecommerce_ner", "entity_labels": self.entity_labels, "spacy_model_path": self.model_path, "catalog_path": self.catalog_path, "framework": "spacy", "task": "token-classification", "language": "en" } with open(os.path.join(save_directory, "config.json"), "w") as f: json.dump(config, f, indent=2) # Copy spaCy model files if they exist if os.path.exists(self.model_path): import shutil target_model_path = os.path.join(save_directory, "spacy_model") if os.path.exists(target_model_path): shutil.rmtree(target_model_path) shutil.copytree(self.model_path, target_model_path) # Copy catalog file if it exists if os.path.exists(self.catalog_path): import shutil shutil.copy(self.catalog_path, os.path.join(save_directory, "product_catalog.csv")) print(f"Model saved to {save_directory}") @classmethod def from_pretrained(cls, model_path: str): """ Load a model from a saved directory. Args: model_path: Path to the saved model directory Returns: B2BEcommerceNER instance """ config_path = os.path.join(model_path, "config.json") if not os.path.exists(config_path): raise ValueError(f"No config.json found in {model_path}") with open(config_path, "r") as f: config = json.load(f) spacy_model_path = os.path.join(model_path, "spacy_model") catalog_path = os.path.join(model_path, "product_catalog.csv") model = cls( model_path=spacy_model_path if os.path.exists(spacy_model_path) else None, catalog_path=catalog_path if os.path.exists(catalog_path) else None ) return model def pipeline(self, text: str) -> Dict[str, Any]: """ Process a single text through the complete pipeline. This method makes the model compatible with Hugging Face pipeline interface. """ result = self._extract_entities(text) # Format for Hugging Face pipeline compatibility formatted_entities = [] for entity_type, entity_list in result['entities'].items(): if entity_type != 'catalog_matches': for entity in entity_list: formatted_entities.append({ 'entity': entity['label'], 'score': entity['confidence'], 'index': None, # Token index not available in spaCy 'word': entity['text'], 'start': entity['start'], 'end': entity['end'] }) return formatted_entities # Convenience functions for Hugging Face compatibility def load_model(model_path: str = "b2b-ecommerce-ner"): """Load the B2B Ecommerce NER model""" return B2BEcommerceNER.from_pretrained(model_path) def pipeline(task: str = "ner", model: str = "b2b-ecommerce-ner"): """Create a pipeline for the B2B Ecommerce NER model""" if task != "ner": raise ValueError("Only 'ner' task is supported") model_instance = load_model(model) def _pipeline(text: str): return model_instance.pipeline(text) return _pipeline