# b2b-ecomm-ner / model.py
# Hugging Face Hub page header (not part of the program):
#   uploaded by Purva17 -- "Upload folder using huggingface_hub"
#   commit f0f2280 (verified)
"""
Hugging Face compatible wrapper for B2B Ecommerce NER Model
"""
import spacy
import json
import os
from typing import List, Dict, Any, Optional
from pathlib import Path
import pandas as pd
from fuzzywuzzy import fuzz, process
import pickle
import numpy as np
class B2BEcommerceNER:
    """
    Hugging Face compatible B2B Ecommerce Named Entity Recognition model.

    This model extracts structured information from B2B ecommerce orders including:
    - PRODUCT: Product names and descriptions
    - QUANTITY: Order quantities
    - SIZE: Product sizes and measurements
    - UNIT: Units of measurement

    The model also includes fuzzy matching against a product catalog for enhanced accuracy.
    """

    # Maps each handled entity label to the key under which its entities are
    # grouped in the prediction output. Labels not listed here are still
    # counted in 'total_entities' but are not placed in any group.
    _LABEL_TO_GROUP = {
        'PRODUCT': 'products',
        'QUANTITY': 'quantities',
        'SIZE': 'sizes',
        'UNIT': 'units',
    }

    def __init__(self, model_path: Optional[str] = None, catalog_path: Optional[str] = None):
        """
        Initialize the B2B Ecommerce NER model.

        The spaCy model and the catalog are loaded immediately, but only when
        their paths exist on disk; otherwise ``nlp`` / ``catalog_df`` stay None.

        Args:
            model_path: Path to the spaCy model directory (defaults to
                "spacy_model" when None or empty).
            catalog_path: Path to the product catalog CSV file (defaults to
                "product_catalog.csv" when None or empty).
        """
        self.model_path = model_path or "spacy_model"
        self.catalog_path = catalog_path or "product_catalog.csv"
        self.nlp = None
        self.catalog_df = None
        self.entity_labels = ['PRODUCT', 'QUANTITY', 'SIZE', 'UNIT']

        # Load model and catalog if available
        if os.path.exists(self.model_path):
            self.load_model()
        if os.path.exists(self.catalog_path):
            self.load_catalog()

    def load_model(self):
        """Load the spaCy NER model from ``self.model_path``.

        Raises:
            Exception: whatever ``spacy.load`` raises; the error is printed
                and then re-raised unchanged.
        """
        try:
            self.nlp = spacy.load(self.model_path)
            print(f"Loaded spaCy model from {self.model_path}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def load_catalog(self):
        """Load the product catalog used for fuzzy matching.

        This is best-effort: on any read error the problem is printed and
        ``self.catalog_df`` is set to None, which disables catalog matching.
        """
        try:
            self.catalog_df = pd.read_csv(self.catalog_path)
            print(f"Loaded product catalog with {len(self.catalog_df)} products")
        except Exception as e:
            print(f"Error loading catalog: {e}")
            self.catalog_df = None

    def _require_model(self):
        """Raise ValueError when no spaCy pipeline has been loaded yet."""
        if self.nlp is None:
            raise ValueError("Model not loaded. Please call load_model() first.")

    def predict(self, texts: List[str]) -> List[Dict[str, Any]]:
        """
        Predict entities for a list of texts.

        Args:
            texts: List of text strings to process. A single string is
                accepted as well and treated as a one-element list.

        Returns:
            List of predictions with entities and catalog matches, one per
            input text and in the same order.

        Raises:
            ValueError: if the spaCy model has not been loaded.
        """
        self._require_model()
        # A bare string would otherwise be iterated character by character.
        if isinstance(texts, str):
            texts = [texts]
        return [self._extract_entities(text) for text in texts]

    def _extract_entities(self, text: str) -> Dict[str, Any]:
        """Extract entities from a single text.

        Returns:
            Dict with the input 'text', the grouped 'entities' (including
            'catalog_matches' for PRODUCT entities when a catalog is loaded)
            and 'total_entities', the number of entities spaCy returned.

        Raises:
            ValueError: if the spaCy model has not been loaded.
        """
        self._require_model()
        doc = self.nlp(text)
        entities = {
            'products': [],
            'quantities': [],
            'sizes': [],
            'units': [],
            'catalog_matches': []
        }

        # Extract entities by type
        for ent in doc.ents:
            group = self._LABEL_TO_GROUP.get(ent.label_)
            if group is None:
                continue
            entities[group].append({
                'text': ent.text,
                'label': ent.label_,
                'start': ent.start_char,
                'end': ent.end_char,
                'confidence': 1.0  # spaCy doesn't provide confidence by default
            })
            # Add catalog matching if available
            if ent.label_ == 'PRODUCT' and self.catalog_df is not None:
                entities['catalog_matches'].extend(self._fuzzy_match_product(ent.text))

        return {
            'text': text,
            'entities': entities,
            'total_entities': len(doc.ents)
        }

    def _fuzzy_match_product(self, product_text: str, threshold: int = 60, top_n: int = 3) -> List[Dict]:
        """Perform fuzzy matching against product catalog.

        Args:
            product_text: Product text to look up.
            threshold: Minimum score a candidate needs to be returned.
            top_n: Maximum number of candidates requested from the matcher.

        Returns:
            List of dicts with 'brand', 'product', 'sku', 'match_score' and
            'original_query'; empty when no catalog is loaded.
        """
        if self.catalog_df is None:
            return []

        # Key every usable product name by its row position. Passing a mapping
        # makes the matcher hand the key back with each hit, so the catalog row
        # is fetched directly instead of being searched for by name (which
        # always resolved duplicate names to the first row and failed on
        # missing values). Non-string cells (e.g. NaN) are skipped.
        choices = {
            position: name
            for position, name in enumerate(self.catalog_df['Product'].tolist())
            if isinstance(name, str)
        }

        # Use fuzzywuzzy to find matches
        matches = process.extract(product_text, choices, limit=top_n, scorer=fuzz.token_sort_ratio)

        results = []
        for _match_text, score, position in matches:
            if score < threshold:
                continue
            catalog_row = self.catalog_df.iloc[position]
            results.append({
                'brand': catalog_row['Brand'],
                'product': catalog_row['Product'],
                'sku': catalog_row['SKU'],
                'match_score': score,
                'original_query': product_text
            })
        return results

    @staticmethod
    def _same_path(first: str, second: str) -> bool:
        """Return True when both paths resolve to the same filesystem location."""
        return (os.path.normcase(os.path.realpath(first))
                == os.path.normcase(os.path.realpath(second)))

    def save_pretrained(self, save_directory: str):
        """
        Save the model in Hugging Face format.

        Writes ``config.json`` and copies the spaCy model directory and the
        catalog CSV (when they exist) into ``save_directory``. Saving into the
        directory the model was loaded from is safe: files that are already in
        place are left untouched instead of being deleted and re-copied.

        Args:
            save_directory: Directory to save the model
        """
        import shutil

        os.makedirs(save_directory, exist_ok=True)

        # Save model configuration
        config = {
            "model_type": "b2b_ecommerce_ner",
            "entity_labels": self.entity_labels,
            "spacy_model_path": self.model_path,
            "catalog_path": self.catalog_path,
            "framework": "spacy",
            "task": "token-classification",
            "language": "en"
        }
        with open(os.path.join(save_directory, "config.json"), "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        # Copy spaCy model files if they exist
        if os.path.exists(self.model_path):
            target_model_path = os.path.join(save_directory, "spacy_model")
            # Removing the target first would wipe out the source when both
            # are the same directory, so skip the copy in that case.
            if not self._same_path(self.model_path, target_model_path):
                if os.path.exists(target_model_path):
                    shutil.rmtree(target_model_path)
                shutil.copytree(self.model_path, target_model_path)

        # Copy catalog file if it exists
        if os.path.exists(self.catalog_path):
            target_catalog_path = os.path.join(save_directory, "product_catalog.csv")
            # shutil.copy raises SameFileError when source and target coincide.
            if not self._same_path(self.catalog_path, target_catalog_path):
                shutil.copy(self.catalog_path, target_catalog_path)

        print(f"Model saved to {save_directory}")

    @classmethod
    def from_pretrained(cls, model_path: str):
        """
        Load a model from a saved directory.

        The directory must contain ``config.json``. The ``spacy_model``
        sub-directory and ``product_catalog.csv`` are used when present; when
        one is missing the constructor's default path is used instead.
        A non-empty ``entity_labels`` list in the config replaces the default
        label list of the returned instance.

        Args:
            model_path: Path to the saved model directory

        Returns:
            B2BEcommerceNER instance

        Raises:
            ValueError: if ``config.json`` is missing from ``model_path``.
        """
        config_path = os.path.join(model_path, "config.json")
        if not os.path.exists(config_path):
            raise ValueError(f"No config.json found in {model_path}")
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        spacy_model_path = os.path.join(model_path, "spacy_model")
        catalog_path = os.path.join(model_path, "product_catalog.csv")
        model = cls(
            model_path=spacy_model_path if os.path.exists(spacy_model_path) else None,
            catalog_path=catalog_path if os.path.exists(catalog_path) else None
        )

        # Restore the label list that save_pretrained() recorded.
        saved_labels = config.get("entity_labels") if isinstance(config, dict) else None
        if isinstance(saved_labels, list) and saved_labels:
            model.entity_labels = list(saved_labels)
        return model

    def pipeline(self, text: str) -> List[Dict[str, Any]]:
        """
        Process a single text through the complete pipeline.

        This method makes the model compatible with Hugging Face pipeline interface.

        Returns:
            Flat list of entity dicts with 'entity', 'score', 'index', 'word',
            'start' and 'end'. Entities are grouped by type (products,
            quantities, sizes, units); catalog matches are not included.

        Raises:
            ValueError: if the spaCy model has not been loaded.
        """
        result = self._extract_entities(text)

        # Format for Hugging Face pipeline compatibility
        formatted_entities = []
        for entity_type, entity_list in result['entities'].items():
            if entity_type == 'catalog_matches':
                continue
            for entity in entity_list:
                formatted_entities.append({
                    'entity': entity['label'],
                    'score': entity['confidence'],
                    'index': None,  # Token index not available in spaCy
                    'word': entity['text'],
                    'start': entity['start'],
                    'end': entity['end']
                })
        return formatted_entities
# Convenience functions for Hugging Face compatibility
def load_model(model_path: str = "b2b-ecommerce-ner"):
    """Return a B2BEcommerceNER built from the saved model directory ``model_path``."""
    ner_model = B2BEcommerceNER.from_pretrained(model_path)
    return ner_model
def pipeline(task: str = "ner", model: str = "b2b-ecommerce-ner"):
    """Create a pipeline for the B2B Ecommerce NER model.

    Args:
        task: Task name. "ner" and its canonical Hugging Face name
            "token-classification" are accepted.
        model: Path to the saved model directory, passed to ``load_model``.

    Returns:
        A callable taking one text string and returning the list of entity
        dicts produced by the model's ``pipeline`` method.

    Raises:
        ValueError: if ``task`` is not a supported task name.
    """
    # "token-classification" is the task name the saved config records;
    # "ner" is the short alias for the same task.
    if task not in ("ner", "token-classification"):
        raise ValueError("Only 'ner' task is supported")
    model_instance = load_model(model)

    def _pipeline(text: str):
        return model_instance.pipeline(text)

    return _pipeline