|
""" |
|
Hugging Face compatible wrapper for B2B Ecommerce NER Model |
|
""" |
|
|
|
import spacy |
|
import json |
|
import os |
|
from typing import List, Dict, Any, Optional |
|
from pathlib import Path |
|
import pandas as pd |
|
from fuzzywuzzy import fuzz, process |
|
import pickle |
|
import numpy as np |
|
|
|
|
|
class B2BEcommerceNER:
    """
    Hugging Face compatible B2B Ecommerce Named Entity Recognition model.

    This model extracts structured information from B2B ecommerce orders including:
    - PRODUCT: Product names and descriptions
    - QUANTITY: Order quantities
    - SIZE: Product sizes and measurements
    - UNIT: Units of measurement

    The model also includes fuzzy matching against a product catalog for enhanced accuracy.
    """

    # Maps each supported entity label to the result bucket it is stored in.
    # Entities carrying any other label are counted but not bucketed.
    _LABEL_BUCKETS = {
        'PRODUCT': 'products',
        'QUANTITY': 'quantities',
        'SIZE': 'sizes',
        'UNIT': 'units',
    }

    def __init__(self, model_path: Optional[str] = None, catalog_path: Optional[str] = None):
        """
        Initialize the B2B Ecommerce NER model.

        The model and the catalog are loaded immediately when their paths
        exist on disk; otherwise ``nlp`` / ``catalog_df`` stay ``None``.

        Args:
            model_path: Path to the spaCy model directory
                (defaults to ``"spacy_model"``)
            catalog_path: Path to the product catalog CSV file
                (defaults to ``"product_catalog.csv"``)
        """
        self.model_path = model_path or "spacy_model"
        self.catalog_path = catalog_path or "product_catalog.csv"
        self.nlp = None
        self.catalog_df = None
        self.entity_labels = ['PRODUCT', 'QUANTITY', 'SIZE', 'UNIT']

        if os.path.exists(self.model_path):
            self.load_model()
        if os.path.exists(self.catalog_path):
            self.load_catalog()

    def load_model(self):
        """
        Load the spaCy NER model from ``self.model_path`` into ``self.nlp``.

        Raises:
            Exception: Whatever ``spacy.load`` raises; the error is printed
                and then re-raised unchanged.
        """
        try:
            self.nlp = spacy.load(self.model_path)
            print(f"Loaded spaCy model from {self.model_path}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def load_catalog(self):
        """
        Load the product catalog used for fuzzy matching.

        This is best-effort: on any failure the error is printed and
        ``self.catalog_df`` is reset to ``None`` so that entity extraction
        keeps working without catalog matches.
        """
        try:
            self.catalog_df = pd.read_csv(self.catalog_path)
            print(f"Loaded product catalog with {len(self.catalog_df)} products")
        except Exception as e:
            print(f"Error loading catalog: {e}")
            self.catalog_df = None

    def predict(self, texts: List[str]) -> List[Dict[str, Any]]:
        """
        Predict entities for a list of texts.

        Args:
            texts: List of text strings to process. A single string is
                treated as a one-element list instead of being iterated
                character by character.

        Returns:
            List of predictions with entities and catalog matches

        Raises:
            ValueError: If no spaCy model has been loaded.
        """
        if self.nlp is None:
            raise ValueError("Model not loaded. Please call load_model() first.")

        if isinstance(texts, str):
            texts = [texts]

        return [self._extract_entities(text) for text in texts]

    def _extract_entities(self, text: str) -> Dict[str, Any]:
        """
        Extract entities from a single text.

        Args:
            text: The text to run through the spaCy pipeline.

        Returns:
            Dict with the original ``text``, the bucketed ``entities``
            (including ``catalog_matches``) and ``total_entities``, which
            counts every entity in the doc, bucketed or not.

        Raises:
            ValueError: If no spaCy model has been loaded.
        """
        # Guard here as well so that pipeline() fails with the same clear
        # error as predict() instead of "'NoneType' object is not callable".
        if self.nlp is None:
            raise ValueError("Model not loaded. Please call load_model() first.")

        doc = self.nlp(text)

        entities = {
            'products': [],
            'quantities': [],
            'sizes': [],
            'units': [],
            'catalog_matches': []
        }

        for ent in doc.ents:
            bucket = self._LABEL_BUCKETS.get(ent.label_)
            if bucket is None:
                continue

            entities[bucket].append({
                'text': ent.text,
                'label': ent.label_,
                'start': ent.start_char,
                'end': ent.end_char,
                # Hard-coded placeholder; no per-entity score is computed here.
                'confidence': 1.0
            })

            # Only product mentions are looked up in the catalog.
            if ent.label_ == 'PRODUCT' and self.catalog_df is not None:
                entities['catalog_matches'].extend(self._fuzzy_match_product(ent.text))

        return {
            'text': text,
            'entities': entities,
            'total_entities': len(doc.ents)
        }

    def _fuzzy_match_product(self, product_text: str, threshold: int = 60, top_n: int = 3) -> List[Dict]:
        """
        Perform fuzzy matching against the product catalog.

        Args:
            product_text: Product mention to look up.
            threshold: Minimum match score for a candidate to be kept.
            top_n: Maximum number of candidates requested from the matcher.

        Returns:
            One dict per retained candidate with ``brand``, ``product``,
            ``sku``, ``match_score`` and ``original_query``. Empty when no
            catalog is loaded or nothing reaches the threshold.
        """
        if self.catalog_df is None:
            return []

        # Skip rows without a product name: they can never be looked up
        # again by name and would otherwise surface as '' candidates.
        product_names = self.catalog_df['Product'].dropna().tolist()
        if not product_names:
            return []

        matches = process.extract(product_text, product_names, limit=top_n, scorer=fuzz.token_sort_ratio)

        results = []
        for match_text, score in matches:
            if score < threshold:
                continue

            rows = self.catalog_df[self.catalog_df['Product'] == match_text]
            if rows.empty:
                # Defensive: never index into an empty selection.
                continue
            catalog_row = rows.iloc[0]

            results.append({
                'brand': catalog_row['Brand'],
                'product': catalog_row['Product'],
                'sku': catalog_row['SKU'],
                'match_score': score,
                'original_query': product_text
            })

        return results

    @staticmethod
    def _same_path(first: str, second: str) -> bool:
        """Return True when both paths resolve to the same filesystem location."""
        return (os.path.normcase(os.path.realpath(first))
                == os.path.normcase(os.path.realpath(second)))

    def save_pretrained(self, save_directory: str):
        """
        Save the model in Hugging Face format.

        Writes ``config.json`` and copies the spaCy model directory and the
        product catalog into ``save_directory`` when they exist. If the
        model or catalog already lives at its target location (for example
        when saving back into the directory it was loaded from), the copy is
        skipped so the existing files are not deleted or overwritten.

        Args:
            save_directory: Directory to save the model
        """
        import shutil

        os.makedirs(save_directory, exist_ok=True)

        config = {
            "model_type": "b2b_ecommerce_ner",
            "entity_labels": self.entity_labels,
            "spacy_model_path": self.model_path,
            "catalog_path": self.catalog_path,
            "framework": "spacy",
            "task": "token-classification",
            "language": "en"
        }

        with open(os.path.join(save_directory, "config.json"), "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        target_model_path = os.path.join(save_directory, "spacy_model")
        if os.path.exists(self.model_path) and not self._same_path(self.model_path, target_model_path):
            # Replace any stale copy so the saved model mirrors the source.
            if os.path.exists(target_model_path):
                shutil.rmtree(target_model_path)
            shutil.copytree(self.model_path, target_model_path)

        target_catalog_path = os.path.join(save_directory, "product_catalog.csv")
        if os.path.exists(self.catalog_path) and not self._same_path(self.catalog_path, target_catalog_path):
            shutil.copy(self.catalog_path, target_catalog_path)

        print(f"Model saved to {save_directory}")

    @classmethod
    def from_pretrained(cls, model_path: str) -> "B2BEcommerceNER":
        """
        Load a model from a saved directory.

        Args:
            model_path: Path to the saved model directory

        Returns:
            B2BEcommerceNER instance. ``entity_labels`` is restored from
            ``config.json`` when the file provides a non-empty list.

        Raises:
            ValueError: If ``model_path`` contains no ``config.json``.
        """
        config_path = os.path.join(model_path, "config.json")
        if not os.path.exists(config_path):
            raise ValueError(f"No config.json found in {model_path}")

        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        spacy_model_path = os.path.join(model_path, "spacy_model")
        catalog_path = os.path.join(model_path, "product_catalog.csv")

        # Passing None lets __init__ fall back to its default locations.
        model = cls(
            model_path=spacy_model_path if os.path.exists(spacy_model_path) else None,
            catalog_path=catalog_path if os.path.exists(catalog_path) else None
        )

        saved_labels = config.get("entity_labels") if isinstance(config, dict) else None
        if isinstance(saved_labels, list) and saved_labels:
            model.entity_labels = list(saved_labels)

        return model

    def pipeline(self, text: str) -> List[Dict[str, Any]]:
        """
        Process a single text through the complete pipeline.

        This method makes the model compatible with Hugging Face pipeline interface.

        Args:
            text: The text to analyse.

        Returns:
            List of entity dicts with ``entity``, ``score``, ``index``
            (always ``None``), ``word``, ``start`` and ``end``. Entities are
            grouped by type (products, quantities, sizes, units), not
            sorted by position; catalog matches are not included.

        Raises:
            ValueError: If no spaCy model has been loaded.
        """
        result = self._extract_entities(text)

        return [
            {
                'entity': entity['label'],
                'score': entity['confidence'],
                'index': None,
                'word': entity['text'],
                'start': entity['start'],
                'end': entity['end']
            }
            for entity_type, entity_list in result['entities'].items()
            if entity_type != 'catalog_matches'
            for entity in entity_list
        ]
|
|
|
|
|
|
|
def load_model(model_path: str = "b2b-ecommerce-ner") -> "B2BEcommerceNER":
    """
    Load the B2B Ecommerce NER model.

    Thin convenience wrapper around ``B2BEcommerceNER.from_pretrained``.

    Args:
        model_path: Directory containing a previously saved model.

    Returns:
        The ``B2BEcommerceNER`` instance built by ``from_pretrained``.
    """
    return B2BEcommerceNER.from_pretrained(model_path)
|
|
|
|
|
def pipeline(task: str = "ner", model: str = "b2b-ecommerce-ner"):
    """
    Create a pipeline for the B2B Ecommerce NER model.

    Args:
        task: Task name; only ``"ner"`` is accepted.
        model: Saved model directory, forwarded to ``load_model``.

    Returns:
        A callable taking one text string and returning the result of the
        loaded model's ``pipeline`` method for that text.

    Raises:
        ValueError: If ``task`` is anything other than ``"ner"``.
    """
    # Validate before loading so an unsupported task fails fast,
    # without touching the filesystem.
    if task != "ner":
        raise ValueError("Only 'ner' task is supported")

    model_instance = load_model(model)

    def _pipeline(text: str):
        return model_instance.pipeline(text)

    return _pipeline
|
|