b2b-ecomm-ner / model.py

Upload folder using huggingface_hub

f0f2280 verified about 2 months ago

9.21 kB

	"""
	Hugging Face compatible wrapper for B2B Ecommerce NER Model
	"""

	import spacy
	import json
	import os
	from typing import List, Dict, Any, Optional
	from pathlib import Path
	import pandas as pd
	from fuzzywuzzy import fuzz, process
	import pickle
	import numpy as np


	class B2BEcommerceNER:
	    """
	    Hugging Face compatible B2B Ecommerce Named Entity Recognition model.

	    This model extracts structured information from B2B ecommerce orders including:
	    - PRODUCT: Product names and descriptions
	    - QUANTITY: Order quantities
	    - SIZE: Product sizes and measurements
	    - UNIT: Units of measurement

	    When a product catalog is loaded, every PRODUCT entity is additionally
	    fuzzy-matched against the catalog's ``Product`` column.
	    """

	    # Maps a spaCy entity label to the key it is collected under in the
	    # ``entities`` dict built by ``_extract_entities``.
	    _LABEL_BUCKETS = {
	        'PRODUCT': 'products',
	        'QUANTITY': 'quantities',
	        'SIZE': 'sizes',
	        'UNIT': 'units',
	    }

	    def __init__(self, model_path: Optional[str] = None, catalog_path: Optional[str] = None):
	        """
	        Initialize the B2B Ecommerce NER model.

	        The spaCy model and the catalog are loaded immediately, but only if
	        their paths exist; otherwise ``nlp`` / ``catalog_df`` stay ``None``.

	        Args:
	            model_path: Path to the spaCy model directory (default "spacy_model")
	            catalog_path: Path to the product catalog CSV file
	                (default "product_catalog.csv")

	        Raises:
	            Exception: Re-raised from ``load_model`` when the model directory
	                exists but cannot be loaded.
	        """
	        self.model_path = model_path or "spacy_model"
	        self.catalog_path = catalog_path or "product_catalog.csv"
	        self.nlp = None
	        self.catalog_df = None
	        self.entity_labels = ['PRODUCT', 'QUANTITY', 'SIZE', 'UNIT']

	        # Load model and catalog if available
	        if os.path.exists(self.model_path):
	            self.load_model()
	        if os.path.exists(self.catalog_path):
	            self.load_catalog()

	    def load_model(self):
	        """
	        Load the spaCy NER model from ``self.model_path`` into ``self.nlp``.

	        Raises:
	            Exception: Any error from ``spacy.load`` is reported and re-raised.
	        """
	        try:
	            self.nlp = spacy.load(self.model_path)
	            print(f"Loaded spaCy model from {self.model_path}")
	        except Exception as e:
	            print(f"Error loading model: {e}")
	            raise

	    def load_catalog(self):
	        """
	        Load the product catalog for fuzzy matching.

	        Best-effort: a catalog that cannot be read is reported and leaves
	        ``self.catalog_df`` as ``None`` (catalog matching is then skipped).
	        """
	        try:
	            self.catalog_df = pd.read_csv(self.catalog_path)
	            print(f"Loaded product catalog with {len(self.catalog_df)} products")
	        except Exception as e:
	            print(f"Error loading catalog: {e}")
	            self.catalog_df = None

	    def _require_model(self):
	        """Raise ``ValueError`` unless a spaCy model has been loaded."""
	        if self.nlp is None:
	            raise ValueError("Model not loaded. Please call load_model() first.")

	    def predict(self, texts: List[str]) -> List[Dict[str, Any]]:
	        """
	        Predict entities for a list of texts.

	        Args:
	            texts: List of text strings to process

	        Returns:
	            List of predictions with entities and catalog matches, one per
	            input text and in the same order.

	        Raises:
	            ValueError: If no spaCy model is loaded.
	        """
	        self._require_model()
	        return [self._extract_entities(text) for text in texts]

	    def _extract_entities(self, text: str) -> Dict[str, Any]:
	        """
	        Extract entities from a single text.

	        Returns:
	            Dict with the input ``text``, the grouped ``entities`` and
	            ``total_entities`` (count of all entities spaCy returned,
	            including labels that are not grouped).
	        """
	        doc = self.nlp(text)

	        entities = {
	            'products': [],
	            'quantities': [],
	            'sizes': [],
	            'units': [],
	            'catalog_matches': []
	        }

	        # Extract entities by type
	        for ent in doc.ents:
	            bucket = self._LABEL_BUCKETS.get(ent.label_)
	            if bucket is None:
	                # Labels outside the four known types are not listed.
	                continue

	            entities[bucket].append({
	                'text': ent.text,
	                'label': ent.label_,
	                'start': ent.start_char,
	                'end': ent.end_char,
	                'confidence': 1.0  # spaCy doesn't provide confidence by default
	            })

	            # Add catalog matching if available
	            if ent.label_ == 'PRODUCT' and self.catalog_df is not None:
	                entities['catalog_matches'].extend(self._fuzzy_match_product(ent.text))

	        return {
	            'text': text,
	            'entities': entities,
	            'total_entities': len(doc.ents)
	        }

	    def _fuzzy_match_product(self, product_text: str, threshold: int = 60, top_n: int = 3) -> List[Dict]:
	        """
	        Perform fuzzy matching against product catalog.

	        Args:
	            product_text: Product text to look up
	            threshold: Minimum match score a candidate must reach
	            top_n: Maximum number of candidates requested from fuzzywuzzy

	        Returns:
	            List of dicts with ``brand``, ``product``, ``sku``, ``match_score``
	            and ``original_query``; empty when no catalog is loaded or
	            nothing reaches the threshold.
	        """
	        if self.catalog_df is None:
	            return []

	        # Prepare product names for matching. Choices are keyed by row
	        # position so each match maps back to exactly its own catalog row;
	        # looking the row up again by name breaks on duplicate, missing or
	        # non-string product names. Blank names can never be a useful match.
	        names = self.catalog_df['Product'].fillna('').astype(str).tolist()
	        choices = {position: name for position, name in enumerate(names) if name}

	        # Use fuzzywuzzy to find matches; with dict choices each result is
	        # a (name, score, key) triple.
	        matches = process.extract(product_text, choices, limit=top_n, scorer=fuzz.token_sort_ratio)

	        results = []
	        for _name, score, position in matches:
	            if score < threshold:
	                continue
	            catalog_row = self.catalog_df.iloc[position]
	            results.append({
	                'brand': catalog_row['Brand'],
	                'product': catalog_row['Product'],
	                'sku': catalog_row['SKU'],
	                'match_score': score,
	                'original_query': product_text
	            })

	        return results

	    @staticmethod
	    def _is_same_path(source: str, target: str) -> bool:
	        """Return True when both paths exist and refer to the same file or directory."""
	        return (
	            os.path.exists(source)
	            and os.path.exists(target)
	            and os.path.samefile(source, target)
	        )

	    def save_pretrained(self, save_directory: str):
	        """
	        Save the model in Hugging Face format.

	        Writes ``config.json`` and copies the spaCy model directory (as
	        ``spacy_model``) and the catalog (as ``product_catalog.csv``) when
	        they exist. Saving into the directory the model was loaded from is
	        safe: files that are already in place are left untouched.

	        Args:
	            save_directory: Directory to save the model
	        """
	        import shutil

	        os.makedirs(save_directory, exist_ok=True)

	        # Save model configuration
	        config = {
	            "model_type": "b2b_ecommerce_ner",
	            "entity_labels": self.entity_labels,
	            "spacy_model_path": self.model_path,
	            "catalog_path": self.catalog_path,
	            "framework": "spacy",
	            "task": "token-classification",
	            "language": "en"
	        }

	        with open(os.path.join(save_directory, "config.json"), "w", encoding="utf-8") as f:
	            json.dump(config, f, indent=2)

	        # Copy spaCy model files if they exist
	        if os.path.exists(self.model_path):
	            target_model_path = os.path.join(save_directory, "spacy_model")
	            # Removing the target when it IS the source would delete the
	            # model before it could be copied.
	            if not self._is_same_path(self.model_path, target_model_path):
	                if os.path.exists(target_model_path):
	                    shutil.rmtree(target_model_path)
	                shutil.copytree(self.model_path, target_model_path)

	        # Copy catalog file if it exists
	        if os.path.exists(self.catalog_path):
	            target_catalog_path = os.path.join(save_directory, "product_catalog.csv")
	            # shutil.copy raises SameFileError when source and target coincide.
	            if not self._is_same_path(self.catalog_path, target_catalog_path):
	                shutil.copy(self.catalog_path, target_catalog_path)

	        print(f"Model saved to {save_directory}")

	    @classmethod
	    def from_pretrained(cls, model_path: str):
	        """
	        Load a model from a saved directory.

	        Looks for ``spacy_model`` and ``product_catalog.csv`` inside
	        ``model_path``; when one is missing the constructor default for
	        that path is used instead. Entity labels stored in ``config.json``
	        are restored on the returned instance.

	        Args:
	            model_path: Path to the saved model directory

	        Returns:
	            B2BEcommerceNER instance

	        Raises:
	            ValueError: If ``model_path`` contains no ``config.json``.
	        """
	        config_path = os.path.join(model_path, "config.json")
	        if not os.path.exists(config_path):
	            raise ValueError(f"No config.json found in {model_path}")

	        with open(config_path, "r", encoding="utf-8") as f:
	            config = json.load(f)

	        spacy_model_path = os.path.join(model_path, "spacy_model")
	        catalog_path = os.path.join(model_path, "product_catalog.csv")

	        model = cls(
	            model_path=spacy_model_path if os.path.exists(spacy_model_path) else None,
	            catalog_path=catalog_path if os.path.exists(catalog_path) else None
	        )

	        # Restore the labels that were written by save_pretrained.
	        saved_labels = config.get("entity_labels") if isinstance(config, dict) else None
	        if saved_labels:
	            model.entity_labels = list(saved_labels)

	        return model

	    def pipeline(self, text: str) -> List[Dict[str, Any]]:
	        """
	        Process a single text through the complete pipeline.
	        This method makes the model compatible with Hugging Face pipeline interface.

	        Args:
	            text: Text to process

	        Returns:
	            Flat list of entity dicts (``entity``, ``score``, ``index``,
	            ``word``, ``start``, ``end``), grouped by entity type in the
	            order products, quantities, sizes, units. Catalog matches are
	            not included.

	        Raises:
	            ValueError: If no spaCy model is loaded.
	        """
	        self._require_model()
	        result = self._extract_entities(text)

	        # Format for Hugging Face pipeline compatibility
	        formatted_entities = []
	        for entity_type, entity_list in result['entities'].items():
	            if entity_type == 'catalog_matches':
	                continue
	            for entity in entity_list:
	                formatted_entities.append({
	                    'entity': entity['label'],
	                    'score': entity['confidence'],
	                    'index': None,  # Token index not available in spaCy
	                    'word': entity['text'],
	                    'start': entity['start'],
	                    'end': entity['end']
	                })

	        return formatted_entities


	# Convenience functions for Hugging Face compatibility
	def load_model(model_path: str = "b2b-ecommerce-ner"):
	    """
	    Load the B2B Ecommerce NER model from a saved directory.

	    Thin convenience wrapper that delegates to
	    ``B2BEcommerceNER.from_pretrained``.

	    Args:
	        model_path: Path to the saved model directory

	    Returns:
	        The B2BEcommerceNER instance built by ``from_pretrained``
	    """
	    ner_model = B2BEcommerceNER.from_pretrained(model_path)
	    return ner_model


	def pipeline(task: str = "ner", model: str = "b2b-ecommerce-ner"):
	    """
	    Create a pipeline for the B2B Ecommerce NER model.

	    Args:
	        task: Task name; only "ner" is accepted
	        model: Path handed to ``load_model`` to build the model

	    Returns:
	        A callable taking one text string and returning the result of the
	        loaded model's ``pipeline`` method for it.

	    Raises:
	        ValueError: If ``task`` is anything other than "ner".
	    """
	    if task == "ner":
	        ner = load_model(model)

	        def _pipeline(text: str):
	            # Forward every call to the model loaded once above.
	            return ner.pipeline(text)

	        return _pipeline

	    raise ValueError("Only 'ner' task is supported")