File size: 9,206 Bytes
f0f2280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
"""
Hugging Face compatible wrapper for B2B Ecommerce NER Model
"""

import spacy
import json
import os
from typing import List, Dict, Any, Optional
from pathlib import Path
import pandas as pd
from fuzzywuzzy import fuzz, process
import pickle
import numpy as np


class B2BEcommerceNER:
    """
    Hugging Face compatible B2B Ecommerce Named Entity Recognition model.

    This model extracts structured information from B2B ecommerce orders including:
    - PRODUCT: Product names and descriptions
    - QUANTITY: Order quantities
    - SIZE: Product sizes and measurements
    - UNIT: Units of measurement

    Entities come from a spaCy pipeline loaded from ``model_path``. When a
    product catalog CSV is loaded, every PRODUCT entity is additionally
    fuzzy-matched against the catalog's ``Product`` column.
    """

    # Maps each recognised entity label to the result bucket it is collected in.
    _LABEL_BUCKETS = {
        'PRODUCT': 'products',
        'QUANTITY': 'quantities',
        'SIZE': 'sizes',
        'UNIT': 'units',
    }

    def __init__(self, model_path: Optional[str] = None, catalog_path: Optional[str] = None):
        """
        Initialize the B2B Ecommerce NER model.

        The model and the catalog are loaded eagerly, but only when their
        paths exist; otherwise ``self.nlp`` / ``self.catalog_df`` stay ``None``.

        Args:
            model_path: Path to the spaCy model directory
                (defaults to ``"spacy_model"``).
            catalog_path: Path to the product catalog CSV file
                (defaults to ``"product_catalog.csv"``).
        """
        self.model_path = model_path or "spacy_model"
        self.catalog_path = catalog_path or "product_catalog.csv"
        self.nlp = None
        self.catalog_df = None
        self.entity_labels = ['PRODUCT', 'QUANTITY', 'SIZE', 'UNIT']

        # Load model and catalog if available
        if os.path.exists(self.model_path):
            self.load_model()
        if os.path.exists(self.catalog_path):
            self.load_catalog()

    @staticmethod
    def _same_path(first: str, second: str) -> bool:
        """Return True when both paths refer to the same filesystem location."""
        try:
            return os.path.samefile(first, second)
        except OSError:
            # samefile() needs both paths to exist; fall back to a textual
            # comparison of the absolute paths when one of them does not.
            return os.path.abspath(first) == os.path.abspath(second)

    def load_model(self):
        """
        Load the spaCy NER model from ``self.model_path`` into ``self.nlp``.

        Raises:
            Exception: Whatever ``spacy.load`` raised; the error is printed
                and then re-raised unchanged.
        """
        try:
            self.nlp = spacy.load(self.model_path)
            print(f"Loaded spaCy model from {self.model_path}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def load_catalog(self):
        """
        Load the product catalog CSV used for fuzzy matching.

        Loading is best-effort: on failure the error is printed and
        ``self.catalog_df`` is reset to ``None`` so that catalog matching is
        simply skipped.
        """
        try:
            self.catalog_df = pd.read_csv(self.catalog_path)
            print(f"Loaded product catalog with {len(self.catalog_df)} products")
        except Exception as e:
            print(f"Error loading catalog: {e}")
            self.catalog_df = None

    def predict(self, texts: List[str]) -> List[Dict[str, Any]]:
        """
        Predict entities for a list of texts.

        Args:
            texts: List of text strings to process

        Returns:
            List of predictions with entities and catalog matches, one per
            input text and in the same order.

        Raises:
            ValueError: If no spaCy model has been loaded.
        """
        if self.nlp is None:
            raise ValueError("Model not loaded. Please call load_model() first.")

        return [self._extract_entities(text) for text in texts]

    def _extract_entities(self, text: str) -> Dict[str, Any]:
        """
        Extract entities from a single text.

        Returns:
            A dict with the original ``text``, the ``entities`` grouped into
            ``products`` / ``quantities`` / ``sizes`` / ``units`` /
            ``catalog_matches``, and ``total_entities`` (the number of spans
            in ``doc.ents``, including labels that are not bucketed).
        """
        doc = self.nlp(text)

        entities = {
            'products': [],
            'quantities': [],
            'sizes': [],
            'units': [],
            'catalog_matches': []
        }

        for ent in doc.ents:
            bucket = self._LABEL_BUCKETS.get(ent.label_)
            if bucket is None:
                # Unknown labels are not bucketed; they only count towards
                # total_entities below.
                continue

            entities[bucket].append({
                'text': ent.text,
                'label': ent.label_,
                'start': ent.start_char,
                'end': ent.end_char,
                'confidence': 1.0  # spaCy doesn't provide confidence by default
            })

            # Add catalog matching if available
            if ent.label_ == 'PRODUCT' and self.catalog_df is not None:
                entities['catalog_matches'].extend(self._fuzzy_match_product(ent.text))

        return {
            'text': text,
            'entities': entities,
            'total_entities': len(doc.ents)
        }

    def _fuzzy_match_product(self, product_text: str, threshold: int = 60, top_n: int = 3) -> List[Dict]:
        """
        Perform fuzzy matching against the product catalog.

        Args:
            product_text: Product text to look up.
            threshold: Minimum match score for a catalog row to be kept.
            top_n: Maximum number of candidates requested from the matcher.

        Returns:
            A list of dicts with ``brand``, ``product``, ``sku``,
            ``match_score`` and ``original_query``; empty when no catalog is
            loaded or nothing reaches ``threshold``.
        """
        if self.catalog_df is None:
            return []

        # Key the choices by row position. With a dict of choices the matcher
        # returns (choice, score, key) triples, so each hit maps back to its
        # own catalog row. Looking rows up again by name would always pick the
        # first of several identically named products and would fail on blank
        # names.
        product_names = self.catalog_df['Product'].fillna('').astype(str).tolist()
        choices = dict(enumerate(product_names))

        # Use fuzzywuzzy to find matches
        matches = process.extract(product_text, choices, limit=top_n, scorer=fuzz.token_sort_ratio)

        results = []
        for _match_text, score, position in matches:
            if score < threshold:
                continue

            catalog_row = self.catalog_df.iloc[position]
            results.append({
                'brand': catalog_row['Brand'],
                'product': catalog_row['Product'],
                'sku': catalog_row['SKU'],
                'match_score': score,
                'original_query': product_text
            })

        return results

    def save_pretrained(self, save_directory: str):
        """
        Save the model in Hugging Face format.

        Writes ``config.json`` and copies the spaCy model directory (as
        ``spacy_model``) and the catalog (as ``product_catalog.csv``) into
        ``save_directory`` when they exist.

        Args:
            save_directory: Directory to save the model
        """
        import shutil

        os.makedirs(save_directory, exist_ok=True)

        # Save model configuration
        config = {
            "model_type": "b2b_ecommerce_ner",
            "entity_labels": self.entity_labels,
            "spacy_model_path": self.model_path,
            "catalog_path": self.catalog_path,
            "framework": "spacy",
            "task": "token-classification",
            "language": "en"
        }

        with open(os.path.join(save_directory, "config.json"), "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        # Copy spaCy model files if they exist
        if os.path.exists(self.model_path):
            target_model_path = os.path.join(save_directory, "spacy_model")
            # Saving into the directory the model was loaded from makes source
            # and target identical; removing the target would then delete the
            # model itself, so the copy is skipped in that case.
            if not self._same_path(self.model_path, target_model_path):
                if os.path.exists(target_model_path):
                    shutil.rmtree(target_model_path)
                shutil.copytree(self.model_path, target_model_path)

        # Copy catalog file if it exists
        if os.path.exists(self.catalog_path):
            target_catalog_path = os.path.join(save_directory, "product_catalog.csv")
            # shutil.copy raises SameFileError when source and target coincide.
            if not self._same_path(self.catalog_path, target_catalog_path):
                shutil.copy(self.catalog_path, target_catalog_path)

        print(f"Model saved to {save_directory}")

    @classmethod
    def from_pretrained(cls, model_path: str):
        """
        Load a model from a saved directory.

        Args:
            model_path: Path to the saved model directory

        Returns:
            B2BEcommerceNER instance

        Raises:
            ValueError: If ``model_path`` contains no ``config.json``.
        """
        config_path = os.path.join(model_path, "config.json")
        if not os.path.exists(config_path):
            raise ValueError(f"No config.json found in {model_path}")

        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        spacy_model_path = os.path.join(model_path, "spacy_model")
        catalog_path = os.path.join(model_path, "product_catalog.csv")

        # A missing component is passed as None, which makes __init__ fall
        # back to its default relative path for that component.
        model = cls(
            model_path=spacy_model_path if os.path.exists(spacy_model_path) else None,
            catalog_path=catalog_path if os.path.exists(catalog_path) else None
        )

        # Restore the label set recorded at save time, when present.
        saved_labels = config.get("entity_labels") if isinstance(config, dict) else None
        if saved_labels:
            model.entity_labels = list(saved_labels)

        return model

    def pipeline(self, text: str) -> List[Dict[str, Any]]:
        """
        Process a single text through the complete pipeline.
        This method makes the model compatible with Hugging Face pipeline interface.

        Args:
            text: Text to process.

        Returns:
            A list of entity dicts with ``entity``, ``score``, ``index``,
            ``word``, ``start`` and ``end`` keys. Entities are grouped by type
            (products, quantities, sizes, units) rather than by position in
            the text; catalog matches are not included.

        Raises:
            ValueError: If no spaCy model has been loaded.
        """
        if self.nlp is None:
            raise ValueError("Model not loaded. Please call load_model() first.")

        result = self._extract_entities(text)

        # Format for Hugging Face pipeline compatibility
        formatted_entities = []
        for entity_type, entity_list in result['entities'].items():
            if entity_type == 'catalog_matches':
                continue
            for entity in entity_list:
                formatted_entities.append({
                    'entity': entity['label'],
                    'score': entity['confidence'],
                    'index': None,  # token index is not populated by this wrapper
                    'word': entity['text'],
                    'start': entity['start'],
                    'end': entity['end']
                })

        return formatted_entities


# Convenience functions for Hugging Face compatibility
def load_model(model_path: str = "b2b-ecommerce-ner"):
    """
    Build a B2BEcommerceNER from a saved model directory.

    Args:
        model_path: Directory handed to ``B2BEcommerceNER.from_pretrained``.

    Returns:
        The instance created by ``B2BEcommerceNER.from_pretrained``.
    """
    ner_model = B2BEcommerceNER.from_pretrained(model_path)
    return ner_model


def pipeline(task: str = "ner", model: str = "b2b-ecommerce-ner"):
    """
    Build a callable that runs the B2B Ecommerce NER model on one text.

    Args:
        task: Task name; only ``"ner"`` is accepted.
        model: Saved model directory passed on to ``load_model``.

    Returns:
        A function taking a text and returning the result of the loaded
        model's ``pipeline`` method for it.

    Raises:
        ValueError: If ``task`` is anything other than ``"ner"``.
    """
    # Reject unsupported tasks before any model loading is attempted.
    if task != "ner":
        raise ValueError("Only 'ner' task is supported")

    ner = load_model(model)

    def _pipeline(text: str):
        entities = ner.pipeline(text)
        return entities

    return _pipeline