adjaysagar committed on
Commit
e7072c2
·
verified ·
1 Parent(s): 275819a

Upload Indian Address NER model (checkpoint-20793)

Browse files
Files changed (6) hide show
  1. README.md +38 -21
  2. model.safetensors +1 -1
  3. optimizer.pt +1 -1
  4. scaler.pt +1 -1
  5. trainer_state.json +53 -53
  6. training_args.bin +1 -1
README.md CHANGED
@@ -75,18 +75,22 @@ class IndianAddressNER:
75
  }
76
 
77
  def predict(self, address):
78
- """Extract entities from an Indian address"""
79
  if not address.strip():
80
  return {}
81
 
82
- # Tokenize
83
  inputs = self.tokenizer(
84
  address,
85
  return_tensors="pt",
86
  truncation=True,
87
  padding=True,
88
- max_length=128
 
89
  )
 
 
 
90
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
91
 
92
  # Predict
@@ -96,23 +100,32 @@ class IndianAddressNER:
96
  predicted_ids = torch.argmax(predictions, dim=-1)
97
  confidence_scores = torch.max(predictions, dim=-1)[0]
98
 
99
- # Convert to tokens and labels
100
- tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
101
- predicted_labels = [self.id2entity.get(str(id.item()), "O") for id in predicted_ids[0]]
102
- confidences = confidence_scores[0].cpu().numpy()
 
 
 
103
 
104
- # Group entities
105
- entities = self.group_entities(tokens, predicted_labels, confidences)
106
  return entities
107
 
108
- def group_entities(self, tokens, labels, confidences):
109
- """Group B- and I- tags into complete entities"""
110
  entities = {}
111
  current_entity = None
112
 
113
- for i, (token, label, conf) in enumerate(zip(tokens, labels, confidences)):
114
- if token in ["[CLS]", "[SEP]", "[PAD]"]:
 
 
 
 
 
 
115
  continue
 
 
116
 
117
  if label.startswith("B-"):
118
  # Save previous entity
@@ -121,7 +134,7 @@ class IndianAddressNER:
121
  if entity_type not in entities:
122
  entities[entity_type] = []
123
  entities[entity_type].append({
124
- "text": current_entity["text"].replace("##", ""),
125
  "confidence": current_entity["confidence"]
126
  })
127
 
@@ -129,16 +142,20 @@ class IndianAddressNER:
129
  entity_type = label[2:] # Remove "B-"
130
  current_entity = {
131
  "type": entity_type,
132
- "text": token,
133
- "confidence": conf
 
 
134
  }
135
 
136
  elif label.startswith("I-") and current_entity:
137
  # Continue current entity
138
  entity_type = label[2:] # Remove "I-"
139
  if entity_type == current_entity["type"]:
140
- current_entity["text"] += token
141
- current_entity["confidence"] = (current_entity["confidence"] + conf) / 2
 
 
142
 
143
  elif label == "O" and current_entity:
144
  # End current entity
@@ -146,7 +163,7 @@ class IndianAddressNER:
146
  if entity_type not in entities:
147
  entities[entity_type] = []
148
  entities[entity_type].append({
149
- "text": current_entity["text"].replace("##", ""),
150
  "confidence": current_entity["confidence"]
151
  })
152
  current_entity = None
@@ -157,7 +174,7 @@ class IndianAddressNER:
157
  if entity_type not in entities:
158
  entities[entity_type] = []
159
  entities[entity_type].append({
160
- "text": current_entity["text"].replace("##", ""),
161
  "confidence": current_entity["confidence"]
162
  })
163
 
@@ -348,7 +365,7 @@ The model uses BIO (Begin-Inside-Outside) tagging scheme:
348
  ## 🔄 Model Updates
349
 
350
  - **Version**: v1.0 (Checkpoint 20793)
351
- - **Last Updated**: 2025-06-18
352
  - **Training Completion**: Based on augmented Indian address dataset
353
  - **Base Model**: ModernBERT for advanced transformer architecture
354
 
 
75
  }
76
 
77
  def predict(self, address):
78
+ """Extract entities from an Indian address - FIXED VERSION"""
79
  if not address.strip():
80
  return {}
81
 
82
+ # Tokenize with offset mapping for better text reconstruction
83
  inputs = self.tokenizer(
84
  address,
85
  return_tensors="pt",
86
  truncation=True,
87
  padding=True,
88
+ max_length=128,
89
+ return_offsets_mapping=True
90
  )
91
+
92
+ # Extract offset mapping before moving to device
93
+ offset_mapping = inputs.pop("offset_mapping")[0]
94
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
95
 
96
  # Predict
 
100
  predicted_ids = torch.argmax(predictions, dim=-1)
101
  confidence_scores = torch.max(predictions, dim=-1)[0]
102
 
103
+ # Extract entities using offset mapping
104
+ entities = self.extract_entities_with_offsets(
105
+ address,
106
+ predicted_ids[0],
107
+ confidence_scores[0],
108
+ offset_mapping
109
+ )
110
 
 
 
111
  return entities
112
 
113
+ def extract_entities_with_offsets(self, original_text, predicted_ids, confidences, offset_mapping):
114
+ """Extract entities using offset mapping for accurate text reconstruction"""
115
  entities = {}
116
  current_entity = None
117
 
118
+ for i, (pred_id, conf) in enumerate(zip(predicted_ids, confidences)):
119
+ if i >= len(offset_mapping):
120
+ break
121
+
122
+ start, end = offset_mapping[i]
123
+
124
+ # Skip special tokens (they have (0,0) mapping)
125
+ if start == end == 0:
126
  continue
127
+
128
+ label = self.id2entity.get(str(pred_id.item()), "O")
129
 
130
  if label.startswith("B-"):
131
  # Save previous entity
 
134
  if entity_type not in entities:
135
  entities[entity_type] = []
136
  entities[entity_type].append({
137
+ "text": current_entity["text"],
138
  "confidence": current_entity["confidence"]
139
  })
140
 
 
142
  entity_type = label[2:] # Remove "B-"
143
  current_entity = {
144
  "type": entity_type,
145
+ "text": original_text[start:end],
146
+ "confidence": conf.item(),
147
+ "start": start,
148
+ "end": end
149
  }
150
 
151
  elif label.startswith("I-") and current_entity:
152
  # Continue current entity
153
  entity_type = label[2:] # Remove "I-"
154
  if entity_type == current_entity["type"]:
155
+ # Extend the entity to include this token
156
+ current_entity["text"] = original_text[current_entity["start"]:end]
157
+ current_entity["confidence"] = (current_entity["confidence"] + conf.item()) / 2
158
+ current_entity["end"] = end
159
 
160
  elif label == "O" and current_entity:
161
  # End current entity
 
163
  if entity_type not in entities:
164
  entities[entity_type] = []
165
  entities[entity_type].append({
166
+ "text": current_entity["text"],
167
  "confidence": current_entity["confidence"]
168
  })
169
  current_entity = None
 
174
  if entity_type not in entities:
175
  entities[entity_type] = []
176
  entities[entity_type].append({
177
+ "text": current_entity["text"],
178
  "confidence": current_entity["confidence"]
179
  })
180
 
 
365
  ## 🔄 Model Updates
366
 
367
  - **Version**: v1.0 (Checkpoint 20793)
368
+ - **Last Updated**: 2025-06-19
369
  - **Training Completion**: Based on augmented Indian address dataset
370
  - **Base Model**: ModernBERT for advanced transformer architecture
371
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e242a3a6d175f59836680a63e9037180d3e5d45df51eddda9cf767b7751640e
3
  size 598504388
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9058c65b4d9171fecc974ea5530a94f0e36e1f9b320999312374b3b4307bbe59
3
  size 598504388
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33148bf168dcfa2c829d4e2de706d4578b5b00a3a79f34b9876c4ed320eb2c51
3
  size 1197097035
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22021f18dcf340f6d46c8a3ccbcf129a08ee158cca4d125e92b3eec86edfe7bb
3
  size 1197097035
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e8fb40bd132881680545509b4688668f7ae4a27b2834707a357ace07774d3b0
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2d86e8e98ba5f01fdb306f1d640b368177180b6e6edfb393d5f89dbaaa4b934
3
  size 1383
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_global_step": 13862,
3
- "best_metric": 0.95041712283294,
4
- "best_model_checkpoint": "./ner_output_modernbert/combined_300percent_ModernBERT-base_20250618_100426/checkpoints/checkpoint-13862",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
  "global_step": 20793,
@@ -11,122 +11,122 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.24989179050642044,
14
- "grad_norm": 0.3928024172782898,
15
  "learning_rate": 4.5842350791131636e-05,
16
- "loss": 0.3016,
17
  "step": 1732
18
  },
19
  {
20
  "epoch": 0.49978358101284087,
21
- "grad_norm": 1.6826777458190918,
22
  "learning_rate": 4.1677487616024626e-05,
23
- "loss": 0.1843,
24
  "step": 3464
25
  },
26
  {
27
  "epoch": 0.7496753715192613,
28
- "grad_norm": 0.5472883582115173,
29
  "learning_rate": 3.751262444091762e-05,
30
- "loss": 0.1683,
31
  "step": 5196
32
  },
33
  {
34
  "epoch": 0.9995671620256817,
35
- "grad_norm": 1.2803065776824951,
36
  "learning_rate": 3.3350165921223495e-05,
37
- "loss": 0.1605,
38
  "step": 6928
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_accuracy": 0.9449486797375143,
43
- "eval_f1": 0.9438781242313503,
44
- "eval_loss": 0.15803375840187073,
45
- "eval_precision": 0.9437059457495476,
46
- "eval_recall": 0.9449486797375143,
47
- "eval_runtime": 70.9502,
48
- "eval_samples_per_second": 275.813,
49
- "eval_steps_per_second": 17.252,
50
  "step": 6931
51
  },
52
  {
53
  "epoch": 1.2494589525321023,
54
- "grad_norm": 3.4767799377441406,
55
- "learning_rate": 2.9185302746116482e-05,
56
- "loss": 0.1231,
57
  "step": 8660
58
  },
59
  {
60
  "epoch": 1.4993507430385224,
61
- "grad_norm": 0.13331933319568634,
62
  "learning_rate": 2.5022844226422353e-05,
63
- "loss": 0.121,
64
  "step": 10392
65
  },
66
  {
67
  "epoch": 1.749242533544943,
68
- "grad_norm": 0.5231122374534607,
69
  "learning_rate": 2.0857981051315347e-05,
70
- "loss": 0.1214,
71
  "step": 12124
72
  },
73
  {
74
  "epoch": 1.9991343240513635,
75
- "grad_norm": 1.5934361219406128,
76
  "learning_rate": 1.669552253162122e-05,
77
- "loss": 0.1108,
78
  "step": 13856
79
  },
80
  {
81
  "epoch": 2.0,
82
- "eval_accuracy": 0.9505525536248465,
83
- "eval_f1": 0.95041712283294,
84
- "eval_loss": 0.14365626871585846,
85
- "eval_precision": 0.9506382247961251,
86
- "eval_recall": 0.9505525536248465,
87
- "eval_runtime": 59.4547,
88
- "eval_samples_per_second": 329.141,
89
- "eval_steps_per_second": 20.587,
90
  "step": 13862
91
  },
92
  {
93
  "epoch": 2.249026114557784,
94
- "grad_norm": 0.0014948190655559301,
95
- "learning_rate": 1.2530659356514212e-05,
96
- "loss": 0.0724,
97
  "step": 15588
98
  },
99
  {
100
  "epoch": 2.4989179050642045,
101
- "grad_norm": 0.966975212097168,
102
  "learning_rate": 8.368200836820084e-06,
103
- "loss": 0.0702,
104
  "step": 17320
105
  },
106
  {
107
  "epoch": 2.7488096955706247,
108
- "grad_norm": 0.004424717742949724,
109
  "learning_rate": 4.205742317125956e-06,
110
- "loss": 0.0669,
111
  "step": 19052
112
  },
113
  {
114
  "epoch": 2.998701486077045,
115
- "grad_norm": 1.1055166721343994,
116
  "learning_rate": 4.328379743182802e-08,
117
- "loss": 0.064,
118
  "step": 20784
119
  },
120
  {
121
  "epoch": 3.0,
122
- "eval_accuracy": 0.9507602394581222,
123
- "eval_f1": 0.9503624395424796,
124
- "eval_loss": 0.16907508671283722,
125
- "eval_precision": 0.9501301589170589,
126
- "eval_recall": 0.9507602394581222,
127
- "eval_runtime": 62.6422,
128
- "eval_samples_per_second": 312.393,
129
- "eval_steps_per_second": 19.54,
130
  "step": 20793
131
  }
132
  ],
 
1
  {
2
+ "best_global_step": 20793,
3
+ "best_metric": 0.949582664985337,
4
+ "best_model_checkpoint": "./ner_output_modernbert/combined_300percent_ModernBERT-base_20250619_055856/checkpoints/checkpoint-20793",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
  "global_step": 20793,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.24989179050642044,
14
+ "grad_norm": 0.1257457733154297,
15
  "learning_rate": 4.5842350791131636e-05,
16
+ "loss": 0.3051,
17
  "step": 1732
18
  },
19
  {
20
  "epoch": 0.49978358101284087,
21
+ "grad_norm": 0.5818042755126953,
22
  "learning_rate": 4.1677487616024626e-05,
23
+ "loss": 0.1847,
24
  "step": 3464
25
  },
26
  {
27
  "epoch": 0.7496753715192613,
28
+ "grad_norm": 0.45203471183776855,
29
  "learning_rate": 3.751262444091762e-05,
30
+ "loss": 0.1685,
31
  "step": 5196
32
  },
33
  {
34
  "epoch": 0.9995671620256817,
35
+ "grad_norm": 0.9936078786849976,
36
  "learning_rate": 3.3350165921223495e-05,
37
+ "loss": 0.1611,
38
  "step": 6928
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_accuracy": 0.9448721639042021,
43
+ "eval_f1": 0.9438293472729468,
44
+ "eval_loss": 0.1583022028207779,
45
+ "eval_precision": 0.9438239601502502,
46
+ "eval_recall": 0.9448721639042021,
47
+ "eval_runtime": 66.8003,
48
+ "eval_samples_per_second": 292.948,
49
+ "eval_steps_per_second": 18.323,
50
  "step": 6931
51
  },
52
  {
53
  "epoch": 1.2494589525321023,
54
+ "grad_norm": 2.6818487644195557,
55
+ "learning_rate": 2.9187707401529363e-05,
56
+ "loss": 0.1245,
57
  "step": 8660
58
  },
59
  {
60
  "epoch": 1.4993507430385224,
61
+ "grad_norm": 0.14635096490383148,
62
  "learning_rate": 2.5022844226422353e-05,
63
+ "loss": 0.1212,
64
  "step": 10392
65
  },
66
  {
67
  "epoch": 1.749242533544943,
68
+ "grad_norm": 0.5487073659896851,
69
  "learning_rate": 2.0857981051315347e-05,
70
+ "loss": 0.1218,
71
  "step": 12124
72
  },
73
  {
74
  "epoch": 1.9991343240513635,
75
+ "grad_norm": 1.3607397079467773,
76
  "learning_rate": 1.669552253162122e-05,
77
+ "loss": 0.1117,
78
  "step": 13856
79
  },
80
  {
81
  "epoch": 2.0,
82
+ "eval_accuracy": 0.9494521830695967,
83
+ "eval_f1": 0.9492911172803227,
84
+ "eval_loss": 0.14517587423324585,
85
+ "eval_precision": 0.9494150440403941,
86
+ "eval_recall": 0.9494521830695967,
87
+ "eval_runtime": 63.4366,
88
+ "eval_samples_per_second": 308.481,
89
+ "eval_steps_per_second": 19.295,
90
  "step": 13862
91
  },
92
  {
93
  "epoch": 2.249026114557784,
94
+ "grad_norm": 0.0037306402809917927,
95
+ "learning_rate": 1.2533064011927093e-05,
96
+ "loss": 0.0739,
97
  "step": 15588
98
  },
99
  {
100
  "epoch": 2.4989179050642045,
101
+ "grad_norm": 0.5977134108543396,
102
  "learning_rate": 8.368200836820084e-06,
103
+ "loss": 0.0709,
104
  "step": 17320
105
  },
106
  {
107
  "epoch": 2.7488096955706247,
108
+ "grad_norm": 0.007765794638544321,
109
  "learning_rate": 4.205742317125956e-06,
110
+ "loss": 0.0679,
111
  "step": 19052
112
  },
113
  {
114
  "epoch": 2.998701486077045,
115
+ "grad_norm": 2.047842025756836,
116
  "learning_rate": 4.328379743182802e-08,
117
+ "loss": 0.0643,
118
  "step": 20784
119
  },
120
  {
121
  "epoch": 3.0,
122
+ "eval_accuracy": 0.949984150291671,
123
+ "eval_f1": 0.949582664985337,
124
+ "eval_loss": 0.1690009981393814,
125
+ "eval_precision": 0.9493962920968388,
126
+ "eval_recall": 0.949984150291671,
127
+ "eval_runtime": 64.0908,
128
+ "eval_samples_per_second": 305.333,
129
+ "eval_steps_per_second": 19.098,
130
  "step": 20793
131
  }
132
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:982cb4223d70957ced08a996f9d01d874365c30eed8d9f65a2ca7ea7e01255b0
3
  size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f01b0496d454ec432e306869978de94d3a9263262bdc01f48f66edfae77c6ad
3
  size 5777