adjaysagar committed on
Commit
e7072c2
·
verified ·
1 Parent(s): 275819a

Upload Indian Address NER model (checkpoint-20793)

Browse files
Files changed (6) hide show
  1. README.md +38 -21
  2. model.safetensors +1 -1
  3. optimizer.pt +1 -1
  4. scaler.pt +1 -1
  5. trainer_state.json +53 -53
  6. training_args.bin +1 -1
README.md CHANGED
@@ -75,18 +75,22 @@ class IndianAddressNER:
75
  }
76
 
77
  def predict(self, address):
78
- """Extract entities from an Indian address"""
79
  if not address.strip():
80
  return {}
81
 
82
- # Tokenize
83
  inputs = self.tokenizer(
84
  address,
85
  return_tensors="pt",
86
  truncation=True,
87
  padding=True,
88
- max_length=128
 
89
  )
 
 
 
90
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
91
 
92
  # Predict
@@ -96,23 +100,32 @@ class IndianAddressNER:
96
  predicted_ids = torch.argmax(predictions, dim=-1)
97
  confidence_scores = torch.max(predictions, dim=-1)[0]
98
 
99
- # Convert to tokens and labels
100
- tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
101
- predicted_labels = [self.id2entity.get(str(id.item()), "O") for id in predicted_ids[0]]
102
- confidences = confidence_scores[0].cpu().numpy()
 
 
 
103
 
104
- # Group entities
105
- entities = self.group_entities(tokens, predicted_labels, confidences)
106
  return entities
107
 
108
- def group_entities(self, tokens, labels, confidences):
109
- """Group B- and I- tags into complete entities"""
110
  entities = {}
111
  current_entity = None
112
 
113
- for i, (token, label, conf) in enumerate(zip(tokens, labels, confidences)):
114
- if token in ["[CLS]", "[SEP]", "[PAD]"]:
 
 
 
 
 
 
115
  continue
 
 
116
 
117
  if label.startswith("B-"):
118
  # Save previous entity
@@ -121,7 +134,7 @@ class IndianAddressNER:
121
  if entity_type not in entities:
122
  entities[entity_type] = []
123
  entities[entity_type].append({
124
- "text": current_entity["text"].replace("##", ""),
125
  "confidence": current_entity["confidence"]
126
  })
127
 
@@ -129,16 +142,20 @@ class IndianAddressNER:
129
  entity_type = label[2:] # Remove "B-"
130
  current_entity = {
131
  "type": entity_type,
132
- "text": token,
133
- "confidence": conf
 
 
134
  }
135
 
136
  elif label.startswith("I-") and current_entity:
137
  # Continue current entity
138
  entity_type = label[2:] # Remove "I-"
139
  if entity_type == current_entity["type"]:
140
- current_entity["text"] += token
141
- current_entity["confidence"] = (current_entity["confidence"] + conf) / 2
 
 
142
 
143
  elif label == "O" and current_entity:
144
  # End current entity
@@ -146,7 +163,7 @@ class IndianAddressNER:
146
  if entity_type not in entities:
147
  entities[entity_type] = []
148
  entities[entity_type].append({
149
- "text": current_entity["text"].replace("##", ""),
150
  "confidence": current_entity["confidence"]
151
  })
152
  current_entity = None
@@ -157,7 +174,7 @@ class IndianAddressNER:
157
  if entity_type not in entities:
158
  entities[entity_type] = []
159
  entities[entity_type].append({
160
- "text": current_entity["text"].replace("##", ""),
161
  "confidence": current_entity["confidence"]
162
  })
163
 
@@ -348,7 +365,7 @@ The model uses BIO (Begin-Inside-Outside) tagging scheme:
348
  ## 🔄 Model Updates
349
 
350
  - **Version**: v1.0 (Checkpoint 20793)
351
- - **Last Updated**: 2025-06-18
352
  - **Training Completion**: Based on augmented Indian address dataset
353
  - **Base Model**: ModernBERT for advanced transformer architecture
354
 
 
75
  }
76
 
77
  def predict(self, address):
78
+ """Extract entities from an Indian address - FIXED VERSION"""
79
  if not address.strip():
80
  return {}
81
 
82
+ # Tokenize with offset mapping for better text reconstruction
83
  inputs = self.tokenizer(
84
  address,
85
  return_tensors="pt",
86
  truncation=True,
87
  padding=True,
88
+ max_length=128,
89
+ return_offsets_mapping=True
90
  )
91
+
92
+ # Extract offset mapping before moving to device
93
+ offset_mapping = inputs.pop("offset_mapping")[0]
94
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
95
 
96
  # Predict
 
100
  predicted_ids = torch.argmax(predictions, dim=-1)
101
  confidence_scores = torch.max(predictions, dim=-1)[0]
102
 
103
+ # Extract entities using offset mapping
104
+ entities = self.extract_entities_with_offsets(
105
+ address,
106
+ predicted_ids[0],
107
+ confidence_scores[0],
108
+ offset_mapping
109
+ )
110
 
 
 
111
  return entities
112
 
113
+ def extract_entities_with_offsets(self, original_text, predicted_ids, confidences, offset_mapping):
114
+ """Extract entities using offset mapping for accurate text reconstruction"""
115
  entities = {}
116
  current_entity = None
117
 
118
+ for i, (pred_id, conf) in enumerate(zip(predicted_ids, confidences)):
119
+ if i >= len(offset_mapping):
120
+ break
121
+
122
+ start, end = offset_mapping[i]
123
+
124
+ # Skip special tokens (they have (0,0) mapping)
125
+ if start == end == 0:
126
  continue
127
+
128
+ label = self.id2entity.get(str(pred_id.item()), "O")
129
 
130
  if label.startswith("B-"):
131
  # Save previous entity
 
134
  if entity_type not in entities:
135
  entities[entity_type] = []
136
  entities[entity_type].append({
137
+ "text": current_entity["text"],
138
  "confidence": current_entity["confidence"]
139
  })
140
 
 
142
  entity_type = label[2:] # Remove "B-"
143
  current_entity = {
144
  "type": entity_type,
145
+ "text": original_text[start:end],
146
+ "confidence": conf.item(),
147
+ "start": start,
148
+ "end": end
149
  }
150
 
151
  elif label.startswith("I-") and current_entity:
152
  # Continue current entity
153
  entity_type = label[2:] # Remove "I-"
154
  if entity_type == current_entity["type"]:
155
+ # Extend the entity to include this token
156
+ current_entity["text"] = original_text[current_entity["start"]:end]
157
+ current_entity["confidence"] = (current_entity["confidence"] + conf.item()) / 2
158
+ current_entity["end"] = end
159
 
160
  elif label == "O" and current_entity:
161
  # End current entity
 
163
  if entity_type not in entities:
164
  entities[entity_type] = []
165
  entities[entity_type].append({
166
+ "text": current_entity["text"],
167
  "confidence": current_entity["confidence"]
168
  })
169
  current_entity = None
 
174
  if entity_type not in entities:
175
  entities[entity_type] = []
176
  entities[entity_type].append({
177
+ "text": current_entity["text"],
178
  "confidence": current_entity["confidence"]
179
  })
180
 
 
365
  ## 🔄 Model Updates
366
 
367
  - **Version**: v1.0 (Checkpoint 20793)
368
+ - **Last Updated**: 2025-06-19
369
  - **Training Completion**: Based on augmented Indian address dataset
370
  - **Base Model**: ModernBERT for advanced transformer architecture
371
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e242a3a6d175f59836680a63e9037180d3e5d45df51eddda9cf767b7751640e
3
  size 598504388
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9058c65b4d9171fecc974ea5530a94f0e36e1f9b320999312374b3b4307bbe59
3
  size 598504388
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33148bf168dcfa2c829d4e2de706d4578b5b00a3a79f34b9876c4ed320eb2c51
3
  size 1197097035
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22021f18dcf340f6d46c8a3ccbcf129a08ee158cca4d125e92b3eec86edfe7bb
3
  size 1197097035
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e8fb40bd132881680545509b4688668f7ae4a27b2834707a357ace07774d3b0
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2d86e8e98ba5f01fdb306f1d640b368177180b6e6edfb393d5f89dbaaa4b934
3
  size 1383
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_global_step": 13862,
3
- "best_metric": 0.95041712283294,
4
- "best_model_checkpoint": "./ner_output_modernbert/combined_300percent_ModernBERT-base_20250618_100426/checkpoints/checkpoint-13862",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
  "global_step": 20793,
@@ -11,122 +11,122 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.24989179050642044,
14
- "grad_norm": 0.3928024172782898,
15
  "learning_rate": 4.5842350791131636e-05,
16
- "loss": 0.3016,
17
  "step": 1732
18
  },
19
  {
20
  "epoch": 0.49978358101284087,
21
- "grad_norm": 1.6826777458190918,
22
  "learning_rate": 4.1677487616024626e-05,
23
- "loss": 0.1843,
24
  "step": 3464
25
  },
26
  {
27
  "epoch": 0.7496753715192613,
28
- "grad_norm": 0.5472883582115173,
29
  "learning_rate": 3.751262444091762e-05,
30
- "loss": 0.1683,
31
  "step": 5196
32
  },
33
  {
34
  "epoch": 0.9995671620256817,
35
- "grad_norm": 1.2803065776824951,
36
  "learning_rate": 3.3350165921223495e-05,
37
- "loss": 0.1605,
38
  "step": 6928
39
  },
40
  {
41
  "epoch": 1.0,
42
- "eval_accuracy": 0.9449486797375143,
43
- "eval_f1": 0.9438781242313503,
44
- "eval_loss": 0.15803375840187073,
45
- "eval_precision": 0.9437059457495476,
46
- "eval_recall": 0.9449486797375143,
47
- "eval_runtime": 70.9502,
48
- "eval_samples_per_second": 275.813,
49
- "eval_steps_per_second": 17.252,
50
  "step": 6931
51
  },
52
  {
53
  "epoch": 1.2494589525321023,
54
- "grad_norm": 3.4767799377441406,
55
- "learning_rate": 2.9185302746116482e-05,
56
- "loss": 0.1231,
57
  "step": 8660
58
  },
59
  {
60
  "epoch": 1.4993507430385224,
61
- "grad_norm": 0.13331933319568634,
62
  "learning_rate": 2.5022844226422353e-05,
63
- "loss": 0.121,
64
  "step": 10392
65
  },
66
  {
67
  "epoch": 1.749242533544943,
68
- "grad_norm": 0.5231122374534607,
69
  "learning_rate": 2.0857981051315347e-05,
70
- "loss": 0.1214,
71
  "step": 12124
72
  },
73
  {
74
  "epoch": 1.9991343240513635,
75
- "grad_norm": 1.5934361219406128,
76
  "learning_rate": 1.669552253162122e-05,
77
- "loss": 0.1108,
78
  "step": 13856
79
  },
80
  {
81
  "epoch": 2.0,
82
- "eval_accuracy": 0.9505525536248465,
83
- "eval_f1": 0.95041712283294,
84
- "eval_loss": 0.14365626871585846,
85
- "eval_precision": 0.9506382247961251,
86
- "eval_recall": 0.9505525536248465,
87
- "eval_runtime": 59.4547,
88
- "eval_samples_per_second": 329.141,
89
- "eval_steps_per_second": 20.587,
90
  "step": 13862
91
  },
92
  {
93
  "epoch": 2.249026114557784,
94
- "grad_norm": 0.0014948190655559301,
95
- "learning_rate": 1.2530659356514212e-05,
96
- "loss": 0.0724,
97
  "step": 15588
98
  },
99
  {
100
  "epoch": 2.4989179050642045,
101
- "grad_norm": 0.966975212097168,
102
  "learning_rate": 8.368200836820084e-06,
103
- "loss": 0.0702,
104
  "step": 17320
105
  },
106
  {
107
  "epoch": 2.7488096955706247,
108
- "grad_norm": 0.004424717742949724,
109
  "learning_rate": 4.205742317125956e-06,
110
- "loss": 0.0669,
111
  "step": 19052
112
  },
113
  {
114
  "epoch": 2.998701486077045,
115
- "grad_norm": 1.1055166721343994,
116
  "learning_rate": 4.328379743182802e-08,
117
- "loss": 0.064,
118
  "step": 20784
119
  },
120
  {
121
  "epoch": 3.0,
122
- "eval_accuracy": 0.9507602394581222,
123
- "eval_f1": 0.9503624395424796,
124
- "eval_loss": 0.16907508671283722,
125
- "eval_precision": 0.9501301589170589,
126
- "eval_recall": 0.9507602394581222,
127
- "eval_runtime": 62.6422,
128
- "eval_samples_per_second": 312.393,
129
- "eval_steps_per_second": 19.54,
130
  "step": 20793
131
  }
132
  ],
 
1
  {
2
+ "best_global_step": 20793,
3
+ "best_metric": 0.949582664985337,
4
+ "best_model_checkpoint": "./ner_output_modernbert/combined_300percent_ModernBERT-base_20250619_055856/checkpoints/checkpoint-20793",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
  "global_step": 20793,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.24989179050642044,
14
+ "grad_norm": 0.1257457733154297,
15
  "learning_rate": 4.5842350791131636e-05,
16
+ "loss": 0.3051,
17
  "step": 1732
18
  },
19
  {
20
  "epoch": 0.49978358101284087,
21
+ "grad_norm": 0.5818042755126953,
22
  "learning_rate": 4.1677487616024626e-05,
23
+ "loss": 0.1847,
24
  "step": 3464
25
  },
26
  {
27
  "epoch": 0.7496753715192613,
28
+ "grad_norm": 0.45203471183776855,
29
  "learning_rate": 3.751262444091762e-05,
30
+ "loss": 0.1685,
31
  "step": 5196
32
  },
33
  {
34
  "epoch": 0.9995671620256817,
35
+ "grad_norm": 0.9936078786849976,
36
  "learning_rate": 3.3350165921223495e-05,
37
+ "loss": 0.1611,
38
  "step": 6928
39
  },
40
  {
41
  "epoch": 1.0,
42
+ "eval_accuracy": 0.9448721639042021,
43
+ "eval_f1": 0.9438293472729468,
44
+ "eval_loss": 0.1583022028207779,
45
+ "eval_precision": 0.9438239601502502,
46
+ "eval_recall": 0.9448721639042021,
47
+ "eval_runtime": 66.8003,
48
+ "eval_samples_per_second": 292.948,
49
+ "eval_steps_per_second": 18.323,
50
  "step": 6931
51
  },
52
  {
53
  "epoch": 1.2494589525321023,
54
+ "grad_norm": 2.6818487644195557,
55
+ "learning_rate": 2.9187707401529363e-05,
56
+ "loss": 0.1245,
57
  "step": 8660
58
  },
59
  {
60
  "epoch": 1.4993507430385224,
61
+ "grad_norm": 0.14635096490383148,
62
  "learning_rate": 2.5022844226422353e-05,
63
+ "loss": 0.1212,
64
  "step": 10392
65
  },
66
  {
67
  "epoch": 1.749242533544943,
68
+ "grad_norm": 0.5487073659896851,
69
  "learning_rate": 2.0857981051315347e-05,
70
+ "loss": 0.1218,
71
  "step": 12124
72
  },
73
  {
74
  "epoch": 1.9991343240513635,
75
+ "grad_norm": 1.3607397079467773,
76
  "learning_rate": 1.669552253162122e-05,
77
+ "loss": 0.1117,
78
  "step": 13856
79
  },
80
  {
81
  "epoch": 2.0,
82
+ "eval_accuracy": 0.9494521830695967,
83
+ "eval_f1": 0.9492911172803227,
84
+ "eval_loss": 0.14517587423324585,
85
+ "eval_precision": 0.9494150440403941,
86
+ "eval_recall": 0.9494521830695967,
87
+ "eval_runtime": 63.4366,
88
+ "eval_samples_per_second": 308.481,
89
+ "eval_steps_per_second": 19.295,
90
  "step": 13862
91
  },
92
  {
93
  "epoch": 2.249026114557784,
94
+ "grad_norm": 0.0037306402809917927,
95
+ "learning_rate": 1.2533064011927093e-05,
96
+ "loss": 0.0739,
97
  "step": 15588
98
  },
99
  {
100
  "epoch": 2.4989179050642045,
101
+ "grad_norm": 0.5977134108543396,
102
  "learning_rate": 8.368200836820084e-06,
103
+ "loss": 0.0709,
104
  "step": 17320
105
  },
106
  {
107
  "epoch": 2.7488096955706247,
108
+ "grad_norm": 0.007765794638544321,
109
  "learning_rate": 4.205742317125956e-06,
110
+ "loss": 0.0679,
111
  "step": 19052
112
  },
113
  {
114
  "epoch": 2.998701486077045,
115
+ "grad_norm": 2.047842025756836,
116
  "learning_rate": 4.328379743182802e-08,
117
+ "loss": 0.0643,
118
  "step": 20784
119
  },
120
  {
121
  "epoch": 3.0,
122
+ "eval_accuracy": 0.949984150291671,
123
+ "eval_f1": 0.949582664985337,
124
+ "eval_loss": 0.1690009981393814,
125
+ "eval_precision": 0.9493962920968388,
126
+ "eval_recall": 0.949984150291671,
127
+ "eval_runtime": 64.0908,
128
+ "eval_samples_per_second": 305.333,
129
+ "eval_steps_per_second": 19.098,
130
  "step": 20793
131
  }
132
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:982cb4223d70957ced08a996f9d01d874365c30eed8d9f65a2ca7ea7e01255b0
3
  size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f01b0496d454ec432e306869978de94d3a9263262bdc01f48f66edfae77c6ad
3
  size 5777