Upload Indian Address NER model (checkpoint-20793)
Browse files
- README.md +38 -21
- model.safetensors +1 -1
- optimizer.pt +1 -1
- scaler.pt +1 -1
- trainer_state.json +53 -53
- training_args.bin +1 -1
README.md
CHANGED
@@ -75,18 +75,22 @@ class IndianAddressNER:
|
|
75 |
}
|
76 |
|
77 |
def predict(self, address):
|
78 |
-
"""Extract entities from an Indian address"""
|
79 |
if not address.strip():
|
80 |
return {}
|
81 |
|
82 |
-
# Tokenize
|
83 |
inputs = self.tokenizer(
|
84 |
address,
|
85 |
return_tensors="pt",
|
86 |
truncation=True,
|
87 |
padding=True,
|
88 |
-
max_length=128
|
|
|
89 |
)
|
|
|
|
|
|
|
90 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
91 |
|
92 |
# Predict
|
@@ -96,23 +100,32 @@ class IndianAddressNER:
|
|
96 |
predicted_ids = torch.argmax(predictions, dim=-1)
|
97 |
confidence_scores = torch.max(predictions, dim=-1)[0]
|
98 |
|
99 |
-
#
|
100 |
-
|
101 |
-
|
102 |
-
|
|
|
|
|
|
|
103 |
|
104 |
-
# Group entities
|
105 |
-
entities = self.group_entities(tokens, predicted_labels, confidences)
|
106 |
return entities
|
107 |
|
108 |
-
def
|
109 |
-
"""
|
110 |
entities = {}
|
111 |
current_entity = None
|
112 |
|
113 |
-
for i, (
|
114 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
continue
|
|
|
|
|
116 |
|
117 |
if label.startswith("B-"):
|
118 |
# Save previous entity
|
@@ -121,7 +134,7 @@ class IndianAddressNER:
|
|
121 |
if entity_type not in entities:
|
122 |
entities[entity_type] = []
|
123 |
entities[entity_type].append({
|
124 |
-
"text": current_entity["text"]
|
125 |
"confidence": current_entity["confidence"]
|
126 |
})
|
127 |
|
@@ -129,16 +142,20 @@ class IndianAddressNER:
|
|
129 |
entity_type = label[2:] # Remove "B-"
|
130 |
current_entity = {
|
131 |
"type": entity_type,
|
132 |
-
"text":
|
133 |
-
"confidence": conf
|
|
|
|
|
134 |
}
|
135 |
|
136 |
elif label.startswith("I-") and current_entity:
|
137 |
# Continue current entity
|
138 |
entity_type = label[2:] # Remove "I-"
|
139 |
if entity_type == current_entity["type"]:
|
140 |
-
|
141 |
-
current_entity["
|
|
|
|
|
142 |
|
143 |
elif label == "O" and current_entity:
|
144 |
# End current entity
|
@@ -146,7 +163,7 @@ class IndianAddressNER:
|
|
146 |
if entity_type not in entities:
|
147 |
entities[entity_type] = []
|
148 |
entities[entity_type].append({
|
149 |
-
"text": current_entity["text"]
|
150 |
"confidence": current_entity["confidence"]
|
151 |
})
|
152 |
current_entity = None
|
@@ -157,7 +174,7 @@ class IndianAddressNER:
|
|
157 |
if entity_type not in entities:
|
158 |
entities[entity_type] = []
|
159 |
entities[entity_type].append({
|
160 |
-
"text": current_entity["text"]
|
161 |
"confidence": current_entity["confidence"]
|
162 |
})
|
163 |
|
@@ -348,7 +365,7 @@ The model uses BIO (Begin-Inside-Outside) tagging scheme:
|
|
348 |
## 🔄 Model Updates
|
349 |
|
350 |
- **Version**: v1.0 (Checkpoint 20793)
|
351 |
-
- **Last Updated**: 2025-06-
|
352 |
- **Training Completion**: Based on augmented Indian address dataset
|
353 |
- **Base Model**: ModernBERT for advanced transformer architecture
|
354 |
|
|
|
75 |
}
|
76 |
|
77 |
def predict(self, address):
|
78 |
+
"""Extract entities from an Indian address - FIXED VERSION"""
|
79 |
if not address.strip():
|
80 |
return {}
|
81 |
|
82 |
+
# Tokenize with offset mapping for better text reconstruction
|
83 |
inputs = self.tokenizer(
|
84 |
address,
|
85 |
return_tensors="pt",
|
86 |
truncation=True,
|
87 |
padding=True,
|
88 |
+
max_length=128,
|
89 |
+
return_offsets_mapping=True
|
90 |
)
|
91 |
+
|
92 |
+
# Extract offset mapping before moving to device
|
93 |
+
offset_mapping = inputs.pop("offset_mapping")[0]
|
94 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
95 |
|
96 |
# Predict
|
|
|
100 |
predicted_ids = torch.argmax(predictions, dim=-1)
|
101 |
confidence_scores = torch.max(predictions, dim=-1)[0]
|
102 |
|
103 |
+
# Extract entities using offset mapping
|
104 |
+
entities = self.extract_entities_with_offsets(
|
105 |
+
address,
|
106 |
+
predicted_ids[0],
|
107 |
+
confidence_scores[0],
|
108 |
+
offset_mapping
|
109 |
+
)
|
110 |
|
|
|
|
|
111 |
return entities
|
112 |
|
113 |
+
def extract_entities_with_offsets(self, original_text, predicted_ids, confidences, offset_mapping):
|
114 |
+
"""Extract entities using offset mapping for accurate text reconstruction"""
|
115 |
entities = {}
|
116 |
current_entity = None
|
117 |
|
118 |
+
for i, (pred_id, conf) in enumerate(zip(predicted_ids, confidences)):
|
119 |
+
if i >= len(offset_mapping):
|
120 |
+
break
|
121 |
+
|
122 |
+
start, end = offset_mapping[i]
|
123 |
+
|
124 |
+
# Skip special tokens (they have (0,0) mapping)
|
125 |
+
if start == end == 0:
|
126 |
continue
|
127 |
+
|
128 |
+
label = self.id2entity.get(str(pred_id.item()), "O")
|
129 |
|
130 |
if label.startswith("B-"):
|
131 |
# Save previous entity
|
|
|
134 |
if entity_type not in entities:
|
135 |
entities[entity_type] = []
|
136 |
entities[entity_type].append({
|
137 |
+
"text": current_entity["text"],
|
138 |
"confidence": current_entity["confidence"]
|
139 |
})
|
140 |
|
|
|
142 |
entity_type = label[2:] # Remove "B-"
|
143 |
current_entity = {
|
144 |
"type": entity_type,
|
145 |
+
"text": original_text[start:end],
|
146 |
+
"confidence": conf.item(),
|
147 |
+
"start": start,
|
148 |
+
"end": end
|
149 |
}
|
150 |
|
151 |
elif label.startswith("I-") and current_entity:
|
152 |
# Continue current entity
|
153 |
entity_type = label[2:] # Remove "I-"
|
154 |
if entity_type == current_entity["type"]:
|
155 |
+
# Extend the entity to include this token
|
156 |
+
current_entity["text"] = original_text[current_entity["start"]:end]
|
157 |
+
current_entity["confidence"] = (current_entity["confidence"] + conf.item()) / 2
|
158 |
+
current_entity["end"] = end
|
159 |
|
160 |
elif label == "O" and current_entity:
|
161 |
# End current entity
|
|
|
163 |
if entity_type not in entities:
|
164 |
entities[entity_type] = []
|
165 |
entities[entity_type].append({
|
166 |
+
"text": current_entity["text"],
|
167 |
"confidence": current_entity["confidence"]
|
168 |
})
|
169 |
current_entity = None
|
|
|
174 |
if entity_type not in entities:
|
175 |
entities[entity_type] = []
|
176 |
entities[entity_type].append({
|
177 |
+
"text": current_entity["text"],
|
178 |
"confidence": current_entity["confidence"]
|
179 |
})
|
180 |
|
|
|
365 |
## 🔄 Model Updates
|
366 |
|
367 |
- **Version**: v1.0 (Checkpoint 20793)
|
368 |
+
- **Last Updated**: 2025-06-19
|
369 |
- **Training Completion**: Based on augmented Indian address dataset
|
370 |
- **Base Model**: ModernBERT for advanced transformer architecture
|
371 |
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 598504388
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9058c65b4d9171fecc974ea5530a94f0e36e1f9b320999312374b3b4307bbe59
|
3 |
size 598504388
|
optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1197097035
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22021f18dcf340f6d46c8a3ccbcf129a08ee158cca4d125e92b3eec86edfe7bb
|
3 |
size 1197097035
|
scaler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1383
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2d86e8e98ba5f01fdb306f1d640b368177180b6e6edfb393d5f89dbaaa4b934
|
3 |
size 1383
|
trainer_state.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
{
|
2 |
-
"best_global_step":
|
3 |
-
"best_metric": 0.
|
4 |
-
"best_model_checkpoint": "./ner_output_modernbert/combined_300percent_ModernBERT-
|
5 |
"epoch": 3.0,
|
6 |
"eval_steps": 500,
|
7 |
"global_step": 20793,
|
@@ -11,122 +11,122 @@
|
|
11 |
"log_history": [
|
12 |
{
|
13 |
"epoch": 0.24989179050642044,
|
14 |
-
"grad_norm": 0.
|
15 |
"learning_rate": 4.5842350791131636e-05,
|
16 |
-
"loss": 0.
|
17 |
"step": 1732
|
18 |
},
|
19 |
{
|
20 |
"epoch": 0.49978358101284087,
|
21 |
-
"grad_norm":
|
22 |
"learning_rate": 4.1677487616024626e-05,
|
23 |
-
"loss": 0.
|
24 |
"step": 3464
|
25 |
},
|
26 |
{
|
27 |
"epoch": 0.7496753715192613,
|
28 |
-
"grad_norm": 0.
|
29 |
"learning_rate": 3.751262444091762e-05,
|
30 |
-
"loss": 0.
|
31 |
"step": 5196
|
32 |
},
|
33 |
{
|
34 |
"epoch": 0.9995671620256817,
|
35 |
-
"grad_norm":
|
36 |
"learning_rate": 3.3350165921223495e-05,
|
37 |
-
"loss": 0.
|
38 |
"step": 6928
|
39 |
},
|
40 |
{
|
41 |
"epoch": 1.0,
|
42 |
-
"eval_accuracy": 0.
|
43 |
-
"eval_f1": 0.
|
44 |
-
"eval_loss": 0.
|
45 |
-
"eval_precision": 0.
|
46 |
-
"eval_recall": 0.
|
47 |
-
"eval_runtime":
|
48 |
-
"eval_samples_per_second":
|
49 |
-
"eval_steps_per_second":
|
50 |
"step": 6931
|
51 |
},
|
52 |
{
|
53 |
"epoch": 1.2494589525321023,
|
54 |
-
"grad_norm":
|
55 |
-
"learning_rate": 2.
|
56 |
-
"loss": 0.
|
57 |
"step": 8660
|
58 |
},
|
59 |
{
|
60 |
"epoch": 1.4993507430385224,
|
61 |
-
"grad_norm": 0.
|
62 |
"learning_rate": 2.5022844226422353e-05,
|
63 |
-
"loss": 0.
|
64 |
"step": 10392
|
65 |
},
|
66 |
{
|
67 |
"epoch": 1.749242533544943,
|
68 |
-
"grad_norm": 0.
|
69 |
"learning_rate": 2.0857981051315347e-05,
|
70 |
-
"loss": 0.
|
71 |
"step": 12124
|
72 |
},
|
73 |
{
|
74 |
"epoch": 1.9991343240513635,
|
75 |
-
"grad_norm": 1.
|
76 |
"learning_rate": 1.669552253162122e-05,
|
77 |
-
"loss": 0.
|
78 |
"step": 13856
|
79 |
},
|
80 |
{
|
81 |
"epoch": 2.0,
|
82 |
-
"eval_accuracy": 0.
|
83 |
-
"eval_f1": 0.
|
84 |
-
"eval_loss": 0.
|
85 |
-
"eval_precision": 0.
|
86 |
-
"eval_recall": 0.
|
87 |
-
"eval_runtime":
|
88 |
-
"eval_samples_per_second":
|
89 |
-
"eval_steps_per_second":
|
90 |
"step": 13862
|
91 |
},
|
92 |
{
|
93 |
"epoch": 2.249026114557784,
|
94 |
-
"grad_norm": 0.
|
95 |
-
"learning_rate": 1.
|
96 |
-
"loss": 0.
|
97 |
"step": 15588
|
98 |
},
|
99 |
{
|
100 |
"epoch": 2.4989179050642045,
|
101 |
-
"grad_norm": 0.
|
102 |
"learning_rate": 8.368200836820084e-06,
|
103 |
-
"loss": 0.
|
104 |
"step": 17320
|
105 |
},
|
106 |
{
|
107 |
"epoch": 2.7488096955706247,
|
108 |
-
"grad_norm": 0.
|
109 |
"learning_rate": 4.205742317125956e-06,
|
110 |
-
"loss": 0.
|
111 |
"step": 19052
|
112 |
},
|
113 |
{
|
114 |
"epoch": 2.998701486077045,
|
115 |
-
"grad_norm":
|
116 |
"learning_rate": 4.328379743182802e-08,
|
117 |
-
"loss": 0.
|
118 |
"step": 20784
|
119 |
},
|
120 |
{
|
121 |
"epoch": 3.0,
|
122 |
-
"eval_accuracy": 0.
|
123 |
-
"eval_f1": 0.
|
124 |
-
"eval_loss": 0.
|
125 |
-
"eval_precision": 0.
|
126 |
-
"eval_recall": 0.
|
127 |
-
"eval_runtime":
|
128 |
-
"eval_samples_per_second":
|
129 |
-
"eval_steps_per_second": 19.
|
130 |
"step": 20793
|
131 |
}
|
132 |
],
|
|
|
1 |
{
|
2 |
+
"best_global_step": 20793,
|
3 |
+
"best_metric": 0.949582664985337,
|
4 |
+
"best_model_checkpoint": "./ner_output_modernbert/combined_300percent_ModernBERT-base_20250619_055856/checkpoints/checkpoint-20793",
|
5 |
"epoch": 3.0,
|
6 |
"eval_steps": 500,
|
7 |
"global_step": 20793,
|
|
|
11 |
"log_history": [
|
12 |
{
|
13 |
"epoch": 0.24989179050642044,
|
14 |
+
"grad_norm": 0.1257457733154297,
|
15 |
"learning_rate": 4.5842350791131636e-05,
|
16 |
+
"loss": 0.3051,
|
17 |
"step": 1732
|
18 |
},
|
19 |
{
|
20 |
"epoch": 0.49978358101284087,
|
21 |
+
"grad_norm": 0.5818042755126953,
|
22 |
"learning_rate": 4.1677487616024626e-05,
|
23 |
+
"loss": 0.1847,
|
24 |
"step": 3464
|
25 |
},
|
26 |
{
|
27 |
"epoch": 0.7496753715192613,
|
28 |
+
"grad_norm": 0.45203471183776855,
|
29 |
"learning_rate": 3.751262444091762e-05,
|
30 |
+
"loss": 0.1685,
|
31 |
"step": 5196
|
32 |
},
|
33 |
{
|
34 |
"epoch": 0.9995671620256817,
|
35 |
+
"grad_norm": 0.9936078786849976,
|
36 |
"learning_rate": 3.3350165921223495e-05,
|
37 |
+
"loss": 0.1611,
|
38 |
"step": 6928
|
39 |
},
|
40 |
{
|
41 |
"epoch": 1.0,
|
42 |
+
"eval_accuracy": 0.9448721639042021,
|
43 |
+
"eval_f1": 0.9438293472729468,
|
44 |
+
"eval_loss": 0.1583022028207779,
|
45 |
+
"eval_precision": 0.9438239601502502,
|
46 |
+
"eval_recall": 0.9448721639042021,
|
47 |
+
"eval_runtime": 66.8003,
|
48 |
+
"eval_samples_per_second": 292.948,
|
49 |
+
"eval_steps_per_second": 18.323,
|
50 |
"step": 6931
|
51 |
},
|
52 |
{
|
53 |
"epoch": 1.2494589525321023,
|
54 |
+
"grad_norm": 2.6818487644195557,
|
55 |
+
"learning_rate": 2.9187707401529363e-05,
|
56 |
+
"loss": 0.1245,
|
57 |
"step": 8660
|
58 |
},
|
59 |
{
|
60 |
"epoch": 1.4993507430385224,
|
61 |
+
"grad_norm": 0.14635096490383148,
|
62 |
"learning_rate": 2.5022844226422353e-05,
|
63 |
+
"loss": 0.1212,
|
64 |
"step": 10392
|
65 |
},
|
66 |
{
|
67 |
"epoch": 1.749242533544943,
|
68 |
+
"grad_norm": 0.5487073659896851,
|
69 |
"learning_rate": 2.0857981051315347e-05,
|
70 |
+
"loss": 0.1218,
|
71 |
"step": 12124
|
72 |
},
|
73 |
{
|
74 |
"epoch": 1.9991343240513635,
|
75 |
+
"grad_norm": 1.3607397079467773,
|
76 |
"learning_rate": 1.669552253162122e-05,
|
77 |
+
"loss": 0.1117,
|
78 |
"step": 13856
|
79 |
},
|
80 |
{
|
81 |
"epoch": 2.0,
|
82 |
+
"eval_accuracy": 0.9494521830695967,
|
83 |
+
"eval_f1": 0.9492911172803227,
|
84 |
+
"eval_loss": 0.14517587423324585,
|
85 |
+
"eval_precision": 0.9494150440403941,
|
86 |
+
"eval_recall": 0.9494521830695967,
|
87 |
+
"eval_runtime": 63.4366,
|
88 |
+
"eval_samples_per_second": 308.481,
|
89 |
+
"eval_steps_per_second": 19.295,
|
90 |
"step": 13862
|
91 |
},
|
92 |
{
|
93 |
"epoch": 2.249026114557784,
|
94 |
+
"grad_norm": 0.0037306402809917927,
|
95 |
+
"learning_rate": 1.2533064011927093e-05,
|
96 |
+
"loss": 0.0739,
|
97 |
"step": 15588
|
98 |
},
|
99 |
{
|
100 |
"epoch": 2.4989179050642045,
|
101 |
+
"grad_norm": 0.5977134108543396,
|
102 |
"learning_rate": 8.368200836820084e-06,
|
103 |
+
"loss": 0.0709,
|
104 |
"step": 17320
|
105 |
},
|
106 |
{
|
107 |
"epoch": 2.7488096955706247,
|
108 |
+
"grad_norm": 0.007765794638544321,
|
109 |
"learning_rate": 4.205742317125956e-06,
|
110 |
+
"loss": 0.0679,
|
111 |
"step": 19052
|
112 |
},
|
113 |
{
|
114 |
"epoch": 2.998701486077045,
|
115 |
+
"grad_norm": 2.047842025756836,
|
116 |
"learning_rate": 4.328379743182802e-08,
|
117 |
+
"loss": 0.0643,
|
118 |
"step": 20784
|
119 |
},
|
120 |
{
|
121 |
"epoch": 3.0,
|
122 |
+
"eval_accuracy": 0.949984150291671,
|
123 |
+
"eval_f1": 0.949582664985337,
|
124 |
+
"eval_loss": 0.1690009981393814,
|
125 |
+
"eval_precision": 0.9493962920968388,
|
126 |
+
"eval_recall": 0.949984150291671,
|
127 |
+
"eval_runtime": 64.0908,
|
128 |
+
"eval_samples_per_second": 305.333,
|
129 |
+
"eval_steps_per_second": 19.098,
|
130 |
"step": 20793
|
131 |
}
|
132 |
],
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5777
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f01b0496d454ec432e306869978de94d3a9263262bdc01f48f66edfae77c6ad
|
3 |
size 5777
|