🏠 ModernBERT Indian Address NER Model
This model is a fine-tuned ModernBERT for Named Entity Recognition (NER) on Indian addresses. It can extract and classify various address components from Indian address text with high accuracy, leveraging ModernBERT's state-of-the-art architecture and improved efficiency.
🎯 Model Description
ModernBERT fine-tuned for Indian address Named Entity Recognition (NER)
Key Capabilities
- Address Component Extraction: Identify and classify various parts of Indian addresses
- Multi-format Support: Handle various Indian address formats and styles
- Modern Architecture: Built on ModernBERT's advanced transformer design
- High Accuracy: Fine-tuned on augmented Indian address dataset
- Fast Inference: Optimized ModernBERT for quick entity extraction
- Robust Recognition: Handles partial, incomplete, or informal addresses
- Efficient Processing: ModernBERT's improved efficiency for better performance
- State-of-the-art Base: Leverages latest transformer innovations
📊 Model Architecture
- Base Model: answerdotai/ModernBERT-base (ModernBERT)
- Model Type: Token Classification (NER)
- Vocabulary Size: 50,368 tokens
- Hidden Size: 768
- Number of Layers: 22
- Attention Heads: 12
- Max Sequence Length: 8192 tokens
- Number of Labels: 23
- Model Size: ~1716MB
- Checkpoint: 20793
🚀 Usage Examples
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import warnings
warnings.filterwarnings("ignore")
class IndianAddressNER:
    """Extract address components from Indian address text with a token classifier.

    The class loads a tokenizer and a token-classification model, runs the
    model on a single address string and groups the per-token BIO labels into
    entities (``B-x`` opens an entity of type ``x``, ``I-x`` continues it,
    ``O`` closes it).
    """

    # Checkpoint loaded when the caller does not pass ``model_name``.
    DEFAULT_MODEL_NAME = "shiprocket-ai/open-modernbert-indian-address-ner"

    # Label id (as a string key) -> BIO tag.
    DEFAULT_ID2ENTITY = {
        "0": "O",
        "1": "B-building_name",
        "2": "I-building_name",
        "3": "B-city",
        "4": "I-city",
        "5": "B-country",
        "6": "I-country",
        "7": "B-floor",
        "8": "I-floor",
        "9": "B-house_details",
        "10": "I-house_details",
        "11": "B-locality",
        "12": "I-locality",
        "13": "B-pincode",
        "14": "I-pincode",
        "15": "B-road",
        "16": "I-road",
        "17": "B-state",
        "18": "I-state",
        "19": "B-sub_locality",
        "20": "I-sub_locality",
        "21": "B-landmarks",
        "22": "I-landmarks",
    }

    def __init__(self, model_name=DEFAULT_MODEL_NAME, device=None):
        """Load the tokenizer and model and move the model to the target device.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model.
            device: Anything accepted by ``torch.device``. When ``None``,
                CUDA is used if available, otherwise the CPU.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(self.device)
        self.model.eval()

        # Per-instance copy so callers can adjust the mapping without
        # affecting other instances.
        self.id2entity = dict(self.DEFAULT_ID2ENTITY)

    def predict(self, address, max_length=128):
        """Extract entities from a single address string.

        Args:
            address: The address text. ``None``, empty and whitespace-only
                input yield an empty result.
            max_length: Token limit passed to the tokenizer; longer inputs
                are truncated.

        Returns:
            A dict mapping entity type (e.g. ``"city"``) to a list of
            ``{"text": str, "confidence": float}`` dicts, in order of
            appearance.
        """
        if not address or not address.strip():
            return {}

        # Offsets let us slice entity text straight out of the input string
        # instead of re-joining sub-word tokens.
        inputs = self.tokenizer(
            address,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
            return_offsets_mapping=True,
        )

        # The model's forward pass does not take offsets, so remove them
        # before building its keyword arguments.
        offset_mapping = inputs.pop("offset_mapping")[0]
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            # One reduction gives both the winning probability and its index.
            confidence_scores, predicted_ids = torch.max(probabilities, dim=-1)

        return self.extract_entities_with_offsets(
            address,
            predicted_ids[0],
            confidence_scores[0],
            offset_mapping,
        )

    def extract_entities_with_offsets(self, original_text, predicted_ids, confidences, offset_mapping):
        """Group per-token predictions into entities using character offsets.

        Args:
            original_text: The string the offsets refer to.
            predicted_ids: Per-token label ids (1-D tensor or sequence).
            confidences: Per-token confidence of the predicted label
                (1-D tensor or sequence).
            offset_mapping: Per-token ``(start, end)`` character offsets
                (tensor or sequence of pairs).

        Returns:
            A dict mapping entity type to a list of
            ``{"text": str, "confidence": float}`` dicts. ``confidence`` is
            the arithmetic mean of the confidences of the entity's tokens.

        Notes:
            * Tokens whose offsets are ``(0, 0)`` are skipped (treated as
              special tokens).
            * An ``I-`` tag with no open entity is ignored.
            * An ``I-`` tag whose type differs from the open entity closes
              that entity, so a span never covers text that was labelled as
              something else.
        """
        # Convert once up front: avoids one device sync per token and lets
        # plain Python sequences be passed as well.
        ids = self._to_list(predicted_ids)
        scores = self._to_list(confidences)
        offsets = self._to_list(offset_mapping)

        entities = {}
        current = None  # the entity currently being built, if any

        # zip stops at the shortest input, so extra predictions without
        # offsets are never visited.
        for pred_id, score, (start, end) in zip(ids, scores, offsets):
            start, end = int(start), int(end)
            if start == end == 0:
                continue

            label = self.id2entity.get(str(int(pred_id)), "O")

            if label.startswith("B-"):
                self._flush_entity(entities, original_text, current)
                current = {
                    "type": label[2:],
                    "start": start,
                    "end": end,
                    "confidence_sum": float(score),
                    "token_count": 1,
                }
            elif label.startswith("I-"):
                if current is None:
                    continue
                if label[2:] == current["type"]:
                    current["end"] = end
                    current["confidence_sum"] += float(score)
                    current["token_count"] += 1
                else:
                    self._flush_entity(entities, original_text, current)
                    current = None
            elif label == "O" and current is not None:
                self._flush_entity(entities, original_text, current)
                current = None

        self._flush_entity(entities, original_text, current)
        return entities

    @staticmethod
    def _to_list(values):
        """Return ``values`` as nested Python lists (tensors via ``tolist()``)."""
        return values.tolist() if hasattr(values, "tolist") else list(values)

    @staticmethod
    def _flush_entity(entities, original_text, current):
        """Append the finished entity ``current`` (if any) to ``entities``."""
        if current is None:
            return
        entities.setdefault(current["type"], []).append({
            "text": original_text[current["start"]:current["end"]],
            "confidence": current["confidence_sum"] / current["token_count"],
        })
# Usage example: load the model once and print the entities found in a few
# sample addresses.
ner = IndianAddressNER()

# Test addresses
test_addresses = [
    "Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058",
    "DLF Cyber City, Sector 25, Gurgaon, Haryana",
    "Flat 201, MG Road, Bangalore, Karnataka, 560001",
    "Phoenix Mall, Kurla West, Mumbai",
]

# NOTE(review): the emoji in the strings below replace mis-decoded residue
# (e.g. "π’", "β"). The traffic-light, label and cross glyphs are recovered
# from that residue; the house and pin glyphs are best guesses - confirm
# against the original card.
print("🏠 INDIAN ADDRESS NER EXAMPLES")
print("=" * 50)

for address in test_addresses:
    print(f"\n📍 Address: {address}")
    entities = ner.predict(address)
    if entities:
        # Sort by entity type so the output order is stable across runs.
        for entity_type, entity_list in sorted(entities.items()):
            print(f"🏷️ {entity_type.replace('_', ' ').title()}:")
            for entity in entity_list:
                confidence = entity['confidence']
                text = entity['text']
                # Traffic-light marker: > 0.8 green, > 0.6 yellow, else red.
                confidence_icon = "🟢" if confidence > 0.8 else "🟡" if confidence > 0.6 else "🔴"
                print(f" {confidence_icon} {text} (confidence: {confidence:.3f})")
    else:
        print("❌ No entities found")
    print("-" * 40)
🏷️ Supported Entity Types
The model can identify and extract the following address components:
- Building Name: building_name
- City: city
- Country: country
- Floor: floor
- House Details: house_details
- Landmarks: landmarks
- Locality: locality
- Pincode: pincode
- Road: road
- State: state
- Sub Locality: sub_locality
📈 Performance Highlights
- Indian Address Optimized: Specialized for Indian address patterns and formats
- ModernBERT Advantage: State-of-the-art transformer architecture
- High Precision: Accurate entity boundary detection
- Multi-component Recognition: Identifies multiple entities in complex addresses
- Confidence Scoring: Provides confidence scores for each extracted entity
- Fast Inference: Optimized for real-time applications
- Robust Handling: Works with partial or informal address formats
- Efficient Architecture: ModernBERT's improved computational efficiency
- Advanced Understanding: Better contextual comprehension with modern design
🔧 Training Details
- Dataset: 300% augmented Indian address dataset
- Training Strategy: Fine-tuned from pre-trained ModernBERT
- Specialization: Indian address entity extraction
- Context Length: 128 tokens
- Version: v1.0
- Framework: PyTorch + Transformers
- BIO Tagging: Uses Begin-Inside-Outside tagging scheme
- Base Model Advantage: ModernBERT's advanced architecture and efficiency
💡 Use Cases
1. Address Parsing & Standardization
- Parse unstructured address text into components
- Standardize address formats for databases
- Extract specific components for validation
2. Form Auto-completion
- Auto-fill address forms by extracting components
- Validate address field completeness
- Suggest corrections for incomplete addresses
3. Data Processing & Migration
- Clean legacy address databases
- Extract structured data from unstructured text
- Migrate addresses between different systems
4. Logistics & Delivery
- Extract delivery-relevant components
- Validate address completeness for shipping
- Improve address accuracy for last-mile delivery
5. Geocoding Preprocessing
- Prepare addresses for geocoding APIs
- Extract location components for mapping
- Improve geocoding accuracy with clean components
⚡ Performance Tips
- Input Length: Keep addresses under 128 tokens for optimal performance
- Batch Processing: Process multiple addresses in batches for efficiency
- GPU Usage: Use GPU for faster inference on large datasets
- Confidence Filtering: Filter results by confidence score for higher precision
- Text Preprocessing: Clean input text for better recognition
- ModernBERT Advantage: Model benefits from advanced architecture optimizations
⚠️ Limitations
- Language Support: Primarily optimized for English Indian addresses
- Regional Variations: May struggle with highly regional or colloquial formats
- New Localities: Performance may vary on very recent developments
- Complex Formatting: May have difficulty with highly unstructured text
- Context Dependency: Works best with clear address context
📋 Entity Mapping
The model uses BIO (Begin-Inside-Outside) tagging scheme:
{
"entity2id": {
"O": 0,
"B-building_name": 1,
"I-building_name": 2,
"B-city": 3,
"I-city": 4,
"B-country": 5,
"I-country": 6,
"B-floor": 7,
"I-floor": 8,
"B-house_details": 9,
"I-house_details": 10,
"B-locality": 11,
"I-locality": 12,
"B-pincode": 13,
"I-pincode": 14,
"B-road": 15,
"I-road": 16,
"B-state": 17,
"I-state": 18,
"B-sub_locality": 19,
"I-sub_locality": 20,
"B-landmarks": 21,
"I-landmarks": 22
},
"id2entity": {
"0": "O",
"1": "B-building_name",
"2": "I-building_name",
"3": "B-city",
"4": "I-city",
"5": "B-country",
"6": "I-country",
"7": "B-floor",
"8": "I-floor",
"9": "B-house_details",
"10": "I-house_details",
"11": "B-locality",
"12": "I-locality",
"13": "B-pincode",
"14": "I-pincode",
"15": "B-road",
"16": "I-road",
"17": "B-state",
"18": "I-state",
"19": "B-sub_locality",
"20": "I-sub_locality",
"21": "B-landmarks",
"22": "I-landmarks"
}
}
📁 Model Files
- config.json: Model configuration and hyperparameters
- pytorch_model.bin / model.safetensors: Model weights
- tokenizer.json: Tokenizer configuration
- tokenizer_config.json: Tokenizer settings
- vocab.txt: Vocabulary file
- entity_mappings.json: Entity type mappings
🔄 Model Updates
- Version: v1.0 (Checkpoint 20793)
- Last Updated: 2025-06-19
- Training Completion: Based on augmented Indian address dataset
- Base Model: ModernBERT for advanced transformer architecture
📚 Citation
If you use this model in your research or applications, please cite:
@misc{open-modernbert-indian-address-ner,
title={ModernBERT Indian Address NER Model},
year={2025},
publisher={Hugging Face},
url={https://huggingface.co/shiprocket-ai/open-modernbert-indian-address-ner}
}
📞 Support & Contact
For questions, issues, or feature requests:
- Open an issue in this repository
- Contact: shiprocket-ai team
- Documentation: See usage examples above
📄 License
This model is released under the Apache 2.0 License. See LICENSE file for details.
Specialized for Indian address entity recognition - Built with ❤️ by shiprocket-ai team using ModernBERT
- Downloads last month
- 16