cahya-indonesian-ner-tuned / inference_example.py
asmud's picture
Upload folder using huggingface_hub
a1347f3 verified
raw
history blame
6.65 kB
#!/usr/bin/env python3
"""
Indonesian NER BERT - Inference Example
========================================
This script demonstrates how to use the Indonesian NER BERT model
for named entity recognition on Indonesian text.
Usage:
python inference_example.py
"""
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
def load_model(model_name_or_path="asmud/cahya-indonesian-ner-tuned"):
    """Load the tokenizer and token-classification model for Indonesian NER.

    Args:
        model_name_or_path: Identifier or path forwarded unchanged to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForTokenClassification.from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple on success, or ``(None, None)`` when
        loading raised any exception. The error is printed, not re-raised.
    """
    print("🔄 Loading Indonesian NER BERT model...")
    try:
        # Only the two calls that can actually fail live inside the try body.
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
    except Exception as e:
        # Deliberately broad: this is a demo entry point, and callers rely on
        # the (None, None) result to skip their demonstration gracefully.
        print(f"❌ Error loading model: {e}")
        return None, None

    print("✅ Model loaded successfully!")
    return tokenizer, model
def create_ner_pipeline(model, tokenizer):
    """Wrap an already-loaded model and tokenizer in a ``"ner"`` pipeline.

    Args:
        model: Token-classification model handed to ``pipeline``.
        tokenizer: Tokenizer handed to ``pipeline``.

    Returns:
        The object returned by ``pipeline`` configured with the ``"simple"``
        aggregation strategy.
    """
    # Device index 0 is requested when CUDA is available, otherwise -1.
    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    pipeline_options = {
        "model": model,
        "tokenizer": tokenizer,
        "aggregation_strategy": "simple",
        "device": device_index,
    }
    return pipeline("ner", **pipeline_options)
def demonstrate_basic_usage():
    """Run the NER pipeline over a few sample sentences and print the entities.

    Loads the model with ``load_model`` and builds the pipeline with
    ``create_ner_pipeline``. Returns without further output when loading
    failed (``load_model`` has already printed the error).
    """
    print("\n🎯 BASIC USAGE DEMONSTRATION")
    print("=" * 50)

    # Load model
    tokenizer, model = load_model()
    # load_model signals failure with (None, None); compare against None
    # explicitly instead of relying on the truthiness of the loaded objects.
    if model is None or tokenizer is None:
        return

    # Create pipeline
    ner_pipeline = create_ner_pipeline(model, tokenizer)

    # Example texts
    example_texts = [
        "Presiden Joko Widodo menghadiri rapat di Gedung DPR pada 15 Januari 2024.",
        "Bank Indonesia menetapkan suku bunga 5.75 persen untuk mendorong investasi.",
        "Kementerian Kesehatan mengalokasikan dana sebesar 10 miliar rupiah untuk program vaksinasi.",
        "Gubernur Jawa Barat meresmikan Bandara Internasional Kertajati di Majalengka.",
        "Mahkamah Konstitusi memutuskan UU No. 12 Tahun 2023 tentang Pemilu tidak bertentangan dengan konstitusi.",
    ]

    for i, text in enumerate(example_texts, 1):
        print(f"\n📝 Example {i}:")
        print(f"Text: {text}")
        print("Entities found:")

        # Get NER results
        results = ner_pipeline(text)

        if results:
            for entity in results:
                print(f" 🏷️ {entity['entity_group']:>6}: {entity['word']:<20} (confidence: {entity['score']:.3f})")
        else:
            print(" No entities found.")

        # Visual separator between examples
        print("-" * 80)
def demonstrate_custom_inference():
    """Show token-level NER predictions made by calling the model directly.

    Tokenizes one sample sentence, runs the model without the pipeline helper,
    and prints a table of every non-special token with its predicted label and
    softmax confidence. Returns early when ``load_model`` could not provide a
    model and tokenizer.
    """
    print("\n🔧 CUSTOM INFERENCE DEMONSTRATION")
    print("=" * 50)

    # Load model components
    tokenizer, model = load_model()
    # load_model signals failure with (None, None); compare against None
    # explicitly instead of relying on the truthiness of the loaded objects.
    if model is None or tokenizer is None:
        return

    # Marker tokens that are left out of the printed table.
    special_tokens = ('[CLS]', '[SEP]', '[PAD]')

    def predict_tokens(text):
        """Return ``(token, label, confidence)`` for each non-special token of *text*."""
        # Tokenize
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            # One reduction over the class axis yields both the winning class
            # id and its probability for every token of the first sequence.
            confidences, label_ids = probabilities[0].max(dim=-1)

        # Convert to readable format, filtering out special tokens
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        id2label = model.config.id2label
        return [
            (token, id2label[label_id], confidence)
            for token, label_id, confidence in zip(
                tokens, label_ids.tolist(), confidences.tolist()
            )
            if token not in special_tokens
        ]

    # Example text
    text = "Menteri Retno Marsudi bertemu dengan delegasi ASEAN di Hotel Indonesia pada pukul 14.30 WIB."

    print(f"Text: {text}")
    print("\nToken-level predictions:")
    print(f"{'Token':<15} {'Label':<8} {'Confidence':<10}")
    print("-" * 35)

    for token, label, conf in predict_tokens(text):
        # Clean up subword tokens: drop only the leading '##' marker so any
        # '##' occurring elsewhere in a token is left intact.
        display_token = token[2:] if token.startswith('##') else token
        print(f"{display_token:<15} {label:<8} {conf:<10.3f}")
def demonstrate_entity_types():
    """Run the NER pipeline on one sample sentence per entity category.

    Each key of the examples mapping is a human-readable category heading and
    each value is the sentence fed to the pipeline; the detected entities are
    printed under the heading. Returns early when ``load_model`` could not
    provide a model and tokenizer.
    """
    print("\n🏷️ SUPPORTED ENTITY TYPES DEMONSTRATION")
    print("=" * 50)

    # Load model
    tokenizer, model = load_model()
    # load_model signals failure with (None, None); compare against None
    # explicitly instead of relying on the truthiness of the loaded objects.
    if model is None or tokenizer is None:
        return

    ner_pipeline = create_ner_pipeline(model, tokenizer)

    # Examples showcasing different entity types
    entity_examples = {
        "Person (PER)": "Menteri Budi Gunadi Sadikin memberikan keterangan pers.",
        "Organization (ORG)": "PT Telkom Indonesia meluncurkan layanan 5G terbaru.",
        "Location (LOC)": "Wisatawan mengunjungi Danau Toba dan Gunung Bromo.",
        "Geopolitical (GPE)": "Delegasi dari Jakarta bertemu dengan perwakilan Surabaya.",
        "Date (DAT)": "Acara dilaksanakan pada 17 Agustus 2024.",
        "Time (TIM)": "Rapat dimulai pukul 09.00 WIB.",
        "Money (MON)": "Anggaran sebesar 50 miliar rupiah telah disetujui.",
        "Percentage (PCT)": "Inflasi naik 3.2 persen bulan ini.",
        "Quantity (QTY)": "Bantuan berupa 500 ton beras disalurkan.",
        "Facility (FAC)": "Peresmian Bandara Soekarno-Hatta Terminal 4.",
        "Law (LAW)": "UU No. 23 Tahun 2014 tentang Pemerintahan Daerah.",
        "Event (EVT)": "Konferensi Asia-Pasifik 2024 akan digelar bulan depan.",
    }

    for category, text in entity_examples.items():
        print(f"\n📂 {category}:")
        print(f" Text: {text}")
        print(" Entities:")

        results = ner_pipeline(text)
        if results:
            for entity in results:
                print(f" • {entity['entity_group']}: {entity['word']} ({entity['score']:.3f})")
        else:
            print(" No entities detected")
def main():
    """Print the banner, run the three demonstrations in order, then a closing note."""
    print("🇮🇩 Indonesian NER BERT - Inference Examples")
    print("=" * 60)
    print("This script demonstrates various ways to use the Indonesian NER BERT model")
    print("for named entity recognition in Indonesian text.")

    # Run demonstrations; each one loads the model itself and skips its
    # output if loading fails.
    demonstrate_basic_usage()
    demonstrate_custom_inference()
    demonstrate_entity_types()

    print("\n🎉 Demonstration completed!")
    print("For more information, see the README.md file or visit the model page.")
# Run the demonstrations only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()