"""
Indonesian NER BERT - Inference Example
========================================

This script demonstrates how to use the Indonesian NER BERT model
for named entity recognition on Indonesian text.

Usage:
    python inference_example.py
"""
|
|
|
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline |
|
import torch |
|
|
|
def load_model(model_name_or_path="asmud/cahya-indonesian-ner-tuned"):
    """Load the Indonesian NER BERT model and its tokenizer.

    Args:
        model_name_or_path: Model identifier or path, handed unchanged to
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForTokenClassification.from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple on success, or ``(None, None)`` when
        loading raised any exception. The error is printed, not re-raised.
    """
    # NOTE(review): the leading glyph of this message is mis-encoded in the
    # source and the intended emoji cannot be recovered; kept verbatim.
    print("๐ Loading Indonesian NER BERT model...")

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
    except Exception as e:
        # Deliberately broad: this is a demo entry point, so any loading
        # failure is reported and signalled with the (None, None) sentinel
        # instead of aborting the script.
        print(f"❌ Error loading model: {e}")
        return None, None
    else:
        print("✅ Model loaded successfully!")
        return tokenizer, model
|
|
|
def create_ner_pipeline(model, tokenizer):
    """Build a ``transformers`` "ner" pipeline around an already-loaded model.

    Args:
        model: Token-classification model to run.
        tokenizer: Tokenizer that matches ``model``.

    Returns:
        The object returned by ``pipeline("ner", ...)``, configured with
        ``aggregation_strategy="simple"``.
    """
    # Device index 0 is used when CUDA is available, -1 otherwise.
    if torch.cuda.is_available():
        target_device = 0
    else:
        target_device = -1

    pipeline_options = {
        "model": model,
        "tokenizer": tokenizer,
        "aggregation_strategy": "simple",
        "device": target_device,
    }
    return pipeline("ner", **pipeline_options)
|
|
|
def demonstrate_basic_usage():
    """Run the NER pipeline over sample sentences and print what it finds.

    Loads the model with ``load_model`` and returns early if loading failed.
    For each sample sentence it prints the text followed by every entity's
    group, surface form and confidence score, or a notice when the pipeline
    returned nothing.

    Returns:
        None.
    """
    print("\n🎯 BASIC USAGE DEMONSTRATION")
    print("=" * 50)

    tokenizer, model = load_model()
    # load_model signals failure with (None, None); test for the sentinel
    # explicitly rather than relying on the truthiness of the objects.
    if model is None or tokenizer is None:
        return

    ner_pipeline = create_ner_pipeline(model, tokenizer)

    example_texts = [
        "Presiden Joko Widodo menghadiri rapat di Gedung DPR pada 15 Januari 2024.",
        "Bank Indonesia menetapkan suku bunga 5.75 persen untuk mendorong investasi.",
        "Kementerian Kesehatan mengalokasikan dana sebesar 10 miliar rupiah untuk program vaksinasi.",
        "Gubernur Jawa Barat meresmikan Bandara Internasional Kertajati di Majalengka.",
        "Mahkamah Konstitusi memutuskan UU No. 12 Tahun 2023 tentang Pemilu tidak bertentangan dengan konstitusi.",
    ]

    for i, text in enumerate(example_texts, 1):
        # NOTE(review): the leading glyph of this header is mis-encoded in
        # the source and the intended emoji cannot be recovered; kept verbatim.
        print(f"\n๐ Example {i}:")
        print(f"Text: {text}")
        print("Entities found:")

        results = ner_pipeline(text)

        if results:
            for entity in results:
                print(f" 🏷️ {entity['entity_group']:>6}: {entity['word']:<20} (confidence: {entity['score']:.3f})")
        else:
            print(" No entities found.")

        # Visual separator between examples.
        print("-" * 80)
|
|
|
def demonstrate_custom_inference():
    """Show token-level NER predictions made by calling the model directly.

    Loads the model with ``load_model`` and returns early if loading failed.
    Tokenizes one sample sentence, runs the model, and prints a table with
    one row per token: the token (word-piece ``##`` marker removed), its
    predicted label and the probability of that label.

    Returns:
        None.
    """
    print("\n🔧 CUSTOM INFERENCE DEMONSTRATION")
    print("=" * 50)

    tokenizer, model = load_model()
    # load_model signals failure with (None, None); test for the sentinel
    # explicitly rather than relying on the truthiness of the objects.
    if model is None or tokenizer is None:
        return

    # Marker tokens that are filtered out of the reported predictions.
    special_tokens = {'[CLS]', '[SEP]', '[PAD]'}

    def predict_tokens(text):
        """Return ``(token, label, confidence)`` for each non-special token."""
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            outputs = model(**inputs)
            # A single text is passed in, so index 0 selects its per-token
            # class probabilities.
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
            # One reduction yields both the winning probability and its
            # class id for every token.
            confidences, label_ids = probabilities.max(dim=-1)

        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        id2label = model.config.id2label

        return [
            (token, id2label[label_id], confidence)
            for token, label_id, confidence in zip(
                tokens, label_ids.tolist(), confidences.tolist()
            )
            if token not in special_tokens
        ]

    text = "Menteri Retno Marsudi bertemu dengan delegasi ASEAN di Hotel Indonesia pada pukul 14.30 WIB."
    print(f"Text: {text}")
    print("\nToken-level predictions:")
    print(f"{'Token':<15} {'Label':<8} {'Confidence':<10}")
    print("-" * 35)

    results = predict_tokens(text)
    for token, label, conf in results:
        # Drop the word-piece continuation marker for display.
        display_token = token.replace('##', '')
        print(f"{display_token:<15} {label:<8} {conf:<10.3f}")
|
|
|
def demonstrate_entity_types():
    """Run the NER pipeline on one sample sentence per entity category.

    Loads the model with ``load_model`` and returns early if loading failed.
    For every category it prints the category name, the sample text and each
    entity the pipeline returned (group, surface form, score), or a notice
    when nothing was returned.

    Returns:
        None.
    """
    print("\n🏷️ SUPPORTED ENTITY TYPES DEMONSTRATION")
    print("=" * 50)

    tokenizer, model = load_model()
    # load_model signals failure with (None, None); test for the sentinel
    # explicitly rather than relying on the truthiness of the objects.
    if model is None or tokenizer is None:
        return

    ner_pipeline = create_ner_pipeline(model, tokenizer)

    # Category heading -> sample sentence meant to contain that entity type.
    entity_examples = {
        "Person (PER)": "Menteri Budi Gunadi Sadikin memberikan keterangan pers.",
        "Organization (ORG)": "PT Telkom Indonesia meluncurkan layanan 5G terbaru.",
        "Location (LOC)": "Wisatawan mengunjungi Danau Toba dan Gunung Bromo.",
        "Geopolitical (GPE)": "Delegasi dari Jakarta bertemu dengan perwakilan Surabaya.",
        "Date (DAT)": "Acara dilaksanakan pada 17 Agustus 2024.",
        "Time (TIM)": "Rapat dimulai pukul 09.00 WIB.",
        "Money (MON)": "Anggaran sebesar 50 miliar rupiah telah disetujui.",
        "Percentage (PCT)": "Inflasi naik 3.2 persen bulan ini.",
        "Quantity (QTY)": "Bantuan berupa 500 ton beras disalurkan.",
        "Facility (FAC)": "Peresmian Bandara Soekarno-Hatta Terminal 4.",
        "Law (LAW)": "UU No. 23 Tahun 2014 tentang Pemerintahan Daerah.",
        "Event (EVT)": "Konferensi Asia-Pasifik 2024 akan digelar bulan depan.",
    }

    for category, text in entity_examples.items():
        # NOTE(review): the leading glyph of this header is mis-encoded in
        # the source and the intended emoji cannot be recovered; kept verbatim.
        print(f"\n๐ {category}:")
        print(f" Text: {text}")
        print(" Entities:")

        results = ner_pipeline(text)
        if results:
            for entity in results:
                print(f" • {entity['entity_group']}: {entity['word']} ({entity['score']:.3f})")
        else:
            print(" No entities detected")
|
|
|
def main():
    """Print the intro banner, run the three demonstrations, then sign off.

    The demonstrations run in a fixed order: basic pipeline usage, custom
    token-level inference, and the per-entity-type walkthrough.

    Returns:
        None.
    """
    print("🇮🇩 Indonesian NER BERT - Inference Examples")
    print("=" * 60)
    print("This script demonstrates various ways to use the Indonesian NER BERT model")
    print("for named entity recognition in Indonesian text.")

    demonstrate_basic_usage()
    demonstrate_custom_inference()
    demonstrate_entity_types()

    # NOTE(review): the leading glyph of this message is mis-encoded in the
    # source and the intended emoji cannot be recovered; kept verbatim.
    print("\n๐ Demonstration completed!")
    print("For more information, see the README.md file or visit the model page.")


# Run the demonstrations only when executed as a script, not on import.
if __name__ == "__main__":
    main()