#!/usr/bin/env python3
"""
Indonesian NER BERT - Inference Example
========================================

This script demonstrates how to use the Indonesian NER BERT model for
named entity recognition on Indonesian text.

Usage:
    python inference_example.py
"""

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline


def load_model(model_name_or_path="asmud/cahya-indonesian-ner-tuned"):
    """Load the Indonesian NER BERT model and tokenizer.

    Args:
        model_name_or_path: Model identifier or local path handed to
            ``from_pretrained`` for both the tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple, or ``(None, None)`` when loading
        fails (the error is printed rather than raised).
    """
    print("🔄 Loading Indonesian NER BERT model...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
    except Exception as e:
        # Deliberately broad: this is a demo script, so any loading problem
        # is reported to the user and signalled through the return value.
        print(f"❌ Error loading model: {e}")
        return None, None

    print("✅ Model loaded successfully!")
    return tokenizer, model


def create_ner_pipeline(model, tokenizer):
    """Create a NER pipeline for easy inference.

    Args:
        model: Token-classification model to run.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A ``"ner"`` pipeline using the ``"simple"`` aggregation strategy,
        placed on CUDA device 0 when CUDA is available and on CPU otherwise.
    """
    device = 0 if torch.cuda.is_available() else -1
    return pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
        device=device,
    )


def _resolve_components(tokenizer, model):
    """Return the given components, loading the default model if any is missing."""
    if tokenizer is None or model is None:
        return load_model()
    return tokenizer, model


def _predict_tokens(text, tokenizer, model):
    """Perform token-level NER prediction on a single text.

    Args:
        text: Input sentence.
        tokenizer: Tokenizer used to encode ``text``.
        model: Token-classification model producing per-token logits.

    Returns:
        A list of ``(token, label, confidence)`` tuples, one per token,
        with the tokenizer's CLS/SEP/PAD tokens removed.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # The model may already live on the GPU (creating a pipeline with a CUDA
    # device moves it there), so the inputs must follow it.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Only one sentence is encoded, so take the first row of the batch.
    probabilities = torch.softmax(logits, dim=-1)[0]
    # A single max() yields both the winning probability and its label id.
    confidences, label_ids = probabilities.max(dim=-1)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    id2label = model.config.id2label
    # Ask the tokenizer for its own structural tokens instead of hard-coding
    # the BERT spellings.
    skipped_tokens = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

    return [
        (token, id2label[label_id], confidence)
        for token, label_id, confidence in zip(
            tokens, label_ids.tolist(), confidences.tolist()
        )
        if token not in skipped_tokens
    ]


def demonstrate_basic_usage(*, tokenizer=None, model=None):
    """Demonstrate basic NER inference.

    Args:
        tokenizer: Optional pre-loaded tokenizer.
        model: Optional pre-loaded model. When either argument is omitted,
            the default model is loaded with ``load_model()``.
    """
    print("\n🎯 BASIC USAGE DEMONSTRATION")
    print("=" * 50)

    # Load model
    tokenizer, model = _resolve_components(tokenizer, model)
    if tokenizer is None or model is None:
        return

    # Create pipeline
    ner_pipeline = create_ner_pipeline(model, tokenizer)

    # Example texts
    example_texts = [
        "Presiden Joko Widodo menghadiri rapat di Gedung DPR pada 15 Januari 2024.",
        "Bank Indonesia menetapkan suku bunga 5.75 persen untuk mendorong investasi.",
        "Kementerian Kesehatan mengalokasikan dana sebesar 10 miliar rupiah untuk program vaksinasi.",
        "Gubernur Jawa Barat meresmikan Bandara Internasional Kertajati di Majalengka.",
        "Mahkamah Konstitusi memutuskan UU No. 12 Tahun 2023 tentang Pemilu tidak bertentangan dengan konstitusi.",
    ]

    for i, text in enumerate(example_texts, 1):
        print(f"\n📝 Example {i}:")
        print(f"Text: {text}")
        print("Entities found:")

        # Get NER results
        results = ner_pipeline(text)

        if results:
            for entity in results:
                print(f" 🏷️ {entity['entity_group']:>6}: {entity['word']:<20} (confidence: {entity['score']:.3f})")
        else:
            print(" No entities found.")

        print("-" * 80)


def demonstrate_custom_inference(*, tokenizer=None, model=None):
    """Demonstrate custom token-level inference.

    Args:
        tokenizer: Optional pre-loaded tokenizer.
        model: Optional pre-loaded model. When either argument is omitted,
            the default model is loaded with ``load_model()``.
    """
    print("\n🔧 CUSTOM INFERENCE DEMONSTRATION")
    print("=" * 50)

    # Load model components
    tokenizer, model = _resolve_components(tokenizer, model)
    if tokenizer is None or model is None:
        return

    # Example text
    text = "Menteri Retno Marsudi bertemu dengan delegasi ASEAN di Hotel Indonesia pada pukul 14.30 WIB."

    print(f"Text: {text}")
    print("\nToken-level predictions:")
    print(f"{'Token':<15} {'Label':<8} {'Confidence':<10}")
    print("-" * 35)

    for token, label, conf in _predict_tokens(text, tokenizer, model):
        # Clean up subword tokens
        display_token = token.replace('##', '')
        print(f"{display_token:<15} {label:<8} {conf:<10.3f}")


def demonstrate_entity_types(*, tokenizer=None, model=None):
    """Demonstrate all supported entity types.

    Args:
        tokenizer: Optional pre-loaded tokenizer.
        model: Optional pre-loaded model. When either argument is omitted,
            the default model is loaded with ``load_model()``.
    """
    print("\n🏷️ SUPPORTED ENTITY TYPES DEMONSTRATION")
    print("=" * 50)

    # Load model
    tokenizer, model = _resolve_components(tokenizer, model)
    if tokenizer is None or model is None:
        return

    ner_pipeline = create_ner_pipeline(model, tokenizer)

    # Examples showcasing different entity types
    entity_examples = {
        "Person (PER)": "Menteri Budi Gunadi Sadikin memberikan keterangan pers.",
        "Organization (ORG)": "PT Telkom Indonesia meluncurkan layanan 5G terbaru.",
        "Location (LOC)": "Wisatawan mengunjungi Danau Toba dan Gunung Bromo.",
        "Geopolitical (GPE)": "Delegasi dari Jakarta bertemu dengan perwakilan Surabaya.",
        "Date (DAT)": "Acara dilaksanakan pada 17 Agustus 2024.",
        "Time (TIM)": "Rapat dimulai pukul 09.00 WIB.",
        "Money (MON)": "Anggaran sebesar 50 miliar rupiah telah disetujui.",
        "Percentage (PCT)": "Inflasi naik 3.2 persen bulan ini.",
        "Quantity (QTY)": "Bantuan berupa 500 ton beras disalurkan.",
        "Facility (FAC)": "Peresmian Bandara Soekarno-Hatta Terminal 4.",
        "Law (LAW)": "UU No. 23 Tahun 2014 tentang Pemerintahan Daerah.",
        "Event (EVT)": "Konferensi Asia-Pasifik 2024 akan digelar bulan depan.",
    }

    for category, text in entity_examples.items():
        print(f"\n📂 {category}:")
        print(f" Text: {text}")
        print(" Entities:")

        results = ner_pipeline(text)
        if results:
            for entity in results:
                print(f" • {entity['entity_group']}: {entity['word']} ({entity['score']:.3f})")
        else:
            print(" No entities detected")


def main():
    """Main demonstration function.

    Loads the model once and shares it between all demonstrations. When the
    model cannot be loaded, the error printed by ``load_model()`` is the last
    output and no demonstration is run.
    """
    print("🇮🇩 Indonesian NER BERT - Inference Examples")
    print("=" * 60)
    print("This script demonstrates various ways to use the Indonesian NER BERT model")
    print("for named entity recognition in Indonesian text.")

    # Load once; reloading the checkpoint for every demonstration is wasteful.
    tokenizer, model = load_model()
    if tokenizer is None or model is None:
        return

    # Run demonstrations
    demonstrate_basic_usage(tokenizer=tokenizer, model=model)
    demonstrate_custom_inference(tokenizer=tokenizer, model=model)
    demonstrate_entity_types(tokenizer=tokenizer, model=model)

    print("\n🎉 Demonstration completed!")
    print("For more information, see the README.md file or visit the model page.")


if __name__ == "__main__":
    main()