cahya-indonesian-ner-tuned / inference_example.py

Upload folder using huggingface_hub

a1347f3 verified about 2 months ago

6.65 kB

	#!/usr/bin/env python3
	"""
	Indonesian NER BERT - Inference Example
	========================================

	This script demonstrates how to use the Indonesian NER BERT model
	for named entity recognition on Indonesian text.

	Usage:
	python inference_example.py
	"""

	from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
	import torch

	def load_model(model_name_or_path="asmud/cahya-indonesian-ner-tuned"):
	    """Fetch the tokenizer and token-classification model used by the demos.

	    Args:
	        model_name_or_path: Value handed unchanged to both
	            ``from_pretrained`` calls.

	    Returns:
	        A ``(tokenizer, model)`` tuple on success, or ``(None, None)`` when
	        anything raised during loading (the error is printed, not re-raised).
	    """
	    print("🔄 Loading Indonesian NER BERT model...")

	    try:
	        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
	        model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
	        print("✅ Model loaded successfully!")
	    except Exception as load_error:
	        # Best-effort demo: report the failure and let the caller bail out.
	        print(f"❌ Error loading model: {load_error}")
	        loaded = (None, None)
	    else:
	        loaded = (tokenizer, model)

	    return loaded

	def create_ner_pipeline(model, tokenizer):
	    """Wrap *model* and *tokenizer* in a ``"ner"`` pipeline.

	    The pipeline is built with ``aggregation_strategy="simple"`` and is
	    given ``device=0`` when ``torch.cuda.is_available()`` reports a GPU,
	    ``device=-1`` otherwise.
	    """
	    if torch.cuda.is_available():
	        device_index = 0
	    else:
	        device_index = -1

	    pipeline_options = {
	        "model": model,
	        "tokenizer": tokenizer,
	        "aggregation_strategy": "simple",
	        "device": device_index,
	    }
	    return pipeline("ner", **pipeline_options)

	def demonstrate_basic_usage():
	    """Run the NER pipeline over a few sample sentences and print each hit.

	    Returns early, printing nothing further, when ``load_model`` could not
	    provide both a model and a tokenizer.
	    """
	    print("\n🎯 BASIC USAGE DEMONSTRATION")
	    print("=" * 50)

	    # Bail out if loading failed (load_model yields None placeholders then).
	    tokenizer, model = load_model()
	    if not (model and tokenizer):
	        return

	    recognizer = create_ner_pipeline(model, tokenizer)

	    sample_sentences = (
	        "Presiden Joko Widodo menghadiri rapat di Gedung DPR pada 15 Januari 2024.",
	        "Bank Indonesia menetapkan suku bunga 5.75 persen untuk mendorong investasi.",
	        "Kementerian Kesehatan mengalokasikan dana sebesar 10 miliar rupiah untuk program vaksinasi.",
	        "Gubernur Jawa Barat meresmikan Bandara Internasional Kertajati di Majalengka.",
	        "Mahkamah Konstitusi memutuskan UU No. 12 Tahun 2023 tentang Pemilu tidak bertentangan dengan konstitusi.",
	    )

	    for number, sentence in enumerate(sample_sentences, start=1):
	        print(f"\n📝 Example {number}:")
	        print(f"Text: {sentence}")
	        print("Entities found:")

	        found = recognizer(sentence)

	        if not found:
	            print(" No entities found.")
	        else:
	            for hit in found:
	                print(f" 🏷️ {hit['entity_group']:>6}: {hit['word']:<20} (confidence: {hit['score']:.3f})")

	        # Visual separator between examples.
	        print("-" * 80)

	def demonstrate_custom_inference():
	    """Demonstrate token-level inference without the pipeline helper.

	    Loads the tokenizer and model, runs one sample sentence through the
	    model, and prints a table of every non-special token with its
	    predicted label and softmax confidence. Returns early when
	    ``load_model`` could not provide both components.
	    """
	    print("\n🔧 CUSTOM INFERENCE DEMONSTRATION")
	    print("=" * 50)

	    # Load model components
	    tokenizer, model = load_model()
	    if not model or not tokenizer:
	        return

	    # Tokens excluded from the printed table.
	    special_tokens = ('[CLS]', '[SEP]', '[PAD]')
	    # Prefix marking a subword continuation piece in the token strings.
	    subword_prefix = '##'

	    def predict_tokens(text):
	        """Return ``(token, label, confidence)`` triples for *text*.

	        ``confidence`` is the softmax probability of the predicted label;
	        entries for ``[CLS]``, ``[SEP]`` and ``[PAD]`` are omitted.
	        """
	        # Tokenize
	        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

	        # Predict
	        with torch.no_grad():
	            outputs = model(**inputs)
	            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
	            # One reduction over the label axis yields both the winning
	            # probability and its class index for every token position.
	            top_probs, top_ids = probabilities[0].max(dim=-1)

	        # Convert to readable format
	        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
	        labels = [model.config.id2label[label_id] for label_id in top_ids.tolist()]
	        confidences = top_probs.tolist()

	        # Filter out special tokens
	        return [
	            (token, label, conf)
	            for token, label, conf in zip(tokens, labels, confidences)
	            if token not in special_tokens
	        ]

	    # Example text
	    text = "Menteri Retno Marsudi bertemu dengan delegasi ASEAN di Hotel Indonesia pada pukul 14.30 WIB."
	    print(f"Text: {text}")
	    print("\nToken-level predictions:")
	    print(f"{'Token':<15} {'Label':<8} {'Confidence':<10}")
	    print("-" * 35)

	    for token, label, conf in predict_tokens(text):
	        # Strip only the leading continuation marker; a '##' elsewhere in
	        # the token is real text and must be left alone.
	        if token.startswith(subword_prefix):
	            display_token = token[len(subword_prefix):]
	        else:
	            display_token = token
	        print(f"{display_token:<15} {label:<8} {conf:<10.3f}")

	def demonstrate_entity_types():
	    """Print pipeline output for one sample sentence per entity category.

	    Returns early when ``load_model`` could not provide both a model and a
	    tokenizer.
	    """
	    print("\n🏷️ SUPPORTED ENTITY TYPES DEMONSTRATION")
	    print("=" * 50)

	    # Bail out if loading failed (load_model yields None placeholders then).
	    tokenizer, model = load_model()
	    if not (model and tokenizer):
	        return

	    recognizer = create_ner_pipeline(model, tokenizer)

	    # (category heading, sample sentence) pairs, shown in this order.
	    labelled_samples = (
	        ("Person (PER)", "Menteri Budi Gunadi Sadikin memberikan keterangan pers."),
	        ("Organization (ORG)", "PT Telkom Indonesia meluncurkan layanan 5G terbaru."),
	        ("Location (LOC)", "Wisatawan mengunjungi Danau Toba dan Gunung Bromo."),
	        ("Geopolitical (GPE)", "Delegasi dari Jakarta bertemu dengan perwakilan Surabaya."),
	        ("Date (DAT)", "Acara dilaksanakan pada 17 Agustus 2024."),
	        ("Time (TIM)", "Rapat dimulai pukul 09.00 WIB."),
	        ("Money (MON)", "Anggaran sebesar 50 miliar rupiah telah disetujui."),
	        ("Percentage (PCT)", "Inflasi naik 3.2 persen bulan ini."),
	        ("Quantity (QTY)", "Bantuan berupa 500 ton beras disalurkan."),
	        ("Facility (FAC)", "Peresmian Bandara Soekarno-Hatta Terminal 4."),
	        ("Law (LAW)", "UU No. 23 Tahun 2014 tentang Pemerintahan Daerah."),
	        ("Event (EVT)", "Konferensi Asia-Pasifik 2024 akan digelar bulan depan."),
	    )

	    for category, sentence in labelled_samples:
	        print(f"\n📂 {category}:")
	        print(f" Text: {sentence}")
	        print(" Entities:")

	        detections = recognizer(sentence)
	        if not detections:
	            print(" No entities detected")
	            continue

	        for hit in detections:
	            print(f" • {hit['entity_group']}: {hit['word']} ({hit['score']:.3f})")

	def main():
	    """Print the banner, run the three demonstrations in order, then sign off."""
	    banner_lines = (
	        "🇮🇩 Indonesian NER BERT - Inference Examples",
	        "=" * 60,
	        "This script demonstrates various ways to use the Indonesian NER BERT model",
	        "for named entity recognition in Indonesian text.",
	    )
	    for banner_line in banner_lines:
	        print(banner_line)

	    # Each demonstration loads the model itself and prints its own section.
	    demonstrate_basic_usage()
	    demonstrate_custom_inference()
	    demonstrate_entity_types()

	    closing_lines = (
	        "\n🎉 Demonstration completed!",
	        "For more information, see the README.md file or visit the model page.",
	    )
	    for closing_line in closing_lines:
	        print(closing_line)


	# Run the demonstrations only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()