#!/usr/bin/env python3
"""
Indonesian NER BERT - Batch Processing Example
==============================================

This script demonstrates how to process multiple Indonesian texts in batch
for efficient named entity recognition.

Usage:
    python batch_processing.py --input texts.txt --output results.json
    python batch_processing.py --demo  # Run demonstration
"""

import argparse
import json
import time
from pathlib import Path
from typing import List, Dict, Any

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch


class IndonesianNERProcessor:
    """Batch processor for Indonesian NER."""

    def __init__(self, model_path="asmud/cahya-indonesian-ner-tuned", batch_size=8):
        """Initialize the NER processor.

        Args:
            model_path: Path to the model directory or Hugging Face model ID
            batch_size: Number of texts to process in each batch
        """
        self.batch_size = batch_size
        self.model_path = model_path
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self._load_model()

    def _load_model(self):
        """Load the model and create the pipeline."""
        print(f"šŸ”„ Loading Indonesian NER model from {self.model_path}...")

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            self.model = AutoModelForTokenClassification.from_pretrained(self.model_path)

            # Create pipeline with settings suited to batch processing
            self.pipeline = pipeline(
                "ner",
                model=self.model,
                tokenizer=self.tokenizer,
                aggregation_strategy="simple",
                device=0 if torch.cuda.is_available() else -1,
                batch_size=self.batch_size
            )

            print("āœ… Model loaded successfully!")
            print(f"šŸ“Š Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
            print(f"šŸ“¦ Batch size: {self.batch_size}")

        except Exception as e:
            print(f"āŒ Error loading model: {e}")
            raise

    def process_texts(self, texts: List[str]) -> List[Dict[str, Any]]:
        """Process a list of texts and return NER results.

        Args:
            texts: List of Indonesian texts to process

        Returns:
            List of dictionaries containing NER results for each text
        """
        # Guard against empty input to avoid a division by zero below
        if not texts:
            return []

        print(f"šŸš€ Processing {len(texts)} texts...")
        start_time = time.time()

        results = []

        # Process in batches
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]
            batch_start = time.time()

            print(f"šŸ“¦ Processing batch {i // self.batch_size + 1}/{(len(texts) - 1) // self.batch_size + 1} ({len(batch)} texts)")

            # Get NER results for the batch
            batch_results = self.pipeline(batch)
            batch_time = time.time() - batch_start

            # Process results
            for j, (text, ner_result) in enumerate(zip(batch, batch_results)):
                result = {
                    'text_id': i + j,
                    'text': text,
                    'entities': [],
                    'entity_count': len(ner_result) if ner_result else 0,
                    # Inference latency of the batch this text belongs to
                    'processing_time': round(batch_time, 4)
                }

                # Add entity information
                if ner_result:
                    for entity in ner_result:
                        result['entities'].append({
                            'text': entity['word'],
                            'label': entity['entity_group'],
                            # Cast numpy scalars to built-in types so the
                            # results stay JSON-serializable
                            'confidence': round(float(entity['score']), 4),
                            'start': int(entity['start']),
                            'end': int(entity['end'])
                        })

                results.append(result)

            print(f"   ā±ļø Batch completed in {batch_time:.2f}s ({batch_time / len(batch):.3f}s per text)")

        total_time = time.time() - start_time
        print(f"āœ… Processing completed in {total_time:.2f}s")
        print(f"šŸ“ˆ Average: {total_time / len(texts):.3f}s per text")

        return results
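
    # --- Illustrative helper (not part of the original example) ----------
    # A minimal sketch for callers that only have one text; it delegates to
    # process_texts() so the result keeps the same schema. The method name
    # process_single is hypothetical, not an established API of this script.
    def process_single(self, text: str) -> Dict[str, Any]:
        """Run NER on a single text (thin wrapper around process_texts)."""
        return self.process_texts([text])[0]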
    def process_file(self, input_file: str, output_file: str = None):
        """Process texts from a file and save the results.

        Args:
            input_file: Path to input text file (one text per line)
            output_file: Path to output JSON file (optional)
        """
        input_path = Path(input_file)
        if not input_path.exists():
            raise FileNotFoundError(f"Input file not found: {input_file}")

        # Read texts from file
        print(f"šŸ“– Reading texts from {input_file}...")
        with open(input_path, 'r', encoding='utf-8') as f:
            texts = [line.strip() for line in f if line.strip()]

        print(f"šŸ“ Found {len(texts)} texts to process")

        # Process texts
        results = self.process_texts(texts)

        # Generate summary statistics
        total_entities = sum(r['entity_count'] for r in results)
        entity_types = {}
        for result in results:
            for entity in result['entities']:
                label = entity['label']
                entity_types[label] = entity_types.get(label, 0) + 1

        summary = {
            'processing_summary': {
                'total_texts': len(texts),
                'total_entities': total_entities,
                'average_entities_per_text': round(total_entities / len(texts), 2) if texts else 0,
                'entity_types_found': len(entity_types),
                'entity_distribution': entity_types
            },
            'results': results
        }

        # Save results
        if output_file:
            output_path = Path(output_file)
            print(f"šŸ’¾ Saving results to {output_file}...")
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(summary, f, indent=2, ensure_ascii=False)
            print("āœ… Results saved successfully!")

        return summary


def run_demonstration():
    """Run a demonstration of batch processing."""
    print("šŸŽÆ BATCH PROCESSING DEMONSTRATION")
    print("=" * 50)

    # Sample Indonesian texts
    demo_texts = [
        "Presiden Joko Widodo menghadiri KTT G20 di Bali pada November 2022.",
        "Bank Indonesia menaikkan suku bunga acuan menjadi 5.75 persen.",
        "Kementerian Kesehatan meluncurkan program vaksinasi COVID-19 tahap ketiga.",
        "PT Pertamina bekerja sama dengan Shell mengembangkan energi terbarukan.",
        "Gubernur DKI Jakarta meresmikan MRT fase 2 dari Bundaran HI ke Kota.",
        "Mahkamah Konstitusi memutuskan UU Cipta Kerja tidak melanggar konstitusi.",
        "Tim nasional Indonesia meraih medali emas di SEA Games 2023 di Kamboja.",
        "Bursa Efek Indonesia mencatat rekor transaksi harian 15 triliun rupiah.",
        "Menteri Pendidikan meluncurkan kurikulum merdeka untuk seluruh sekolah.",
        "PLN mengalokasikan investasi 100 miliar dollar untuk infrastruktur listrik."
    ]

    # Initialize processor
    processor = IndonesianNERProcessor(batch_size=4)

    # Process texts
    results = processor.process_texts(demo_texts)

    # Display results
    print("\nšŸ“Š PROCESSING RESULTS")
    print("=" * 50)

    total_entities = 0
    entity_types = {}

    for i, result in enumerate(results):
        print(f"\nšŸ“ Text {i + 1}: {result['text'][:60]}...")
        print(f"   Entities found: {result['entity_count']}")

        if result['entities']:
            for entity in result['entities']:
                print(f"   • {entity['label']:>6}: {entity['text']:<20} ({entity['confidence']:.3f})")

                # Count entity types
                label = entity['label']
                entity_types[label] = entity_types.get(label, 0) + 1

        total_entities += result['entity_count']

    # Summary statistics
    print("\nšŸ“ˆ SUMMARY STATISTICS")
    print("=" * 50)
    print(f"Total texts processed: {len(results)}")
    print(f"Total entities found: {total_entities}")
    print(f"Average entities per text: {total_entities / len(results):.1f}")

    print("\nEntity type distribution:")
    for entity_type, count in sorted(entity_types.items()):
        percentage = (count / total_entities) * 100
        print(f"   {entity_type:>6}: {count:>3} ({percentage:>5.1f}%)")
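
# For reference, a sketch of the JSON document that process_file() writes.
# The shape follows directly from the code above; the concrete values and
# entity labels (e.g. "PER", "ORG") are placeholders, not real model output:
#
# {
#   "processing_summary": {
#     "total_texts": 10,
#     "total_entities": 25,
#     "average_entities_per_text": 2.5,
#     "entity_types_found": 3,
#     "entity_distribution": {"PER": 8, "ORG": 12, "LOC": 5}
#   },
#   "results": [
#     {
#       "text_id": 0,
#       "text": "Presiden Joko Widodo ...",
#       "entities": [
#         {"text": "Joko Widodo", "label": "PER", "confidence": 0.9987,
#          "start": 9, "end": 20}
#       ],
#       "entity_count": 1,
#       "processing_time": 0.42
#     }
#   ]
# }
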
def main():
    """Main function with command-line interface."""
    parser = argparse.ArgumentParser(
        description="Batch process Indonesian texts for Named Entity Recognition",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python batch_processing.py --demo
  python batch_processing.py --input texts.txt --output results.json
  python batch_processing.py --input news_articles.txt --batch-size 16
        """
    )

    parser.add_argument(
        '--input', '-i',
        type=str,
        help='Input text file (one text per line)'
    )

    parser.add_argument(
        '--output', '-o',
        type=str,
        help='Output JSON file for results'
    )

    parser.add_argument(
        '--batch-size', '-b',
        type=int,
        default=8,
        help='Batch size for processing (default: 8)'
    )

    parser.add_argument(
        '--model-path', '-m',
        type=str,
        default='asmud/cahya-indonesian-ner-tuned',
        help='Path to the model directory (default: asmud/cahya-indonesian-ner-tuned)'
    )

    parser.add_argument(
        '--demo',
        action='store_true',
        help='Run demonstration with sample texts'
    )

    args = parser.parse_args()

    if args.demo:
        run_demonstration()
    elif args.input:
        # Process file
        processor = IndonesianNERProcessor(
            model_path=args.model_path,
            batch_size=args.batch_size
        )
        output_file = args.output or f"{Path(args.input).stem}_ner_results.json"
        summary = processor.process_file(args.input, output_file)

        # Print summary
        print("\nšŸ“Š Processing Summary:")
        print(f"   Texts processed: {summary['processing_summary']['total_texts']}")
        print(f"   Entities found: {summary['processing_summary']['total_entities']}")
        print(f"   Average entities per text: {summary['processing_summary']['average_entities_per_text']}")
        print(f"   Entity types: {summary['processing_summary']['entity_types_found']}")
    else:
        parser.print_help()


if __name__ == "__main__":
    main()