#!/usr/bin/env python3
"""
Indonesian NER BERT - Batch Processing Example
==============================================

This script demonstrates how to process multiple Indonesian texts in batch
for efficient named entity recognition.

Usage:
    python batch_processing.py --input texts.txt --output results.json
    python batch_processing.py --demo  # Run demonstration
"""

import argparse
import json
import time
from pathlib import Path
from typing import List, Dict, Any

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch


class IndonesianNERProcessor:
    """Batch processor for Indonesian NER."""

    def __init__(self, model_path="asmud/cahya-indonesian-ner-tuned", batch_size=8):
        """Initialize the NER processor.

        Args:
            model_path: Path to the model directory or Hugging Face model ID
            batch_size: Number of texts to process in each batch
        """
        self.batch_size = batch_size
        self.model_path = model_path
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self._load_model()

    def _load_model(self):
        """Load the model and create the pipeline."""
        print(f"šŸ”„ Loading Indonesian NER model from {self.model_path}...")

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            self.model = AutoModelForTokenClassification.from_pretrained(self.model_path)

            # Create pipeline with settings suited to batch processing
            self.pipeline = pipeline(
                "ner",
                model=self.model,
                tokenizer=self.tokenizer,
                aggregation_strategy="simple",
                device=0 if torch.cuda.is_available() else -1,
                batch_size=self.batch_size
            )

            print("āœ… Model loaded successfully!")
            print(f"šŸ“Š Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
            print(f"šŸ“¦ Batch size: {self.batch_size}")

        except Exception as e:
            print(f"āŒ Error loading model: {e}")
            raise

    def process_texts(self, texts: List[str]) -> List[Dict[str, Any]]:
        """Process a list of texts and return NER results.

        Args:
            texts: List of Indonesian texts to process

        Returns:
            List of dictionaries containing NER results for each text
        """
        # Guard against empty input to avoid a division by zero below
        if not texts:
            return []

        print(f"šŸš€ Processing {len(texts)} texts...")
        start_time = time.time()

        results = []

        # Process in batches
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]
            batch_start = time.time()

            print(f"šŸ“¦ Processing batch {i // self.batch_size + 1}/{(len(texts) - 1) // self.batch_size + 1} ({len(batch)} texts)")

            # Get NER results for the batch
            batch_results = self.pipeline(batch)
            batch_time = time.time() - batch_start

            # Process results
            for j, (text, ner_result) in enumerate(zip(batch, batch_results)):
                result = {
                    'text_id': i + j,
                    'text': text,
                    'entities': [],
                    'entity_count': len(ner_result) if ner_result else 0,
                    # Inference latency of the batch this text belongs to
                    'processing_time': round(batch_time, 4)
                }

                # Add entity information
                if ner_result:
                    for entity in ner_result:
                        result['entities'].append({
                            'text': entity['word'],
                            'label': entity['entity_group'],
                            # Cast numpy scalars to built-in types so the
                            # results stay JSON-serializable
                            'confidence': round(float(entity['score']), 4),
                            'start': int(entity['start']),
                            'end': int(entity['end'])
                        })

                results.append(result)

            print(f"   ā±ļø Batch completed in {batch_time:.2f}s ({batch_time / len(batch):.3f}s per text)")

        total_time = time.time() - start_time
        print(f"āœ… Processing completed in {total_time:.2f}s")
        print(f"šŸ“ˆ Average: {total_time / len(texts):.3f}s per text")

        return results
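
    # --- Illustrative helper (not part of the original example) ----------
    # A minimal sketch for callers that only have one text; it delegates to
    # process_texts() so the result keeps the same schema. The method name
    # process_single is hypothetical, not an established API of this script.
    def process_single(self, text: str) -> Dict[str, Any]:
        """Run NER on a single text (thin wrapper around process_texts)."""
        return self.process_texts([text])[0]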
    def process_file(self, input_file: str, output_file: str = None):
        """Process texts from a file and save the results.

        Args:
            input_file: Path to input text file (one text per line)
            output_file: Path to output JSON file (optional)
        """
        input_path = Path(input_file)
        if not input_path.exists():
            raise FileNotFoundError(f"Input file not found: {input_file}")

        # Read texts from file
        print(f"šŸ“– Reading texts from {input_file}...")
        with open(input_path, 'r', encoding='utf-8') as f:
            texts = [line.strip() for line in f if line.strip()]

        print(f"šŸ“ Found {len(texts)} texts to process")

        # Process texts
        results = self.process_texts(texts)

        # Generate summary statistics
        total_entities = sum(r['entity_count'] for r in results)
        entity_types = {}
        for result in results:
            for entity in result['entities']:
                label = entity['label']
                entity_types[label] = entity_types.get(label, 0) + 1

        summary = {
            'processing_summary': {
                'total_texts': len(texts),
                'total_entities': total_entities,
                'average_entities_per_text': round(total_entities / len(texts), 2) if texts else 0,
                'entity_types_found': len(entity_types),
                'entity_distribution': entity_types
            },
            'results': results
        }

        # Save results
        if output_file:
            output_path = Path(output_file)
            print(f"šŸ’¾ Saving results to {output_file}...")
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(summary, f, indent=2, ensure_ascii=False)
            print("āœ… Results saved successfully!")

        return summary


def run_demonstration():
    """Run a demonstration of batch processing."""
    print("šŸŽÆ BATCH PROCESSING DEMONSTRATION")
    print("=" * 50)

    # Sample Indonesian texts
    demo_texts = [
        "Presiden Joko Widodo menghadiri KTT G20 di Bali pada November 2022.",
        "Bank Indonesia menaikkan suku bunga acuan menjadi 5.75 persen.",
        "Kementerian Kesehatan meluncurkan program vaksinasi COVID-19 tahap ketiga.",
        "PT Pertamina bekerja sama dengan Shell mengembangkan energi terbarukan.",
        "Gubernur DKI Jakarta meresmikan MRT fase 2 dari Bundaran HI ke Kota.",
        "Mahkamah Konstitusi memutuskan UU Cipta Kerja tidak melanggar konstitusi.",
        "Tim nasional Indonesia meraih medali emas di SEA Games 2023 di Kamboja.",
        "Bursa Efek Indonesia mencatat rekor transaksi harian 15 triliun rupiah.",
        "Menteri Pendidikan meluncurkan kurikulum merdeka untuk seluruh sekolah.",
        "PLN mengalokasikan investasi 100 miliar dollar untuk infrastruktur listrik."
    ]

    # Initialize processor
    processor = IndonesianNERProcessor(batch_size=4)

    # Process texts
    results = processor.process_texts(demo_texts)

    # Display results
    print("\nšŸ“Š PROCESSING RESULTS")
    print("=" * 50)

    total_entities = 0
    entity_types = {}

    for i, result in enumerate(results):
        print(f"\nšŸ“ Text {i + 1}: {result['text'][:60]}...")
        print(f"   Entities found: {result['entity_count']}")

        if result['entities']:
            for entity in result['entities']:
                print(f"   • {entity['label']:>6}: {entity['text']:<20} ({entity['confidence']:.3f})")

                # Count entity types
                label = entity['label']
                entity_types[label] = entity_types.get(label, 0) + 1

        total_entities += result['entity_count']

    # Summary statistics
    print("\nšŸ“ˆ SUMMARY STATISTICS")
    print("=" * 50)
    print(f"Total texts processed: {len(results)}")
    print(f"Total entities found: {total_entities}")
    print(f"Average entities per text: {total_entities / len(results):.1f}")

    print("\nEntity type distribution:")
    for entity_type, count in sorted(entity_types.items()):
        percentage = (count / total_entities) * 100
        print(f"   {entity_type:>6}: {count:>3} ({percentage:>5.1f}%)")
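
# For reference, a sketch of the JSON document that process_file() writes.
# The shape follows directly from the code above; the concrete values and
# entity labels (e.g. "PER", "ORG") are placeholders, not real model output:
#
# {
#   "processing_summary": {
#     "total_texts": 10,
#     "total_entities": 25,
#     "average_entities_per_text": 2.5,
#     "entity_types_found": 3,
#     "entity_distribution": {"PER": 8, "ORG": 12, "LOC": 5}
#   },
#   "results": [
#     {
#       "text_id": 0,
#       "text": "Presiden Joko Widodo ...",
#       "entities": [
#         {"text": "Joko Widodo", "label": "PER", "confidence": 0.9987,
#          "start": 9, "end": 20}
#       ],
#       "entity_count": 1,
#       "processing_time": 0.42
#     }
#   ]
# }
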
def main():
    """Main function with command-line interface."""
    parser = argparse.ArgumentParser(
        description="Batch process Indonesian texts for Named Entity Recognition",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python batch_processing.py --demo
  python batch_processing.py --input texts.txt --output results.json
  python batch_processing.py --input news_articles.txt --batch-size 16
        """
    )

    parser.add_argument(
        '--input', '-i',
        type=str,
        help='Input text file (one text per line)'
    )

    parser.add_argument(
        '--output', '-o',
        type=str,
        help='Output JSON file for results'
    )

    parser.add_argument(
        '--batch-size', '-b',
        type=int,
        default=8,
        help='Batch size for processing (default: 8)'
    )

    parser.add_argument(
        '--model-path', '-m',
        type=str,
        default='asmud/cahya-indonesian-ner-tuned',
        help='Path to the model directory (default: asmud/cahya-indonesian-ner-tuned)'
    )

    parser.add_argument(
        '--demo',
        action='store_true',
        help='Run demonstration with sample texts'
    )

    args = parser.parse_args()

    if args.demo:
        run_demonstration()
    elif args.input:
        # Process file
        processor = IndonesianNERProcessor(
            model_path=args.model_path,
            batch_size=args.batch_size
        )
        output_file = args.output or f"{Path(args.input).stem}_ner_results.json"
        summary = processor.process_file(args.input, output_file)

        # Print summary
        print("\nšŸ“Š Processing Summary:")
        print(f"   Texts processed: {summary['processing_summary']['total_texts']}")
        print(f"   Entities found: {summary['processing_summary']['total_entities']}")
        print(f"   Average entities per text: {summary['processing_summary']['average_entities_per_text']}")
        print(f"   Entity types: {summary['processing_summary']['entity_types_found']}")
    else:
        parser.print_help()


if __name__ == "__main__":
    main()