"""
AI Service for Legal Dashboard
=============================

Advanced AI-powered features for legal document analysis including:
- Intelligent document scoring and classification
- Legal entity extraction and recognition
- Sentiment analysis for legal documents
- Smart search and recommendation engine
- Document similarity analysis
"""

import re
import json
import logging
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime, timedelta
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import hashlib
import sqlite3
from pathlib import Path

logger = logging.getLogger(__name__)


class AIScoringEngine:
    """
    Advanced AI scoring engine for legal documents
    Provides intelligent analysis, classification, and recommendations
    """

    def __init__(self):
        """Initialize the AI scoring engine"""
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words=None,  # Keep Persian stop words for legal context
            ngram_range=(1, 3)
        )
        self.document_vectors = {}
        self.legal_keywords = self._load_legal_keywords()
        self.entity_patterns = self._load_entity_patterns()
        self.sentiment_indicators = self._load_sentiment_indicators()
        self.classification_categories = self._load_classification_categories()

    def _load_legal_keywords(self) -> Dict[str, List[str]]:
        """Load Persian legal keywords for different categories"""
        return {
            "قانون": [
                "قانون", "ماده", "تبصره", "بند", "فصل", "باب", "مصوبه", "تصویب",
                "مجلس", "شورای", "ملی", "اساسی", "مدنی", "جزایی", "تجاری"
            ],
            "قرارداد": [
                "قرارداد", "عقد", "مفاد", "طرفین", "متعاهدین", "شرایط", "ماده",
                "بند", "مبلغ", "پرداخت", "تعهد", "مسئولیت", "ضمانت"
            ],
            "احکام": [
                "حکم", "رای", "دادگاه", "قاضی", "شعبه", "دعوی", "خواهان",
                "خوانده", "شهادت", "دلیل", "اثبات", "قانونی", "محکوم"
            ],
            "مالی": [
                "مالیات", "درآمد", "سود", "زیان", "دارایی", "بدهی", "حساب",
                "ترازنامه", "صورت", "مالی", "دریافتی", "پرداختی"
            ],
            "اداری": [
                "اداره", "سازمان", "وزارت", "دولت", "مقام", "مسئول", "کارمند",
                "مقررات", "دستورالعمل", "بخشنامه", "آیین‌نامه"
            ]
        }

    def _load_entity_patterns(self) -> Dict[str, str]:
        """Load regex patterns for legal entity extraction"""
        return {
            "نام_شخص": r"([آ-ی]{2,}\s+){2,}",
            "نام_شرکت": r"(شرکت|موسسه|سازمان|بنیاد)\s+([آ-ی\s]+)",
            "شماره_قرارداد": r"شماره\s*:?\s*(\d+/\d+/\d+)",
            "تاریخ": r"(\d{1,2}/\d{1,2}/\d{2,4})",
            "مبلغ": r"(\d{1,3}(?:,\d{3})*)\s*(ریال|تومان|دلار|یورو)",
            "شماره_ملی": r"(\d{10})",
            "کد_پستی": r"(\d{10})",
            "شماره_تلفن": r"(\d{2,4}-\d{3,4}-\d{4})"
        }

    def _load_sentiment_indicators(self) -> Dict[str, List[str]]:
        """Load Persian sentiment indicators for legal documents"""
        return {
            "positive": [
                "موافق", "تایید", "قبول", "اجازه", "مجوز", "تصویب", "قانونی",
                "مشروع", "صحیح", "درست", "مناسب", "مطلوب", "سودمند"
            ],
            "negative": [
                "مخالف", "رد", "عدم", "ممنوع", "غیرقانونی", "نامشروع",
                "نادرست", "نامناسب", "مضر", "خطرناک", "ممنوع"
            ],
            "neutral": [
                "ماده", "بند", "تبصره", "قانون", "مقررات", "شرایط",
                "مفاد", "طرفین", "تاریخ", "مبلغ", "شماره"
            ]
        }

    def _load_classification_categories(self) -> Dict[str, Dict]:
        """Load document classification categories with weights"""
        return {
            "قرارداد": {
                "keywords": ["قرارداد", "عقد", "طرفین", "مفاد"],
                "weight": 0.4,
                "patterns": ["طرفین", "متعاهدین", "شرایط"]
            },
            "احکام_قضایی": {
                "keywords": ["حکم", "رای", "دادگاه", "قاضی"],
                "weight": 0.35,
                "patterns": ["شعبه", "خواهان", "خوانده"]
            },
            "قوانین": {
                "keywords": ["قانون", "ماده", "تبصره", "مجلس"],
                "weight": 0.3,
                "patterns": ["مصوبه", "تصویب", "اساسی"]
            },
            "مقررات_اداری": {
                "keywords": ["مقررات", "دستورالعمل", "آیین‌نامه"],
                "weight": 0.25,
                "patterns": ["اداره", "سازمان", "وزارت"]
            },
            "اسناد_مالی": {
                "keywords": ["مالی", "حساب", "ترازنامه", "صورت"],
                "weight": 0.2,
                "patterns": ["درآمد", "سود", "زیان"]
            }
        }

    def analyze_document(self, text: str, metadata: Dict = None) -> Dict[str, Any]:
        """
        Comprehensive document analysis including scoring, classification, and insights

        Args:
            text: Document text content
            metadata: Additional document metadata

        Returns:
            Dictionary containing analysis results
        """
        try:
            # Basic text preprocessing
            cleaned_text = self._preprocess_text(text)

            # Perform various analyses
            analysis = {
                "basic_metrics": self._calculate_basic_metrics(cleaned_text),
                "classification": self._classify_document(cleaned_text),
                "entities": self._extract_entities(cleaned_text),
                "sentiment": self._analyze_sentiment(cleaned_text),
                "keywords": self._extract_keywords(cleaned_text),
                "quality_score": self._calculate_quality_score(cleaned_text, metadata),
                "recommendations": self._generate_recommendations(cleaned_text, metadata),
                "timestamp": datetime.now().isoformat()
            }

            # Add similarity analysis if we have existing documents
            if self.document_vectors:
                analysis["similarity"] = self._find_similar_documents(
                    cleaned_text)

            return analysis

        except Exception as e:
            logger.error(f"Error in document analysis: {e}")
            return {
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }

    def _preprocess_text(self, text: str) -> str:
        """Clean and normalize Persian text"""
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text.strip())

        # Normalize Persian characters
        text = text.replace('ي', 'ی').replace('ك', 'ک')

        # Remove common noise characters
        text = re.sub(
            r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s\d\-\.\/]', '', text)

        return text

    def _calculate_basic_metrics(self, text: str) -> Dict[str, Any]:
        """Calculate basic document metrics"""
        words = text.split()
        sentences = re.split(r'[.!?؟]', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        return {
            "word_count": len(words),
            "sentence_count": len(sentences),
            "avg_sentence_length": len(words) / len(sentences) if sentences else 0,
            "unique_words": len(set(words)),
            "vocabulary_diversity": len(set(words)) / len(words) if words else 0,
            "legal_terms_count": self._count_legal_terms(text)
        }

    def _count_legal_terms(self, text: str) -> int:
        """Count legal terms in the document"""
        count = 0
        for category_terms in self.legal_keywords.values():
            for term in category_terms:
                count += text.count(term)
        return count

    def _classify_document(self, text: str) -> Dict[str, float]:
        """Classify document into legal categories"""
        scores = {}

        for category, config in self.classification_categories.items():
            score = 0
            weight = config["weight"]

            # Keyword matching
            for keyword in config["keywords"]:
                if keyword in text:
                    score += weight

            # Pattern matching
            for pattern in config["patterns"]:
                if pattern in text:
                    score += weight * 0.5

            scores[category] = min(score, 1.0)

        # Normalize scores
        total_score = sum(scores.values())
        if total_score > 0:
            scores = {k: v/total_score for k, v in scores.items()}

        return scores

    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract legal entities from text"""
        entities = {}

        for entity_type, pattern in self.entity_patterns.items():
            matches = re.findall(pattern, text)
            if matches:
                entities[entity_type] = list(set(matches))

        return entities

    def _analyze_sentiment(self, text: str) -> Dict[str, float]:
        """Analyze sentiment of legal document"""
        sentiment_scores = {"positive": 0, "negative": 0, "neutral": 0}
        total_words = len(text.split())

        if total_words == 0:
            return sentiment_scores

        for sentiment, indicators in self.sentiment_indicators.items():
            count = 0
            for indicator in indicators:
                count += text.count(indicator)
            sentiment_scores[sentiment] = count / total_words

        # Normalize scores
        total = sum(sentiment_scores.values())
        if total > 0:
            sentiment_scores = {k: v/total for k,
                                v in sentiment_scores.items()}

        return sentiment_scores

    def _extract_keywords(self, text: str) -> List[Tuple[str, float]]:
        """Extract important keywords with TF-IDF scores"""
        try:
            # Create document-term matrix
            tfidf_matrix = self.vectorizer.fit_transform([text])
            feature_names = self.vectorizer.get_feature_names_out()

            # Get TF-IDF scores
            scores = tfidf_matrix.toarray()[0]

            # Create keyword-score pairs
            keywords = [(feature_names[i], scores[i])
                        for i in range(len(feature_names))]

            # Sort by score and return top keywords
            keywords.sort(key=lambda x: x[1], reverse=True)
            return keywords[:20]  # Return top 20 keywords

        except Exception as e:
            logger.error(f"Error extracting keywords: {e}")
            return []

    def _calculate_quality_score(self, text: str, metadata: Dict = None) -> float:
        """Calculate overall document quality score"""
        score = 0.0

        # Text length factor (optimal length for legal documents)
        word_count = len(text.split())
        if 100 <= word_count <= 2000:
            score += 0.3
        elif word_count > 2000:
            score += 0.2
        else:
            score += 0.1

        # Legal terms density
        legal_terms = self._count_legal_terms(text)
        if legal_terms > 0:
            density = legal_terms / word_count
            if 0.01 <= density <= 0.1:
                score += 0.3
            elif density > 0.1:
                score += 0.2
            else:
                score += 0.1

        # Structure factor (presence of legal document structure)
        structure_indicators = ["ماده", "بند", "تبصره", "فصل", "باب"]
        structure_count = sum(text.count(indicator)
                              for indicator in structure_indicators)
        if structure_count > 0:
            score += 0.2

        # Completeness factor
        completeness_indicators = ["تاریخ", "شماره", "امضا", "مهر"]
        completeness_count = sum(text.count(indicator)
                                 for indicator in completeness_indicators)
        if completeness_count >= 2:
            score += 0.2

        return min(score, 1.0)

    def _generate_recommendations(self, text: str, metadata: Dict = None) -> List[str]:
        """Generate intelligent recommendations for the document"""
        recommendations = []

        # Check document completeness
        if len(text.split()) < 100:
            recommendations.append(
                "مستندات کافی نیست. پیشنهاد می‌شود جزئیات بیشتری اضافه شود.")

        # Check for legal structure
        if "ماده" not in text and "بند" not in text:
            recommendations.append(
                "ساختار حقوقی مشخص نیست. پیشنهاد می‌شود از ساختار ماده و بند استفاده شود.")

        # Check for dates and numbers
        if not re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', text):
            recommendations.append(
                "تاریخ مشخص نشده است. پیشنهاد می‌شود تاریخ مستندات اضافه شود.")

        # Check for signatures
        if "امضا" not in text and "مهر" not in text:
            recommendations.append(
                "امضا یا مهر مشخص نشده است. پیشنهاد می‌شود امضا اضافه شود.")

        # Check for amounts
        if not re.search(r'\d{1,3}(?:,\d{3})*', text):
            recommendations.append(
                "مبالغ مشخص نشده است. پیشنهاد می‌شود مبالغ دقیق ذکر شود.")

        return recommendations

    def _find_similar_documents(self, text: str) -> List[Dict[str, Any]]:
        """Find similar documents using TF-IDF and cosine similarity"""
        try:
            # Vectorize current document
            current_vector = self.vectorizer.transform([text])

            similarities = []
            for doc_id, doc_vector in self.document_vectors.items():
                similarity = cosine_similarity(
                    current_vector, doc_vector)[0][0]
                similarities.append({
                    "document_id": doc_id,
                    "similarity_score": float(similarity),
                    "category": "similar_document"
                })

            # Sort by similarity and return top matches
            similarities.sort(
                key=lambda x: x["similarity_score"], reverse=True)
            return similarities[:5]  # Return top 5 similar documents

        except Exception as e:
            logger.error(f"Error finding similar documents: {e}")
            return []

    def update_document_vector(self, doc_id: str, text: str):
        """Update document vector for similarity analysis"""
        try:
            vector = self.vectorizer.transform([text])
            self.document_vectors[doc_id] = vector
        except Exception as e:
            logger.error(f"Error updating document vector: {e}")

    def get_ai_insights(self, documents: List[Dict]) -> Dict[str, Any]:
        """Generate AI insights from multiple documents"""
        try:
            insights = {
                "document_trends": self._analyze_trends(documents),
                "common_entities": self._find_common_entities(documents),
                "category_distribution": self._analyze_category_distribution(documents),
                "quality_metrics": self._calculate_overall_quality(documents),
                "recommendations": self._generate_system_recommendations(documents)
            }
            return insights
        except Exception as e:
            logger.error(f"Error generating AI insights: {e}")
            return {"error": str(e)}

    def _analyze_trends(self, documents: List[Dict]) -> Dict[str, Any]:
        """Analyze trends across documents"""
        # Implementation for trend analysis
        return {"trend_analysis": "Not implemented yet"}

    def _find_common_entities(self, documents: List[Dict]) -> Dict[str, List[str]]:
        """Find common entities across documents"""
        # Implementation for common entity analysis
        return {"common_entities": "Not implemented yet"}

    def _analyze_category_distribution(self, documents: List[Dict]) -> Dict[str, int]:
        """Analyze distribution of document categories"""
        # Implementation for category distribution
        return {"category_distribution": "Not implemented yet"}

    def _calculate_overall_quality(self, documents: List[Dict]) -> Dict[str, float]:
        """Calculate overall quality metrics"""
        # Implementation for overall quality calculation
        return {"overall_quality": "Not implemented yet"}

    def _generate_system_recommendations(self, documents: List[Dict]) -> List[str]:
        """Generate system-wide recommendations"""
        # Implementation for system recommendations
        return ["سیستم در حال بهبود است"]