import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from wordcloud import WordCloud
from textstat import flesch_reading_ease, flesch_kincaid_grade
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings

warnings.filterwarnings('ignore')

# Download required NLTK data on first run
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


class SkinDiseaseEDA:
    def __init__(self, filepath):
        self.filepath = filepath
        self.articles = []
        self.load_data()

    def load_data(self):
        """Parse the structured text file into a list of article dicts."""
        with open(self.filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        # Records are separated by a line of 60 dashes
        articles = content.split('------------------------------------------------------------')

        for article in articles:
            if not article.strip():
                continue

            lines = article.strip().split('\n')
            article_data = {
                'title': '',
                'journal': '',
                'authors': '',
                'abstract': '',
                'diagnosis': '',
                'treatment': ''
            }

            current_section = None
            for line in lines:
                line = line.strip()
                if not line:
                    continue

                if line.startswith('Journal:'):
                    current_section = 'journal'
                    article_data['journal'] = line.replace('Journal:', '').strip()
                elif line.startswith('Authors:'):
                    current_section = 'authors'
                    article_data['authors'] = line.replace('Authors:', '').strip()
                elif line.startswith('Abstract:'):
                    current_section = 'abstract'
                    article_data['abstract'] = line.replace('Abstract:', '').strip()
                elif line == 'Diagnosis':
                    current_section = 'diagnosis'
                elif line == 'Treatment Remedies':
                    current_section = 'treatment'
                elif current_section == 'abstract' and not line.startswith(('Journal:', 'Authors:', 'Diagnosis', 'Treatment')):
                    article_data['abstract'] += ' ' + line
                elif current_section == 'diagnosis' and not line.startswith(('Journal:', 'Authors:', 'Abstract:', 'Treatment')):
                    article_data['diagnosis'] += ' ' + line
                elif current_section == 'treatment' and not line.startswith(('Journal:', 'Authors:', 'Abstract:', 'Diagnosis')):
                    article_data['treatment'] += ' ' + line
                elif current_section is None and not line.startswith(('Journal:', 'Authors:', 'Abstract:', 'Diagnosis', 'Treatment')):
                    # The first unlabelled line of a record is its title
                    article_data['title'] = line

            # Strip whitespace accumulated while joining continuation lines
            for key in article_data:
                article_data[key] = article_data[key].strip()

            if article_data['title']:
                self.articles.append(article_data)
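    # Illustrative sketch of the record layout load_data() expects, inferred
    # from the parser above; the concrete field values are assumptions, not
    # taken from the actual input file:
    #
    #   Some Article Title
    #   Journal: Example Dermatology Journal
    #   Authors: A. Author, B. Author
    #   Abstract: Background text that may
    #   wrap across several lines.
    #   Diagnosis
    #   Free-text diagnosis lines (or "Not specified.").
    #   Treatment Remedies
    #   Free-text treatment lines.
    #   ------------------------------------------------------------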
    def basic_statistics(self):
        """Generate basic statistics about the corpus."""
        print("=== BASIC CORPUS STATISTICS ===")
        print(f"Total articles: {len(self.articles)}")

        # Text length statistics
        abstract_lengths = [len(article['abstract']) for article in self.articles if article['abstract']]
        title_lengths = [len(article['title']) for article in self.articles if article['title']]

        print(f"Articles with abstracts: {len(abstract_lengths)}")
        print(f"Average abstract length: {np.mean(abstract_lengths):.1f} characters")
        print(f"Average title length: {np.mean(title_lengths):.1f} characters")

        # Word counts
        abstract_words = [len(article['abstract'].split()) for article in self.articles if article['abstract']]
        print(f"Average abstract word count: {np.mean(abstract_words):.1f} words")

        # Diagnosis and treatment availability
        with_diagnosis = sum(
            1 for article in self.articles
            if article['diagnosis'] and article['diagnosis'] != 'Not specified.'
        )
        with_treatment = sum(1 for article in self.articles if article['treatment'])
        print(f"Articles with specific diagnosis: {with_diagnosis} ({with_diagnosis / len(self.articles) * 100:.1f}%)")
        print(f"Articles with treatment info: {with_treatment} ({with_treatment / len(self.articles) * 100:.1f}%)")

        return {
            'total_articles': len(self.articles),
            'abstract_lengths': abstract_lengths,
            'title_lengths': title_lengths,
            'abstract_words': abstract_words,
            'with_diagnosis': with_diagnosis,
            'with_treatment': with_treatment
        }

    def journal_analysis(self):
        """Analyze the distribution of articles across journals."""
        print("\n=== JOURNAL ANALYSIS ===")

        journals = [article['journal'] for article in self.articles if article['journal']]
        journal_counts = Counter(journals)

        print(f"Total unique journals: {len(journal_counts)}")
        print("Top 10 journals:")
        for journal, count in journal_counts.most_common(10):
            print(f"  {journal}: {count} articles")

        # Create visualization
        plt.figure(figsize=(12, 8))
        top_journals = dict(journal_counts.most_common(15))
        plt.barh(list(top_journals.keys()), list(top_journals.values()))
        plt.title('Top 15 Journals by Article Count')
        plt.xlabel('Number of Articles')
        plt.tight_layout()
        plt.show()

        return journal_counts

    def author_analysis(self):
        """Analyze author patterns."""
        print("\n=== AUTHOR ANALYSIS ===")

        all_authors = []
        for article in self.articles:
            if article['authors']:
                # Author lists are assumed to be comma-separated
                authors = [author.strip() for author in article['authors'].split(',')]
                all_authors.extend(authors)

        author_counts = Counter(all_authors)
        print(f"Total unique authors: {len(author_counts)}")
        print(f"Total author instances: {len(all_authors)}")
        print(f"Average authors per article: {len(all_authors) / len(self.articles):.1f}")

        print("Top 10 most prolific authors:")
        for author, count in author_counts.most_common(10):
            print(f"  {author}: {count} articles")

        # Collaboration size per article
        author_counts_per_article = [len(article['authors'].split(',')) for article in self.articles if article['authors']]
        print(f"Average collaboration size: {np.mean(author_counts_per_article):.1f} authors per article")

        return author_counts

    def disease_analysis(self):
        """Analyze disease mentions and patterns."""
        print("\n=== DISEASE AND CONDITION ANALYSIS ===")

        # Common disease terms
        disease_terms = [
            'cancer', 'carcinoma', 'melanoma', 'psoriasis', 'dermatitis', 'eczema',
            'acne', 'rosacea', 'vitiligo', 'lupus', 'scleroderma', 'pemphigus',
            'bullous', 'urticaria', 'mastocytosis', 'lymphoma', 'sarcoma',
            'basal cell', 'squamous cell', 'keratosis', 'mycosis', 'fungal',
            'bacterial', 'viral', 'herpes', 'warts', 'molluscum', 'impetigo'
        ]

        # Count articles whose title or abstract contains each term. Note that
        # this is plain substring matching, so e.g. 'viral' also matches
        # 'antiviral'; see the whole-word helper sketched below.
        disease_counts = Counter()
        for article in self.articles:
            text = (article['title'] + ' ' + article['abstract']).lower()
            for term in disease_terms:
                if term in text:
                    disease_counts[term] += 1

        print("Top 15 disease/condition terms (by number of articles mentioning them):")
        for disease, count in disease_counts.most_common(15):
            print(f"  {disease}: {count} articles")

        # Create visualization
        plt.figure(figsize=(12, 8))
        top_diseases = dict(disease_counts.most_common(15))
        plt.barh(list(top_diseases.keys()), list(top_diseases.values()))
        plt.title('Top 15 Disease/Condition Mentions')
        plt.xlabel('Number of Articles')
        plt.tight_layout()
        plt.show()

        return disease_counts
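    # A minimal whole-word alternative to the substring test above, assuming
    # stricter matching is wanted; `_count_term_matches` is a hypothetical
    # helper, not part of the original pipeline.
    @staticmethod
    def _count_term_matches(articles, terms, fields=('title', 'abstract')):
        """Count articles containing each term as a whole word or phrase."""
        counts = Counter()
        for article in articles:
            text = ' '.join(article[field] for field in fields).lower()
            for term in terms:
                # \b anchors keep 'viral' from matching inside 'antiviral'
                if re.search(r'\b' + re.escape(term) + r'\b', text):
                    counts[term] += 1
        return counts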
    def treatment_analysis(self):
        """Analyze treatment patterns."""
        print("\n=== TREATMENT ANALYSIS ===")

        # Common treatment terms
        treatment_terms = [
            'therapy', 'treatment', 'drug', 'medication', 'topical', 'oral',
            'systemic', 'immunosuppressive', 'corticosteroid', 'antibiotic',
            'antifungal', 'antiviral', 'chemotherapy', 'radiotherapy', 'surgical',
            'laser', 'phototherapy', 'immunotherapy', 'biologic', 'methotrexate',
            'cyclosporine', 'tacrolimus', 'rituximab'
        ]

        treatment_counts = Counter()
        for article in self.articles:
            text = (article['treatment'] + ' ' + article['abstract']).lower()
            for term in treatment_terms:
                if term in text:
                    treatment_counts[term] += 1

        print("Top 15 treatment terms (by number of articles mentioning them):")
        for treatment, count in treatment_counts.most_common(15):
            print(f"  {treatment}: {count} articles")

        # Create visualization
        plt.figure(figsize=(12, 8))
        top_treatments = dict(treatment_counts.most_common(15))
        plt.barh(list(top_treatments.keys()), list(top_treatments.values()))
        plt.title('Top 15 Treatment Mentions')
        plt.xlabel('Number of Articles')
        plt.tight_layout()
        plt.show()

        return treatment_counts

    def keyword_analysis(self):
        """Perform keyword analysis using TF-IDF."""
        print("\n=== KEYWORD ANALYSIS ===")

        # Combine title and abstract for each article
        documents = [article['title'] + ' ' + article['abstract'] for article in self.articles]

        # TF-IDF analysis; boilerplate academic words join the stop list
        stop_words = set(stopwords.words('english'))
        stop_words.update(['study', 'research', 'analysis', 'results', 'conclusion', 'background', 'methods'])

        vectorizer = TfidfVectorizer(
            max_features=100,
            stop_words=list(stop_words),
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.8
        )
        tfidf_matrix = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()

        # Rank terms by their mean TF-IDF score across documents
        mean_scores = np.mean(tfidf_matrix.toarray(), axis=0)
        top_indices = np.argsort(mean_scores)[::-1][:20]

        print("Top 20 keywords by TF-IDF score:")
        for i, idx in enumerate(top_indices):
            print(f"  {i + 1}. {feature_names[idx]}: {mean_scores[idx]:.4f}")

        # Create word cloud
        all_text = ' '.join(documents)
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            stopwords=stop_words,
            max_words=100
        ).generate(all_text)

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Skin Disease Articles')
        plt.tight_layout()
        plt.show()

        return feature_names, mean_scores

    def readability_analysis(self):
        """Analyze abstract readability."""
        print("\n=== READABILITY ANALYSIS ===")

        flesch_scores = []
        grade_levels = []

        for article in self.articles:
            if article['abstract']:
                try:
                    flesch_scores.append(flesch_reading_ease(article['abstract']))
                    grade_levels.append(flesch_kincaid_grade(article['abstract']))
                except Exception:
                    continue

        print(f"Average Flesch Reading Ease Score: {np.mean(flesch_scores):.1f}")
        print(f"Average Grade Level: {np.mean(grade_levels):.1f}")

        # Map the mean Flesch score onto the standard difficulty bands
        avg_flesch = np.mean(flesch_scores)
        if avg_flesch >= 90:
            difficulty = "Very Easy"
        elif avg_flesch >= 80:
            difficulty = "Easy"
        elif avg_flesch >= 70:
            difficulty = "Fairly Easy"
        elif avg_flesch >= 60:
            difficulty = "Standard"
        elif avg_flesch >= 50:
            difficulty = "Fairly Difficult"
        elif avg_flesch >= 30:
            difficulty = "Difficult"
        else:
            difficulty = "Very Difficult"
        print(f"Reading Difficulty: {difficulty}")

        return flesch_scores, grade_levels
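    # LatentDirichletAllocation is imported above but never exercised; this is
    # a minimal topic-modelling sketch, assuming raw term counts as input (LDA
    # is conventionally fit on counts rather than TF-IDF weights). The method
    # name and parameter values are illustrative, not part of the original EDA.
    def topic_analysis(self, n_topics=5, n_top_words=8):
        """Sketch: fit an LDA topic model over title + abstract text."""
        from sklearn.feature_extraction.text import CountVectorizer

        documents = [article['title'] + ' ' + article['abstract'] for article in self.articles]
        vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.8)
        counts = vectorizer.fit_transform(documents)

        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
        lda.fit(counts)

        feature_names = vectorizer.get_feature_names_out()
        for topic_idx, weights in enumerate(lda.components_):
            # The highest-weighted terms characterize each topic
            top_terms = [feature_names[i] for i in weights.argsort()[::-1][:n_top_words]]
            print(f"Topic {topic_idx + 1}: {', '.join(top_terms)}")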
    def generate_summary_report(self):
        """Generate a comprehensive summary report."""
        print("\n" + "=" * 50)
        print("COMPREHENSIVE EDA SUMMARY REPORT")
        print("=" * 50)

        # Run all analyses
        basic_stats = self.basic_statistics()
        journal_counts = self.journal_analysis()
        author_counts = self.author_analysis()
        disease_counts = self.disease_analysis()
        treatment_counts = self.treatment_analysis()
        keywords, scores = self.keyword_analysis()
        flesch_scores, grade_levels = self.readability_analysis()

        # Summary insights
        print("\n=== KEY INSIGHTS ===")
        print(f"1. Corpus contains {basic_stats['total_articles']} articles from {len(journal_counts)} unique journals")
        print(f"2. Most common disease area: {disease_counts.most_common(1)[0][0] if disease_counts else 'N/A'}")
        print(f"3. Most common treatment approach: {treatment_counts.most_common(1)[0][0] if treatment_counts else 'N/A'}")
        print(f"4. Average reading level: Grade {np.mean(grade_levels):.1f}")
        print(f"5. {basic_stats['with_diagnosis']} articles have specific diagnosis information")
        print(f"6. {basic_stats['with_treatment']} articles contain treatment information")


def main():
    # Set up plotting style before any figures are drawn
    plt.style.use('seaborn-v0_8')
    sns.set_palette("husl")

    # Initialize EDA and generate the comprehensive report
    eda = SkinDiseaseEDA('skin_disease_articles_clean.txt')
    eda.generate_summary_report()

    print("\n" + "=" * 50)
    print("EDA ANALYSIS COMPLETE")
    print("=" * 50)


if __name__ == "__main__":
    main()
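# Usage notes (assumptions about deployment, not part of the original script):
# individual analyses can be run in isolation, e.g.
#
#   eda = SkinDiseaseEDA('skin_disease_articles_clean.txt')
#   journal_counts = eda.journal_analysis()
#
# and on a headless machine, switch matplotlib to a non-interactive backend
# before pyplot is imported and save figures instead of showing them:
#
#   import matplotlib
#   matplotlib.use('Agg')            # must run before `import matplotlib.pyplot`
#   plt.savefig('top_journals.png')  # in place of plt.show()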