import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from wordcloud import WordCloud
from textstat import flesch_reading_ease, flesch_kincaid_grade
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings

warnings.filterwarnings('ignore')

# Download required NLTK data on first run
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


class SkinDiseaseEDA:
    def __init__(self, filepath):
        self.filepath = filepath
        self.articles = []
        self.load_data()

    def load_data(self):
        """Parse the structured text file into a list of article dicts."""
        with open(self.filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        # Records are separated by a line of 60 dashes
        articles = content.split('------------------------------------------------------------')

        for article in articles:
            if not article.strip():
                continue

            lines = article.strip().split('\n')
            article_data = {
                'title': '',
                'journal': '',
                'authors': '',
                'abstract': '',
                'diagnosis': '',
                'treatment': ''
            }

            current_section = None
            for line in lines:
                line = line.strip()
                if not line:
                    continue

                if line.startswith('Journal:'):
                    current_section = 'journal'
                    article_data['journal'] = line.replace('Journal:', '').strip()
                elif line.startswith('Authors:'):
                    current_section = 'authors'
                    article_data['authors'] = line.replace('Authors:', '').strip()
                elif line.startswith('Abstract:'):
                    current_section = 'abstract'
                    article_data['abstract'] = line.replace('Abstract:', '').strip()
                elif line == 'Diagnosis':
                    current_section = 'diagnosis'
                elif line == 'Treatment Remedies':
                    current_section = 'treatment'
                elif current_section == 'abstract' and not line.startswith(('Journal:', 'Authors:', 'Diagnosis', 'Treatment')):
                    article_data['abstract'] += ' ' + line
                elif current_section == 'diagnosis' and not line.startswith(('Journal:', 'Authors:', 'Abstract:', 'Treatment')):
                    article_data['diagnosis'] += ' ' + line
                elif current_section == 'treatment' and not line.startswith(('Journal:', 'Authors:', 'Abstract:', 'Diagnosis')):
                    article_data['treatment'] += ' ' + line
                elif current_section is None and not line.startswith(('Journal:', 'Authors:', 'Abstract:', 'Diagnosis', 'Treatment')):
                    # The first unlabelled line of a record is its title
                    article_data['title'] = line

            # Strip whitespace accumulated while joining continuation lines
            for key in article_data:
                article_data[key] = article_data[key].strip()

            if article_data['title']:
                self.articles.append(article_data)
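    # Illustrative sketch of the record layout load_data() expects, inferred
    # from the parser above; the concrete field values are assumptions, not
    # taken from the actual input file:
    #
    #   Some Article Title
    #   Journal: Example Dermatology Journal
    #   Authors: A. Author, B. Author
    #   Abstract: Background text that may
    #   wrap across several lines.
    #   Diagnosis
    #   Free-text diagnosis lines (or "Not specified.").
    #   Treatment Remedies
    #   Free-text treatment lines.
    #   ------------------------------------------------------------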
    def basic_statistics(self):
        """Generate basic statistics about the corpus."""
        print("=== BASIC CORPUS STATISTICS ===")
        print(f"Total articles: {len(self.articles)}")

        # Text length statistics
        abstract_lengths = [len(article['abstract']) for article in self.articles if article['abstract']]
        title_lengths = [len(article['title']) for article in self.articles if article['title']]

        print(f"Articles with abstracts: {len(abstract_lengths)}")
        print(f"Average abstract length: {np.mean(abstract_lengths):.1f} characters")
        print(f"Average title length: {np.mean(title_lengths):.1f} characters")

        # Word counts
        abstract_words = [len(article['abstract'].split()) for article in self.articles if article['abstract']]
        print(f"Average abstract word count: {np.mean(abstract_words):.1f} words")

        # Diagnosis and treatment availability
        with_diagnosis = sum(
            1 for article in self.articles
            if article['diagnosis'] and article['diagnosis'] != 'Not specified.'
        )
        with_treatment = sum(1 for article in self.articles if article['treatment'])
        print(f"Articles with specific diagnosis: {with_diagnosis} ({with_diagnosis / len(self.articles) * 100:.1f}%)")
        print(f"Articles with treatment info: {with_treatment} ({with_treatment / len(self.articles) * 100:.1f}%)")

        return {
            'total_articles': len(self.articles),
            'abstract_lengths': abstract_lengths,
            'title_lengths': title_lengths,
            'abstract_words': abstract_words,
            'with_diagnosis': with_diagnosis,
            'with_treatment': with_treatment
        }

    def journal_analysis(self):
        """Analyze the distribution of articles across journals."""
        print("\n=== JOURNAL ANALYSIS ===")

        journals = [article['journal'] for article in self.articles if article['journal']]
        journal_counts = Counter(journals)

        print(f"Total unique journals: {len(journal_counts)}")
        print("Top 10 journals:")
        for journal, count in journal_counts.most_common(10):
            print(f"  {journal}: {count} articles")

        # Create visualization
        plt.figure(figsize=(12, 8))
        top_journals = dict(journal_counts.most_common(15))
        plt.barh(list(top_journals.keys()), list(top_journals.values()))
        plt.title('Top 15 Journals by Article Count')
        plt.xlabel('Number of Articles')
        plt.tight_layout()
        plt.show()

        return journal_counts

    def author_analysis(self):
        """Analyze author patterns."""
        print("\n=== AUTHOR ANALYSIS ===")

        all_authors = []
        for article in self.articles:
            if article['authors']:
                # Author lists are assumed to be comma-separated
                authors = [author.strip() for author in article['authors'].split(',')]
                all_authors.extend(authors)

        author_counts = Counter(all_authors)
        print(f"Total unique authors: {len(author_counts)}")
        print(f"Total author instances: {len(all_authors)}")
        print(f"Average authors per article: {len(all_authors) / len(self.articles):.1f}")

        print("Top 10 most prolific authors:")
        for author, count in author_counts.most_common(10):
            print(f"  {author}: {count} articles")

        # Collaboration size per article
        author_counts_per_article = [len(article['authors'].split(',')) for article in self.articles if article['authors']]
        print(f"Average collaboration size: {np.mean(author_counts_per_article):.1f} authors per article")

        return author_counts

    def disease_analysis(self):
        """Analyze disease mentions and patterns."""
        print("\n=== DISEASE AND CONDITION ANALYSIS ===")

        # Common disease terms
        disease_terms = [
            'cancer', 'carcinoma', 'melanoma', 'psoriasis', 'dermatitis', 'eczema',
            'acne', 'rosacea', 'vitiligo', 'lupus', 'scleroderma', 'pemphigus',
            'bullous', 'urticaria', 'mastocytosis', 'lymphoma', 'sarcoma',
            'basal cell', 'squamous cell', 'keratosis', 'mycosis', 'fungal',
            'bacterial', 'viral', 'herpes', 'warts', 'molluscum', 'impetigo'
        ]

        # Count articles whose title or abstract contains each term. Note that
        # this is plain substring matching, so e.g. 'viral' also matches
        # 'antiviral'; see the whole-word helper sketched below.
        disease_counts = Counter()
        for article in self.articles:
            text = (article['title'] + ' ' + article['abstract']).lower()
            for term in disease_terms:
                if term in text:
                    disease_counts[term] += 1

        print("Top 15 disease/condition terms (by number of articles mentioning them):")
        for disease, count in disease_counts.most_common(15):
            print(f"  {disease}: {count} articles")

        # Create visualization
        plt.figure(figsize=(12, 8))
        top_diseases = dict(disease_counts.most_common(15))
        plt.barh(list(top_diseases.keys()), list(top_diseases.values()))
        plt.title('Top 15 Disease/Condition Mentions')
        plt.xlabel('Number of Articles')
        plt.tight_layout()
        plt.show()

        return disease_counts
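    # A minimal whole-word alternative to the substring test above, assuming
    # stricter matching is wanted; `_count_term_matches` is a hypothetical
    # helper, not part of the original pipeline.
    @staticmethod
    def _count_term_matches(articles, terms, fields=('title', 'abstract')):
        """Count articles containing each term as a whole word or phrase."""
        counts = Counter()
        for article in articles:
            text = ' '.join(article[field] for field in fields).lower()
            for term in terms:
                # \b anchors keep 'viral' from matching inside 'antiviral'
                if re.search(r'\b' + re.escape(term) + r'\b', text):
                    counts[term] += 1
        return counts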
    def treatment_analysis(self):
        """Analyze treatment patterns."""
        print("\n=== TREATMENT ANALYSIS ===")

        # Common treatment terms
        treatment_terms = [
            'therapy', 'treatment', 'drug', 'medication', 'topical', 'oral',
            'systemic', 'immunosuppressive', 'corticosteroid', 'antibiotic',
            'antifungal', 'antiviral', 'chemotherapy', 'radiotherapy', 'surgical',
            'laser', 'phototherapy', 'immunotherapy', 'biologic', 'methotrexate',
            'cyclosporine', 'tacrolimus', 'rituximab'
        ]

        treatment_counts = Counter()
        for article in self.articles:
            text = (article['treatment'] + ' ' + article['abstract']).lower()
            for term in treatment_terms:
                if term in text:
                    treatment_counts[term] += 1

        print("Top 15 treatment terms (by number of articles mentioning them):")
        for treatment, count in treatment_counts.most_common(15):
            print(f"  {treatment}: {count} articles")

        # Create visualization
        plt.figure(figsize=(12, 8))
        top_treatments = dict(treatment_counts.most_common(15))
        plt.barh(list(top_treatments.keys()), list(top_treatments.values()))
        plt.title('Top 15 Treatment Mentions')
        plt.xlabel('Number of Articles')
        plt.tight_layout()
        plt.show()

        return treatment_counts

    def keyword_analysis(self):
        """Perform keyword analysis using TF-IDF."""
        print("\n=== KEYWORD ANALYSIS ===")

        # Combine title and abstract for each article
        documents = [article['title'] + ' ' + article['abstract'] for article in self.articles]

        # TF-IDF analysis; boilerplate academic words join the stop list
        stop_words = set(stopwords.words('english'))
        stop_words.update(['study', 'research', 'analysis', 'results', 'conclusion', 'background', 'methods'])

        vectorizer = TfidfVectorizer(
            max_features=100,
            stop_words=list(stop_words),
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.8
        )
        tfidf_matrix = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()

        # Rank terms by their mean TF-IDF score across documents
        mean_scores = np.mean(tfidf_matrix.toarray(), axis=0)
        top_indices = np.argsort(mean_scores)[::-1][:20]

        print("Top 20 keywords by TF-IDF score:")
        for i, idx in enumerate(top_indices):
            print(f"  {i + 1}. {feature_names[idx]}: {mean_scores[idx]:.4f}")

        # Create word cloud
        all_text = ' '.join(documents)
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            stopwords=stop_words,
            max_words=100
        ).generate(all_text)

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Skin Disease Articles')
        plt.tight_layout()
        plt.show()

        return feature_names, mean_scores

    def readability_analysis(self):
        """Analyze abstract readability."""
        print("\n=== READABILITY ANALYSIS ===")

        flesch_scores = []
        grade_levels = []

        for article in self.articles:
            if article['abstract']:
                try:
                    flesch_scores.append(flesch_reading_ease(article['abstract']))
                    grade_levels.append(flesch_kincaid_grade(article['abstract']))
                except Exception:
                    continue

        print(f"Average Flesch Reading Ease Score: {np.mean(flesch_scores):.1f}")
        print(f"Average Grade Level: {np.mean(grade_levels):.1f}")

        # Map the mean Flesch score onto the standard difficulty bands
        avg_flesch = np.mean(flesch_scores)
        if avg_flesch >= 90:
            difficulty = "Very Easy"
        elif avg_flesch >= 80:
            difficulty = "Easy"
        elif avg_flesch >= 70:
            difficulty = "Fairly Easy"
        elif avg_flesch >= 60:
            difficulty = "Standard"
        elif avg_flesch >= 50:
            difficulty = "Fairly Difficult"
        elif avg_flesch >= 30:
            difficulty = "Difficult"
        else:
            difficulty = "Very Difficult"
        print(f"Reading Difficulty: {difficulty}")

        return flesch_scores, grade_levels
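    # LatentDirichletAllocation is imported above but never exercised; this is
    # a minimal topic-modelling sketch, assuming raw term counts as input (LDA
    # is conventionally fit on counts rather than TF-IDF weights). The method
    # name and parameter values are illustrative, not part of the original EDA.
    def topic_analysis(self, n_topics=5, n_top_words=8):
        """Sketch: fit an LDA topic model over title + abstract text."""
        from sklearn.feature_extraction.text import CountVectorizer

        documents = [article['title'] + ' ' + article['abstract'] for article in self.articles]
        vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.8)
        counts = vectorizer.fit_transform(documents)

        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
        lda.fit(counts)

        feature_names = vectorizer.get_feature_names_out()
        for topic_idx, weights in enumerate(lda.components_):
            # The highest-weighted terms characterize each topic
            top_terms = [feature_names[i] for i in weights.argsort()[::-1][:n_top_words]]
            print(f"Topic {topic_idx + 1}: {', '.join(top_terms)}")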
    def generate_summary_report(self):
        """Generate a comprehensive summary report."""
        print("\n" + "=" * 50)
        print("COMPREHENSIVE EDA SUMMARY REPORT")
        print("=" * 50)

        # Run all analyses
        basic_stats = self.basic_statistics()
        journal_counts = self.journal_analysis()
        author_counts = self.author_analysis()
        disease_counts = self.disease_analysis()
        treatment_counts = self.treatment_analysis()
        keywords, scores = self.keyword_analysis()
        flesch_scores, grade_levels = self.readability_analysis()

        # Summary insights
        print("\n=== KEY INSIGHTS ===")
        print(f"1. Corpus contains {basic_stats['total_articles']} articles from {len(journal_counts)} unique journals")
        print(f"2. Most common disease area: {disease_counts.most_common(1)[0][0] if disease_counts else 'N/A'}")
        print(f"3. Most common treatment approach: {treatment_counts.most_common(1)[0][0] if treatment_counts else 'N/A'}")
        print(f"4. Average reading level: Grade {np.mean(grade_levels):.1f}")
        print(f"5. {basic_stats['with_diagnosis']} articles have specific diagnosis information")
        print(f"6. {basic_stats['with_treatment']} articles contain treatment information")


def main():
    # Set up plotting style before any figures are drawn
    plt.style.use('seaborn-v0_8')
    sns.set_palette("husl")

    # Initialize EDA and generate the comprehensive report
    eda = SkinDiseaseEDA('skin_disease_articles_clean.txt')
    eda.generate_summary_report()

    print("\n" + "=" * 50)
    print("EDA ANALYSIS COMPLETE")
    print("=" * 50)


if __name__ == "__main__":
    main()
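# Usage notes (assumptions about deployment, not part of the original script):
# individual analyses can be run in isolation, e.g.
#
#   eda = SkinDiseaseEDA('skin_disease_articles_clean.txt')
#   journal_counts = eda.journal_analysis()
#
# and on a headless machine, switch matplotlib to a non-interactive backend
# before pyplot is imported and save figures instead of showing them:
#
#   import matplotlib
#   matplotlib.use('Agg')            # must run before `import matplotlib.pyplot`
#   plt.savefig('top_journals.png')  # in place of plt.show()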