|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from collections import Counter |
|
import re |
|
from wordcloud import WordCloud |
|
from textstat import flesch_reading_ease, flesch_kincaid_grade |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize, sent_tokenize |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.decomposition import LatentDirichletAllocation |
|
import warnings |
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
# Make sure the NLTK resources this script relies on are available locally,
# downloading each one only if the lookup fails (same order as before:
# punkt tokenizer first, then the stopwords corpus).
for _resource_name, _resource_path in (
    ('punkt', 'tokenizers/punkt'),
    ('stopwords', 'corpora/stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_resource_name)
|
|
|
class SkinDiseaseEDA:
    """Exploratory data analysis over a plain-text corpus of skin-disease
    journal articles.

    The corpus file contains article records separated by a long dash rule.
    Each record holds a free-standing title line followed by lines prefixed
    with ``Journal:``, ``Authors:`` and ``Abstract:``, plus the optional
    section headers ``Diagnosis`` and ``Treatment Remedies`` whose bodies
    span the following lines.
    """

    def __init__(self, filepath):
        """Eagerly parse the corpus at *filepath* into ``self.articles``.

        Parameters
        ----------
        filepath : str
            Path to the structured text file to analyse.
        """
        self.filepath = filepath
        self.data = []      # retained for backward compatibility; not used below
        self.articles = []  # list of per-article dicts built by load_data()
        self.load_data()

    @staticmethod
    def _mean(values):
        """Mean of *values*, or 0.0 for an empty sequence (avoids the
        nan / RuntimeWarning that np.mean emits on empty input)."""
        return float(np.mean(values)) if values else 0.0

    def load_data(self):
        """Parse the structured text file into articles"""
        with open(self.filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        # Records are delimited by a 60-dash horizontal rule.
        articles = content.split('------------------------------------------------------------')

        for article in articles:
            if not article.strip():
                continue

            lines = article.strip().split('\n')
            article_data = {
                'title': '',
                'journal': '',
                'authors': '',
                'abstract': '',
                'diagnosis': '',
                'treatment': ''
            }

            current_section = None
            for line in lines:
                line = line.strip()
                if not line:
                    continue

                if line.startswith('Journal:'):
                    current_section = 'journal'
                    # Slice off only the leading prefix; str.replace would
                    # also delete any later occurrence of the prefix text
                    # inside the value itself.
                    article_data['journal'] = line[len('Journal:'):].strip()
                elif line.startswith('Authors:'):
                    current_section = 'authors'
                    article_data['authors'] = line[len('Authors:'):].strip()
                elif line.startswith('Abstract:'):
                    current_section = 'abstract'
                    article_data['abstract'] = line[len('Abstract:'):].strip()
                elif line == 'Diagnosis':
                    current_section = 'diagnosis'
                elif line == 'Treatment Remedies':
                    current_section = 'treatment'
                elif current_section == 'abstract' and not line.startswith(('Journal:', 'Authors:', 'Diagnosis', 'Treatment')):
                    # Continuation line of a multi-line abstract.
                    article_data['abstract'] += ' ' + line
                elif current_section == 'diagnosis' and not line.startswith(('Journal:', 'Authors:', 'Abstract:', 'Treatment')):
                    article_data['diagnosis'] += ' ' + line
                elif current_section == 'treatment' and not line.startswith(('Journal:', 'Authors:', 'Abstract:', 'Diagnosis')):
                    article_data['treatment'] += ' ' + line
                elif not any(line.startswith(prefix) for prefix in ['Journal:', 'Authors:', 'Abstract:', 'Diagnosis', 'Treatment']) and not current_section:
                    # The first unprefixed line before any section is the title.
                    article_data['title'] = line

            # Normalise whitespace accumulated during concatenation.
            for key in article_data:
                article_data[key] = article_data[key].strip()

            # Records without a title are delimiter artefacts; drop them.
            if article_data['title']:
                self.articles.append(article_data)

    def basic_statistics(self):
        """Generate basic statistics about the corpus"""
        print("=== BASIC CORPUS STATISTICS ===")
        n_articles = len(self.articles)
        print(f"Total articles: {n_articles}")

        # Character lengths, restricted to articles that have the field.
        abstract_lengths = [len(article['abstract']) for article in self.articles if article['abstract']]
        title_lengths = [len(article['title']) for article in self.articles if article['title']]

        print(f"Articles with abstracts: {len(abstract_lengths)}")
        print(f"Average abstract length: {self._mean(abstract_lengths):.1f} characters")
        print(f"Average title length: {self._mean(title_lengths):.1f} characters")

        abstract_words = [len(article['abstract'].split()) for article in self.articles if article['abstract']]
        print(f"Average abstract word count: {self._mean(abstract_words):.1f} words")

        # 'Not specified.' is the placeholder the corpus uses for a missing
        # diagnosis, so it does not count as a specific diagnosis.
        with_diagnosis = sum(1 for article in self.articles if article['diagnosis'] and article['diagnosis'] != 'Not specified.')
        with_treatment = sum(1 for article in self.articles if article['treatment'])

        # Guard against an empty corpus (ZeroDivisionError otherwise).
        diagnosis_pct = with_diagnosis / n_articles * 100 if n_articles else 0.0
        treatment_pct = with_treatment / n_articles * 100 if n_articles else 0.0
        print(f"Articles with specific diagnosis: {with_diagnosis} ({diagnosis_pct:.1f}%)")
        print(f"Articles with treatment info: {with_treatment} ({treatment_pct:.1f}%)")

        return {
            'total_articles': n_articles,
            'abstract_lengths': abstract_lengths,
            'title_lengths': title_lengths,
            'abstract_words': abstract_words,
            'with_diagnosis': with_diagnosis,
            'with_treatment': with_treatment
        }

    def journal_analysis(self):
        """Analyze journal distribution"""
        print("\n=== JOURNAL ANALYSIS ===")

        journals = [article['journal'] for article in self.articles if article['journal']]
        journal_counts = Counter(journals)

        print(f"Total unique journals: {len(journal_counts)}")
        print("Top 10 journals:")
        for journal, count in journal_counts.most_common(10):
            print(f"  {journal}: {count} articles")

        # Horizontal bar chart of the most prolific journals.
        plt.figure(figsize=(12, 8))
        top_journals = dict(journal_counts.most_common(15))
        plt.barh(list(top_journals.keys()), list(top_journals.values()))
        plt.title('Top 15 Journals by Article Count')
        plt.xlabel('Number of Articles')
        plt.tight_layout()
        plt.show()

        return journal_counts

    def author_analysis(self):
        """Analyze author patterns"""
        print("\n=== AUTHOR ANALYSIS ===")

        # Flatten the comma-separated author strings into one list.
        all_authors = []
        for article in self.articles:
            if article['authors']:
                authors = [author.strip() for author in article['authors'].split(',')]
                all_authors.extend(authors)

        author_counts = Counter(all_authors)

        # Guard against an empty corpus when averaging per article.
        avg_per_article = len(all_authors) / len(self.articles) if self.articles else 0.0

        print(f"Total unique authors: {len(author_counts)}")
        print(f"Total author instances: {len(all_authors)}")
        print(f"Average authors per article: {avg_per_article:.1f}")

        print("Top 10 most prolific authors:")
        for author, count in author_counts.most_common(10):
            print(f"  {author}: {count} articles")

        author_counts_per_article = [len(article['authors'].split(',')) for article in self.articles if article['authors']]
        print(f"Average collaboration size: {self._mean(author_counts_per_article):.1f} authors per article")

        return author_counts

    def disease_analysis(self):
        """Analyze disease mentions and patterns"""
        print("\n=== DISEASE AND CONDITION ANALYSIS ===")

        # Curated dermatology vocabulary searched for by substring match.
        disease_terms = [
            'cancer', 'carcinoma', 'melanoma', 'psoriasis', 'dermatitis', 'eczema',
            'acne', 'rosacea', 'vitiligo', 'lupus', 'scleroderma', 'pemphigus',
            'bullous', 'urticaria', 'mastocytosis', 'lymphoma', 'sarcoma',
            'basal cell', 'squamous cell', 'keratosis', 'mycosis', 'fungal',
            'bacterial', 'viral', 'herpes', 'warts', 'molluscum', 'impetigo'
        ]

        # Each article contributes at most one count per term (presence, not
        # frequency), matching the original behaviour.
        disease_counts = Counter()

        for article in self.articles:
            text = (article['title'] + ' ' + article['abstract']).lower()
            for term in disease_terms:
                if term in text:
                    disease_counts[term] += 1

        print("Top 15 disease/condition mentions:")
        for disease, count in disease_counts.most_common(15):
            print(f"  {disease}: {count} mentions")

        plt.figure(figsize=(12, 8))
        top_diseases = dict(disease_counts.most_common(15))
        plt.barh(list(top_diseases.keys()), list(top_diseases.values()))
        plt.title('Top 15 Disease/Condition Mentions')
        plt.xlabel('Number of Mentions')
        plt.tight_layout()
        plt.show()

        return disease_counts

    def treatment_analysis(self):
        """Analyze treatment patterns"""
        print("\n=== TREATMENT ANALYSIS ===")

        # Curated therapy vocabulary searched for by substring match.
        treatment_terms = [
            'therapy', 'treatment', 'drug', 'medication', 'topical', 'oral',
            'systemic', 'immunosuppressive', 'corticosteroid', 'antibiotic',
            'antifungal', 'antiviral', 'chemotherapy', 'radiotherapy',
            'surgical', 'laser', 'phototherapy', 'immunotherapy', 'biologic',
            'methotrexate', 'cyclosporine', 'tacrolimus', 'rituximab'
        ]

        treatment_counts = Counter()

        for article in self.articles:
            # Search both the dedicated treatment section and the abstract.
            text = (article['treatment'] + ' ' + article['abstract']).lower()
            for term in treatment_terms:
                if term in text:
                    treatment_counts[term] += 1

        print("Top 15 treatment mentions:")
        for treatment, count in treatment_counts.most_common(15):
            print(f"  {treatment}: {count} mentions")

        plt.figure(figsize=(12, 8))
        top_treatments = dict(treatment_counts.most_common(15))
        plt.barh(list(top_treatments.keys()), list(top_treatments.values()))
        plt.title('Top 15 Treatment Mentions')
        plt.xlabel('Number of Mentions')
        plt.tight_layout()
        plt.show()

        return treatment_counts

    def keyword_analysis(self):
        """Perform keyword analysis using TF-IDF"""
        print("\n=== KEYWORD ANALYSIS ===")

        # One document per article: title + abstract.
        documents = []
        for article in self.articles:
            doc = article['title'] + ' ' + article['abstract']
            documents.append(doc)

        # English stopwords plus generic academic boilerplate terms.
        stop_words = set(stopwords.words('english'))
        stop_words.update(['study', 'research', 'analysis', 'results', 'conclusion', 'background', 'methods'])

        vectorizer = TfidfVectorizer(
            max_features=100,
            stop_words=list(stop_words),
            ngram_range=(1, 2),
            min_df=2,       # ignore terms appearing in fewer than 2 documents
            max_df=0.8      # ignore terms appearing in more than 80% of documents
        )

        tfidf_matrix = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()

        # Rank features by their mean TF-IDF score across the corpus.
        mean_scores = np.mean(tfidf_matrix.toarray(), axis=0)
        top_indices = np.argsort(mean_scores)[::-1][:20]

        print("Top 20 keywords by TF-IDF score:")
        for i, idx in enumerate(top_indices):
            print(f"  {i+1}. {feature_names[idx]}: {mean_scores[idx]:.4f}")

        # Word cloud over the raw concatenated text (WordCloud applies its
        # own tokenisation and the stopword set above).
        all_text = ' '.join(documents)
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            stopwords=stop_words,
            max_words=100
        ).generate(all_text)

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Skin Disease Articles')
        plt.tight_layout()
        plt.show()

        return feature_names, mean_scores

    def readability_analysis(self):
        """Analyze text readability"""
        print("\n=== READABILITY ANALYSIS ===")

        flesch_scores = []
        grade_levels = []

        for article in self.articles:
            if article['abstract']:
                try:
                    flesch_score = flesch_reading_ease(article['abstract'])
                    grade_level = flesch_kincaid_grade(article['abstract'])
                    flesch_scores.append(flesch_score)
                    grade_levels.append(grade_level)
                except Exception:
                    # textstat can fail on degenerate text; skip that article.
                    # (Was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    continue

        print(f"Average Flesch Reading Ease Score: {self._mean(flesch_scores):.1f}")
        print(f"Average Grade Level: {self._mean(grade_levels):.1f}")

        # Map the average score onto the standard Flesch difficulty bands.
        avg_flesch = self._mean(flesch_scores)
        if avg_flesch >= 90:
            difficulty = "Very Easy"
        elif avg_flesch >= 80:
            difficulty = "Easy"
        elif avg_flesch >= 70:
            difficulty = "Fairly Easy"
        elif avg_flesch >= 60:
            difficulty = "Standard"
        elif avg_flesch >= 50:
            difficulty = "Fairly Difficult"
        elif avg_flesch >= 30:
            difficulty = "Difficult"
        else:
            difficulty = "Very Difficult"

        print(f"Reading Difficulty: {difficulty}")

        return flesch_scores, grade_levels

    def generate_summary_report(self):
        """Generate a comprehensive summary report"""
        print("\n" + "="*50)
        print("COMPREHENSIVE EDA SUMMARY REPORT")
        print("="*50)

        # Run every analysis stage in sequence.
        basic_stats = self.basic_statistics()
        journal_counts = self.journal_analysis()
        author_counts = self.author_analysis()
        disease_counts = self.disease_analysis()
        treatment_counts = self.treatment_analysis()
        keywords, scores = self.keyword_analysis()
        flesch_scores, grade_levels = self.readability_analysis()

        print("\n=== KEY INSIGHTS ===")
        print(f"1. Corpus contains {basic_stats['total_articles']} articles from {len(journal_counts)} unique journals")
        print(f"2. Most common disease area: {disease_counts.most_common(1)[0][0] if disease_counts else 'N/A'}")
        print(f"3. Most common treatment approach: {treatment_counts.most_common(1)[0][0] if treatment_counts else 'N/A'}")
        print(f"4. Average reading level: Grade {self._mean(grade_levels):.1f}")
        print(f"5. {basic_stats['with_diagnosis']} articles have specific diagnosis information")
        print(f"6. {basic_stats['with_treatment']} articles contain treatment information")
|
|
|
def main():
    """Run the complete EDA pipeline over the default corpus file."""
    # Configure plot styling BEFORE any analysis runs: in the original
    # ordering plt.style.use / sns.set_palette were called only after
    # generate_summary_report() had already drawn and shown every figure,
    # so the styling never took effect.
    plt.style.use('seaborn-v0_8')
    sns.set_palette("husl")

    eda = SkinDiseaseEDA('skin_disease_articles_clean.txt')
    eda.generate_summary_report()

    print("\n" + "="*50)
    print("EDA ANALYSIS COMPLETE")
    print("="*50)


if __name__ == "__main__":
    main()
|
|