Kalyangotimothy committed
Commit · 217a100
Parent(s): d88d781
new
Browse files
- API.py +17 -0
- Dockerfile +28 -0
- README.md +36 -14
- app.py +128 -0
- cleaning.py +10 -0
- deploy.py +63 -0
- eda_analysis.py +381 -0
- extraction.py +6 -0
- finetune.py +46 -0
- finetune_tinyllama.py +39 -0
- gradio-app.py +7 -0
- llama2_inference.py +13 -0
- requirements.txt +10 -0
- xai_analysis.py +436 -0
API.py
ADDED
@@ -0,0 +1,17 @@
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM

app = FastAPI()

model_path = "tinyllama-finetuned-skin"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    prompt = data["prompt"]
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=100)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": result}
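
Note a mismatch with app.py below: the Gradio front end sends max_new_tokens and temperature in the payload and polls a /health route, while API.py as committed accepts only "prompt" and defines no /health endpoint, so those fields are ignored and the health check always reports the API as down. A minimal server-side sketch of what API.py would need (an assumption about intended behavior, not part of this commit):

# Sketch only: /health route plus honoring the optional fields app.py sends.
@app.get("/health")
async def health():
    return {"status": "ok"}

# Inside generate(), read the extra payload fields with defaults:
# max_new_tokens = int(data.get("max_new_tokens", 100))
# temperature = float(data.get("temperature", 0.7))
# outputs = model.generate(**inputs, max_new_tokens=max_new_tokens,
#                          do_sample=True, temperature=temperature)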
Dockerfile
ADDED
@@ -0,0 +1,28 @@
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (for better caching)
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create model directory
RUN mkdir -p tinyllama-finetuned-skin

# Expose Hugging Face Spaces port
EXPOSE 7860

# Start Gradio app for Hugging Face Spaces
CMD ["python", "app.py"]
README.md
CHANGED
@@ -1,14 +1,36 @@
# 🏥 Skin Disease AI Assistant

An AI-powered assistant for skin disease analysis and diagnosis support, built with a fine-tuned TinyLlama model.

## Features

- 🤖 AI-powered skin disease analysis
- 🩺 Medical consultation support
- 📊 Treatment recommendations
- 🔬 Research-backed responses

## Usage

Enter your medical query and get AI-powered insights for skin conditions, symptoms, and treatment options.

**Note:** This is for educational purposes only. Always consult with medical professionals for actual diagnosis and treatment.

## Model Information

- Base Model: TinyLlama-1.1B
- Fine-tuned on: Skin disease medical literature
- Specialization: Dermatology and skin conditions

## Example Queries

- "Patient presents with red scaly patches on elbows. What could this be?"
- "Describe treatment options for psoriasis"
- "What are the symptoms of eczema?"

## Deployment

This app is deployed on Hugging Face Spaces with a Gradio interface.

## Disclaimer

This AI assistant is for educational and research purposes only. It should not be used as a substitute for professional medical advice, diagnosis, or treatment.
app.py
ADDED
@@ -0,0 +1,128 @@
#!/usr/bin/env python3
"""
Hugging Face Spaces deployment file for Skin Disease AI API
"""
import os
import gradio as gr
import requests
import json
from threading import Thread
import time
import subprocess
import sys

# Start FastAPI in background
def start_fastapi():
    """Start FastAPI server in background"""
    subprocess.run([
        sys.executable, "-m", "uvicorn", "API:app",
        "--host", "0.0.0.0", "--port", "8000"
    ])

# Start FastAPI in a separate thread
api_thread = Thread(target=start_fastapi, daemon=True)
api_thread.start()

# Wait for API to start
time.sleep(10)

def generate_text(prompt, max_tokens, temperature):
    """Generate text using the API"""
    try:
        url = "http://localhost:8000/generate"
        payload = {
            "prompt": prompt,
            "max_new_tokens": int(max_tokens),
            "temperature": float(temperature)
        }

        response = requests.post(url, json=payload, timeout=30)

        if response.status_code == 200:
            result = response.json()
            return result["response"]
        else:
            return f"Error: {response.status_code} - {response.text}"

    except Exception as e:
        return f"Error: {str(e)}"

def check_api_health():
    """Check if API is running"""
    try:
        response = requests.get("http://localhost:8000/health", timeout=5)
        if response.status_code == 200:
            return "✅ API is running"
        else:
            return "❌ API error"
    except requests.exceptions.RequestException:
        return "❌ API not responding"

# Create Gradio interface
with gr.Blocks(title="Skin Disease AI", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🏥 Skin Disease AI Assistant")
    gr.Markdown("AI-powered assistant for skin disease analysis and diagnosis support.")

    with gr.Row():
        with gr.Column(scale=2):
            prompt_input = gr.Textbox(
                label="Enter your medical query",
                placeholder="Patient presents with red scaly patches on elbows. Diagnosis:",
                lines=3
            )

            with gr.Row():
                max_tokens = gr.Slider(
                    minimum=10,
                    maximum=200,
                    value=100,
                    step=10,
                    label="Max tokens"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature"
                )

            generate_btn = gr.Button("Generate Response", variant="primary")

        with gr.Column(scale=1):
            api_status = gr.Textbox(label="API Status", value="Starting...", interactive=False)
            check_btn = gr.Button("Check API")

    output = gr.Textbox(
        label="AI Response",
        lines=8,
        placeholder="AI response will appear here..."
    )

    # Examples
    gr.Examples(
        examples=[
            ["Patient has red scaly patches on elbows and knees. What could this be?", 80, 0.7],
            ["Describe treatment options for psoriasis", 100, 0.6],
            ["What are the symptoms of eczema?", 120, 0.5],
        ],
        inputs=[prompt_input, max_tokens, temperature],
    )

    # Event handlers
    generate_btn.click(
        fn=generate_text,
        inputs=[prompt_input, max_tokens, temperature],
        outputs=output
    )

    check_btn.click(
        fn=check_api_health,
        outputs=api_status
    )

    # Auto-check API status on load
    demo.load(fn=check_api_health, outputs=api_status)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
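
One fragile spot above is the fixed time.sleep(10) before the UI assumes the API is up; loading a 1.1B-parameter model can easily take longer. A more robust sketch (reusing the /health route app.py already polls) waits until the endpoint actually answers:

import requests
import time

def wait_for_api(url="http://localhost:8000/health", timeout=120):
    """Poll the health endpoint until it responds or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.exceptions.RequestException:
            pass  # API not up yet; keep polling
        time.sleep(1)
    return False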
cleaning.py
ADDED
@@ -0,0 +1,10 @@
input_file = "skin_disease_articles.txt"
output_file = "skin_disease_articles_clean.txt"

with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        cleaned = line.strip()  # Remove leading/trailing whitespace
        if cleaned:  # Skip empty lines
            outfile.write(cleaned + "\n")

print(f"Cleaned file saved as {output_file}")
deploy.py
ADDED
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""
Simple deployment script for Skin Disease AI API
"""
import os
import sys
import subprocess

def create_demo_model():
    """Create a demo model for testing"""
    print("Creating demo model...")
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM

        model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        print(f"Downloading {model_name}...")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        os.makedirs("tinyllama-finetuned-skin", exist_ok=True)
        tokenizer.save_pretrained("tinyllama-finetuned-skin")
        model.save_pretrained("tinyllama-finetuned-skin")

        print("✅ Demo model created successfully!")
        return True
    except Exception as e:
        print(f"❌ Error creating model: {e}")
        return False

def start_server():
    """Start the API server"""
    print("🚀 Starting API server on http://localhost:8000")
    print("Press Ctrl+C to stop")
    try:
        subprocess.run([
            sys.executable, "-m", "uvicorn", "API:app",
            "--host", "0.0.0.0", "--port", "8000", "--reload"
        ])
    except KeyboardInterrupt:
        print("\n🛑 Server stopped.")

def main():
    print("🚀 Skin Disease AI Deployment")
    print("=" * 40)

    # Check if model exists
    if not os.path.exists("tinyllama-finetuned-skin"):
        print("Model not found. Creating demo model...")
        if not create_demo_model():
            print("Failed to create model. Exiting.")
            return
    else:
        print("✅ Model found!")

    # Start server
    start_server()

if __name__ == "__main__":
    main()
eda_analysis.py
ADDED
@@ -0,0 +1,381 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from wordcloud import WordCloud
from textstat import flesch_reading_ease, flesch_kincaid_grade
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class SkinDiseaseEDA:
    def __init__(self, filepath):
        self.filepath = filepath
        self.data = []
        self.articles = []
        self.load_data()

    def load_data(self):
        """Parse the structured text file into articles"""
        with open(self.filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        # Split by separator
        articles = content.split('------------------------------------------------------------')

        for article in articles:
            if not article.strip():
                continue

            lines = article.strip().split('\n')
            article_data = {
                'title': '',
                'journal': '',
                'authors': '',
                'abstract': '',
                'diagnosis': '',
                'treatment': ''
            }

            current_section = None
            for line in lines:
                line = line.strip()
                if not line:
                    continue

                if line.startswith('Journal:'):
                    current_section = 'journal'
                    article_data['journal'] = line.replace('Journal:', '').strip()
                elif line.startswith('Authors:'):
                    current_section = 'authors'
                    article_data['authors'] = line.replace('Authors:', '').strip()
                elif line.startswith('Abstract:'):
                    current_section = 'abstract'
                    article_data['abstract'] = line.replace('Abstract:', '').strip()
                elif line == 'Diagnosis':
                    current_section = 'diagnosis'
                elif line == 'Treatment Remedies':
                    current_section = 'treatment'
                elif current_section == 'abstract' and not line.startswith(('Journal:', 'Authors:', 'Diagnosis', 'Treatment')):
                    article_data['abstract'] += ' ' + line
                elif current_section == 'diagnosis' and not line.startswith(('Journal:', 'Authors:', 'Abstract:', 'Treatment')):
                    article_data['diagnosis'] += ' ' + line
                elif current_section == 'treatment' and not line.startswith(('Journal:', 'Authors:', 'Abstract:', 'Diagnosis')):
                    article_data['treatment'] += ' ' + line
                elif not any(line.startswith(prefix) for prefix in ['Journal:', 'Authors:', 'Abstract:', 'Diagnosis', 'Treatment']) and not current_section:
                    article_data['title'] = line

            # Clean up data
            for key in article_data:
                article_data[key] = article_data[key].strip()

            if article_data['title']:
                self.articles.append(article_data)

    def basic_statistics(self):
        """Generate basic statistics about the corpus"""
        print("=== BASIC CORPUS STATISTICS ===")
        print(f"Total articles: {len(self.articles)}")

        # Text length statistics
        abstract_lengths = [len(article['abstract']) for article in self.articles if article['abstract']]
        title_lengths = [len(article['title']) for article in self.articles if article['title']]

        print(f"Articles with abstracts: {len(abstract_lengths)}")
        print(f"Average abstract length: {np.mean(abstract_lengths):.1f} characters")
        print(f"Average title length: {np.mean(title_lengths):.1f} characters")

        # Word counts
        abstract_words = [len(article['abstract'].split()) for article in self.articles if article['abstract']]
        print(f"Average abstract word count: {np.mean(abstract_words):.1f} words")

        # Diagnosis and treatment availability
        with_diagnosis = sum(1 for article in self.articles if article['diagnosis'] and article['diagnosis'] != 'Not specified.')
        with_treatment = sum(1 for article in self.articles if article['treatment'])

        print(f"Articles with specific diagnosis: {with_diagnosis} ({with_diagnosis/len(self.articles)*100:.1f}%)")
        print(f"Articles with treatment info: {with_treatment} ({with_treatment/len(self.articles)*100:.1f}%)")

        return {
            'total_articles': len(self.articles),
            'abstract_lengths': abstract_lengths,
            'title_lengths': title_lengths,
            'abstract_words': abstract_words,
            'with_diagnosis': with_diagnosis,
            'with_treatment': with_treatment
        }

    def journal_analysis(self):
        """Analyze journal distribution"""
        print("\n=== JOURNAL ANALYSIS ===")

        journals = [article['journal'] for article in self.articles if article['journal']]
        journal_counts = Counter(journals)

        print(f"Total unique journals: {len(journal_counts)}")
        print("Top 10 journals:")
        for journal, count in journal_counts.most_common(10):
            print(f"  {journal}: {count} articles")

        # Create visualization
        plt.figure(figsize=(12, 8))
        top_journals = dict(journal_counts.most_common(15))
        plt.barh(list(top_journals.keys()), list(top_journals.values()))
        plt.title('Top 15 Journals by Article Count')
        plt.xlabel('Number of Articles')
        plt.tight_layout()
        plt.show()

        return journal_counts

    def author_analysis(self):
        """Analyze author patterns"""
        print("\n=== AUTHOR ANALYSIS ===")

        all_authors = []
        for article in self.articles:
            if article['authors']:
                # Split authors by comma
                authors = [author.strip() for author in article['authors'].split(',')]
                all_authors.extend(authors)

        author_counts = Counter(all_authors)

        print(f"Total unique authors: {len(author_counts)}")
        print(f"Total author instances: {len(all_authors)}")
        print(f"Average authors per article: {len(all_authors)/len(self.articles):.1f}")

        print("Top 10 most prolific authors:")
        for author, count in author_counts.most_common(10):
            print(f"  {author}: {count} articles")

        # Author collaboration network size
        author_counts_per_article = [len(article['authors'].split(',')) for article in self.articles if article['authors']]
        print(f"Average collaboration size: {np.mean(author_counts_per_article):.1f} authors per article")

        return author_counts

    def disease_analysis(self):
        """Analyze disease mentions and patterns"""
        print("\n=== DISEASE AND CONDITION ANALYSIS ===")

        # Common disease terms
        disease_terms = [
            'cancer', 'carcinoma', 'melanoma', 'psoriasis', 'dermatitis', 'eczema',
            'acne', 'rosacea', 'vitiligo', 'lupus', 'scleroderma', 'pemphigus',
            'bullous', 'urticaria', 'mastocytosis', 'lymphoma', 'sarcoma',
            'basal cell', 'squamous cell', 'keratosis', 'mycosis', 'fungal',
            'bacterial', 'viral', 'herpes', 'warts', 'molluscum', 'impetigo'
        ]

        # Count mentions in titles and abstracts
        disease_counts = Counter()

        for article in self.articles:
            text = (article['title'] + ' ' + article['abstract']).lower()
            for term in disease_terms:
                if term in text:
                    disease_counts[term] += 1

        print("Top 15 disease/condition mentions:")
        for disease, count in disease_counts.most_common(15):
            print(f"  {disease}: {count} mentions")

        # Create visualization
        plt.figure(figsize=(12, 8))
        top_diseases = dict(disease_counts.most_common(15))
        plt.barh(list(top_diseases.keys()), list(top_diseases.values()))
        plt.title('Top 15 Disease/Condition Mentions')
        plt.xlabel('Number of Mentions')
        plt.tight_layout()
        plt.show()

        return disease_counts

    def treatment_analysis(self):
        """Analyze treatment patterns"""
        print("\n=== TREATMENT ANALYSIS ===")

        # Common treatment terms
        treatment_terms = [
            'therapy', 'treatment', 'drug', 'medication', 'topical', 'oral',
            'systemic', 'immunosuppressive', 'corticosteroid', 'antibiotic',
            'antifungal', 'antiviral', 'chemotherapy', 'radiotherapy',
            'surgical', 'laser', 'phototherapy', 'immunotherapy', 'biologic',
            'methotrexate', 'cyclosporine', 'tacrolimus', 'rituximab'
        ]

        treatment_counts = Counter()

        for article in self.articles:
            text = (article['treatment'] + ' ' + article['abstract']).lower()
            for term in treatment_terms:
                if term in text:
                    treatment_counts[term] += 1

        print("Top 15 treatment mentions:")
        for treatment, count in treatment_counts.most_common(15):
            print(f"  {treatment}: {count} mentions")

        # Create visualization
        plt.figure(figsize=(12, 8))
        top_treatments = dict(treatment_counts.most_common(15))
        plt.barh(list(top_treatments.keys()), list(top_treatments.values()))
        plt.title('Top 15 Treatment Mentions')
        plt.xlabel('Number of Mentions')
        plt.tight_layout()
        plt.show()

        return treatment_counts

    def keyword_analysis(self):
        """Perform keyword analysis using TF-IDF"""
        print("\n=== KEYWORD ANALYSIS ===")

        # Combine title and abstract for each article
        documents = []
        for article in self.articles:
            doc = article['title'] + ' ' + article['abstract']
            documents.append(doc)

        # TF-IDF analysis
        stop_words = set(stopwords.words('english'))
        stop_words.update(['study', 'research', 'analysis', 'results', 'conclusion', 'background', 'methods'])

        vectorizer = TfidfVectorizer(
            max_features=100,
            stop_words=list(stop_words),
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.8
        )

        tfidf_matrix = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()

        # Get top keywords
        mean_scores = np.mean(tfidf_matrix.toarray(), axis=0)
        top_indices = np.argsort(mean_scores)[::-1][:20]

        print("Top 20 keywords by TF-IDF score:")
        for i, idx in enumerate(top_indices):
            print(f"  {i+1}. {feature_names[idx]}: {mean_scores[idx]:.4f}")

        # Create word cloud
        all_text = ' '.join(documents)
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            stopwords=stop_words,
            max_words=100
        ).generate(all_text)

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Skin Disease Articles')
        plt.tight_layout()
        plt.show()

        return feature_names, mean_scores

    def readability_analysis(self):
        """Analyze text readability"""
        print("\n=== READABILITY ANALYSIS ===")

        flesch_scores = []
        grade_levels = []

        for article in self.articles:
            if article['abstract']:
                try:
                    flesch_score = flesch_reading_ease(article['abstract'])
                    grade_level = flesch_kincaid_grade(article['abstract'])
                    flesch_scores.append(flesch_score)
                    grade_levels.append(grade_level)
                except Exception:
                    continue

        print(f"Average Flesch Reading Ease Score: {np.mean(flesch_scores):.1f}")
        print(f"Average Grade Level: {np.mean(grade_levels):.1f}")

        # Interpretation
        avg_flesch = np.mean(flesch_scores)
        if avg_flesch >= 90:
            difficulty = "Very Easy"
        elif avg_flesch >= 80:
            difficulty = "Easy"
        elif avg_flesch >= 70:
            difficulty = "Fairly Easy"
        elif avg_flesch >= 60:
            difficulty = "Standard"
        elif avg_flesch >= 50:
            difficulty = "Fairly Difficult"
        elif avg_flesch >= 30:
            difficulty = "Difficult"
        else:
            difficulty = "Very Difficult"

        print(f"Reading Difficulty: {difficulty}")

        return flesch_scores, grade_levels

    def generate_summary_report(self):
        """Generate a comprehensive summary report"""
        print("\n" + "="*50)
        print("COMPREHENSIVE EDA SUMMARY REPORT")
        print("="*50)

        # Run all analyses
        basic_stats = self.basic_statistics()
        journal_counts = self.journal_analysis()
        author_counts = self.author_analysis()
        disease_counts = self.disease_analysis()
        treatment_counts = self.treatment_analysis()
        keywords, scores = self.keyword_analysis()
        flesch_scores, grade_levels = self.readability_analysis()

        # Summary insights
        print("\n=== KEY INSIGHTS ===")
        print(f"1. Corpus contains {basic_stats['total_articles']} articles from {len(journal_counts)} unique journals")
        print(f"2. Most common disease area: {disease_counts.most_common(1)[0][0] if disease_counts else 'N/A'}")
        print(f"3. Most common treatment approach: {treatment_counts.most_common(1)[0][0] if treatment_counts else 'N/A'}")
        print(f"4. Average reading level: Grade {np.mean(grade_levels):.1f}")
        print(f"5. {basic_stats['with_diagnosis']} articles have specific diagnosis information")
        print(f"6. {basic_stats['with_treatment']} articles contain treatment information")

def main():
    # Set up plotting style before any figures are drawn,
    # otherwise the style never applies to the report's plots
    plt.style.use('seaborn-v0_8')
    sns.set_palette("husl")

    # Initialize EDA
    eda = SkinDiseaseEDA('skin_disease_articles_clean.txt')

    # Generate comprehensive report
    eda.generate_summary_report()

    print("\n" + "="*50)
    print("EDA ANALYSIS COMPLETE")
    print("="*50)

if __name__ == "__main__":
    main()
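
For quicker iteration than the full report, individual analyses can also be run on their own; a minimal usage sketch (assuming the cleaned corpus produced by cleaning.py exists):

# Run selected analyses without generating every figure
eda = SkinDiseaseEDA("skin_disease_articles_clean.txt")
stats = eda.basic_statistics()   # corpus size, abstract lengths, coverage
eda.keyword_analysis()           # TF-IDF keywords plus word cloud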
extraction.py
ADDED
@@ -0,0 +1,6 @@
from docx import Document

doc = Document("skin_disease_articles.docx")
with open("skin_disease_articles.txt", "w", encoding="utf-8") as f:
    for para in doc.paragraphs:
        f.write(para.text + "\n")
finetune.py
ADDED
@@ -0,0 +1,46 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import os

os.environ["USE_TF"] = "0"

model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Load your text file as a dataset
dataset = load_dataset("text", data_files={"train": "skin_disease_articles_clean.txt"})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

train_dataset = tokenized_datasets["train"]

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned-skin",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=True  # Requires a GPU with float16 support; set False when training on CPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()
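
One caveat: Trainer only writes periodic checkpoints under output_dir, while API.py loads both the model and the tokenizer from "tinyllama-finetuned-skin". A short follow-up sketch (an assumption about the intended workflow, to be run after trainer.train() completes) saves both explicitly where the serving code expects them:

# Persist the final model and the tokenizer for API.py to load
trainer.save_model("tinyllama-finetuned-skin")
tokenizer.save_pretrained("tinyllama-finetuned-skin")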
finetune_tinyllama.py
ADDED
@@ -0,0 +1,39 @@
# Note: TextDataset is deprecated (and removed in recent transformers releases);
# finetune.py above shows the equivalent datasets-based pipeline.
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Prepare dataset
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

train_dataset = load_dataset("skin_disease_articles_clean.txt", tokenizer)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned-skin",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=False  # Set True if using a GPU with float16 support
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()
gradio-app.py
ADDED
@@ -0,0 +1,7 @@
import gradio as gr

def greet(name):
    return "Hello " + name + "!!"

demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()
llama2_inference.py
ADDED
@@ -0,0 +1,13 @@
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"  # Or local path if downloaded
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Example: Use a line from your cleaned file as a prompt
with open("skin_disease_articles_clean.txt", "r", encoding="utf-8") as f:
    prompt = f.readline().strip()

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
requirements.txt
ADDED
@@ -0,0 +1,10 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
transformers==4.35.2
torch>=2.0.0
datasets==2.14.7
accelerate==0.24.1
pydantic==2.5.0
python-multipart==0.0.6
gradio==4.15.0
requests>=2.25.0
xai_analysis.py
ADDED
@@ -0,0 +1,436 @@
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import LlamaTokenizer, LlamaForCausalLM
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import lime
from lime.lime_text import LimeTextExplainer
import shap
import re
import warnings
warnings.filterwarnings('ignore')

class LLMExplainabilityAnalyzer:
    def __init__(self, model_path, tokenizer_path=None):
        """Initialize with model and tokenizer paths"""
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path or model_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Load model and tokenizer
        self.load_model()

        # Initialize explanation tools
        self.lime_explainer = LimeTextExplainer(class_names=['Generated Text'])

    def load_model(self):
        """Load the fine-tuned model and tokenizer"""
        try:
            print(f"Loading model from: {self.model_path}")
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None
            )

            # Set padding token if not exists
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            print("Model loaded successfully!")

        except Exception as e:
            print(f"Error loading model: {e}")
            # Fallback to base model
            print("Loading base TinyLlama model...")
            self.tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
            self.model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

    def extract_attention_weights(self, text, max_length=512):
        """Extract attention weights for visualization"""
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs, output_attentions=True)
            attentions = outputs.attentions

        # Get tokens
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

        return attentions, tokens

    def visualize_attention_heads(self, text, layer_idx=0, head_idx=0, max_length=512):
        """Visualize attention patterns for specific layer and head"""
        attentions, tokens = self.extract_attention_weights(text, max_length)

        # Get attention weights for specific layer and head
        attention_weights = attentions[layer_idx][0, head_idx].cpu().numpy()

        # Create heatmap
        plt.figure(figsize=(12, 8))
        sns.heatmap(
            attention_weights,
            xticklabels=tokens,
            yticklabels=tokens,
            cmap='Blues',
            cbar=True
        )
        plt.title(f'Attention Weights - Layer {layer_idx}, Head {head_idx}')
        plt.xlabel('Key Tokens')
        plt.ylabel('Query Tokens')
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()

        return attention_weights, tokens

    def attention_rollout(self, text, max_length=512):
        """Compute attention rollout for global attention patterns"""
        attentions, tokens = self.extract_attention_weights(text, max_length)

        # Convert to numpy
        attention_matrices = [att[0].mean(dim=0).cpu().numpy() for att in attentions]

        # Compute rollout
        rollout = attention_matrices[0]
        for attention_matrix in attention_matrices[1:]:
            rollout = np.matmul(rollout, attention_matrix)

        # Visualize rollout
        plt.figure(figsize=(12, 8))
        sns.heatmap(
            rollout,
            xticklabels=tokens,
            yticklabels=tokens,
            cmap='Reds',
            cbar=True
        )
        plt.title('Attention Rollout - Global Attention Flow')
        plt.xlabel('Key Tokens')
        plt.ylabel('Query Tokens')
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()

        return rollout, tokens

    def gradient_saliency(self, text, target_token_idx=None, max_length=512):
        """Compute gradient-based saliency maps"""
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
            padding=True
        ).to(self.device)

        # Enable gradients for embeddings
        embeddings = self.model.get_input_embeddings()
        inputs_embeds = embeddings(inputs['input_ids'])
        inputs_embeds.requires_grad_(True)

        # Forward pass
        outputs = self.model(inputs_embeds=inputs_embeds, attention_mask=inputs['attention_mask'])

        # Get target logits (last token if not specified)
        if target_token_idx is None:
            target_token_idx = -1

        target_logits = outputs.logits[0, target_token_idx]
        target_prob = F.softmax(target_logits, dim=-1)

        # Compute gradients
        target_prob.max().backward()

        # Get saliency scores
        saliency_scores = inputs_embeds.grad.norm(dim=-1).squeeze().cpu().numpy()
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

        # Visualize saliency
        plt.figure(figsize=(15, 6))
        colors = plt.cm.Reds(saliency_scores / saliency_scores.max())

        for i, (token, score) in enumerate(zip(tokens, saliency_scores)):
            plt.bar(i, score, color=colors[i])
            plt.text(i, score + 0.001, token, rotation=45, ha='left', va='bottom')

        plt.title('Gradient Saliency Scores')
        plt.xlabel('Token Position')
        plt.ylabel('Saliency Score')
        plt.tight_layout()
        plt.show()

        return saliency_scores, tokens

    def lime_explanation(self, text, num_samples=1000):
        """Generate LIME explanations"""
        def predict_fn(texts):
            """Prediction function for LIME"""
            predictions = []
            for text in texts:
                try:
                    inputs = self.tokenizer(
                        text,
                        return_tensors="pt",
                        max_length=512,
                        truncation=True,
                        padding=True
                    ).to(self.device)

                    with torch.no_grad():
                        outputs = self.model(**inputs)
                        logits = outputs.logits[0, -1]
                        probs = F.softmax(logits, dim=-1)

                    # Return probability distribution
                    predictions.append(probs.cpu().numpy())
                except Exception:
                    # Return uniform distribution if error
                    predictions.append(np.ones(self.tokenizer.vocab_size) / self.tokenizer.vocab_size)

            return np.array(predictions)

        # Generate explanation
        explanation = self.lime_explainer.explain_instance(
            text,
            predict_fn,
            num_features=20,
            num_samples=num_samples
        )

        # Visualize explanation
        explanation.show_in_notebook(text=True)

        return explanation

    def activation_analysis(self, text, layer_indices=None, max_length=512):
        """Analyze hidden layer activations"""
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
            padding=True
        ).to(self.device)

        # Hook to capture activations
        activations = {}

        def hook_fn(name):
            def hook(module, input, output):
                activations[name] = output.detach()
            return hook

        # Register hooks
        if layer_indices is None:
            layer_indices = [0, len(self.model.model.layers)//2, len(self.model.model.layers)-1]

        hooks = []
        for idx in layer_indices:
            if idx < len(self.model.model.layers):
                hook = self.model.model.layers[idx].register_forward_hook(hook_fn(f'layer_{idx}'))
                hooks.append(hook)

        # Forward pass
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Remove hooks
        for hook in hooks:
            hook.remove()

        # Analyze activations
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

        for layer_name, activation in activations.items():
            # Get activation statistics
            activation_np = activation[0].cpu().numpy()

            # Plot activation distribution
            plt.figure(figsize=(12, 6))

            # Heatmap of activations
            plt.subplot(1, 2, 1)
            sns.heatmap(activation_np.T, cmap='viridis', cbar=True)
            plt.title(f'{layer_name} Activations')
            plt.xlabel('Token Position')
            plt.ylabel('Hidden Dimension')

            # Activation magnitude per token
            plt.subplot(1, 2, 2)
            activation_magnitudes = np.linalg.norm(activation_np, axis=1)
            plt.bar(range(len(tokens)), activation_magnitudes)
            plt.title(f'{layer_name} Activation Magnitudes')
            plt.xlabel('Token Position')
            plt.ylabel('Magnitude')
            plt.xticks(range(len(tokens)), tokens, rotation=45)

            plt.tight_layout()
            plt.show()

    def token_importance_analysis(self, text, method='attention', max_length=512):
        """Analyze token importance using different methods"""
        results = {}

        if method == 'attention':
            # Attention-based importance
            attentions, tokens = self.extract_attention_weights(text, max_length)

            # Average attention across layers and heads
            avg_attention = torch.stack([att.mean(dim=1) for att in attentions]).mean(dim=0)
            importance_scores = avg_attention[0].sum(dim=0).cpu().numpy()

        elif method == 'gradient':
            # Gradient-based importance
            importance_scores, tokens = self.gradient_saliency(text, max_length=max_length)

        # Create importance dataframe
        importance_df = pd.DataFrame({
            'token': tokens,
            'importance': importance_scores
        })

        # Sort by importance
        importance_df = importance_df.sort_values('importance', ascending=False)

        # Visualize top important tokens
        plt.figure(figsize=(12, 6))
        top_tokens = importance_df.head(20)
        plt.barh(range(len(top_tokens)), top_tokens['importance'])
        plt.yticks(range(len(top_tokens)), top_tokens['token'])
        plt.title(f'Top 20 Important Tokens ({method.title()} Method)')
        plt.xlabel('Importance Score')
        plt.tight_layout()
        plt.show()

        return importance_df

    def semantic_similarity_analysis(self, texts, max_length=512):
        """Analyze semantic similarity between different texts"""
        embeddings = []

        for text in texts:
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                max_length=max_length,
                truncation=True,
                padding=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True)
                # Use last layer, last token embedding
                embedding = outputs.hidden_states[-1][0, -1].cpu().numpy()
                embeddings.append(embedding)

        # Compute similarity matrix
        similarity_matrix = cosine_similarity(embeddings)

        # Visualize similarity matrix
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            similarity_matrix,
            annot=True,
            cmap='viridis',
            xticklabels=[f'Text {i+1}' for i in range(len(texts))],
            yticklabels=[f'Text {i+1}' for i in range(len(texts))]
        )
        plt.title('Semantic Similarity Matrix')
        plt.tight_layout()
        plt.show()

        return similarity_matrix

    def generate_explanation_report(self, text, output_file='xai_report.html'):
        """Generate comprehensive explanation report"""
        print("Generating comprehensive XAI report...")

        # Run all analyses
        print("1. Extracting attention patterns...")
        attention_weights, tokens = self.visualize_attention_heads(text)

        print("2. Computing attention rollout...")
        rollout, _ = self.attention_rollout(text)

        print("3. Calculating gradient saliency...")
        saliency_scores, _ = self.gradient_saliency(text)

        print("4. Analyzing activations...")
        self.activation_analysis(text)

        print("5. Computing token importance...")
        importance_df = self.token_importance_analysis(text)

        # Create summary
        print("\n=== XAI ANALYSIS SUMMARY ===")
        print(f"Input text: {text[:100]}...")
        print(f"Number of tokens: {len(tokens)}")
        print(f"Most important tokens: {importance_df.head(5)['token'].tolist()}")
        print(f"Average attention entropy: {np.mean(-np.sum(attention_weights * np.log(attention_weights + 1e-10), axis=1)):.4f}")

        return {
            'attention_weights': attention_weights,
            'rollout': rollout,
            'saliency_scores': saliency_scores,
            'importance_df': importance_df,
            'tokens': tokens
        }

def main():
    """Main function to run XAI analysis"""

    # Initialize analyzer (adjust model path as needed)
    try:
        analyzer = LLMExplainabilityAnalyzer("./fine_tuned_model")
    except Exception:
        print("Fine-tuned model not found. Using base model for demonstration.")
        analyzer = LLMExplainabilityAnalyzer("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Sample skin disease text for analysis
    sample_text = """
    Patient presents with erythematous scaly patches on the elbows and knees,
    consistent with psoriasis. The condition appears to be chronic with periods
    of exacerbation. Treatment options include topical corticosteroids and
    phototherapy for mild to moderate cases.
    """

    print("Starting XAI Analysis...")
    print("=" * 50)

    # Generate comprehensive report
    results = analyzer.generate_explanation_report(sample_text)

    # Additional analyses
    print("\n6. Semantic similarity analysis...")
    test_texts = [
        "Psoriasis treatment with topical corticosteroids",
        "Eczema management using moisturizers",
        "Melanoma diagnosis and surgical intervention"
    ]

    similarity_matrix = analyzer.semantic_similarity_analysis(test_texts)

    print("\n" + "=" * 50)
    print("XAI ANALYSIS COMPLETE")
    print("=" * 50)

    return results

if __name__ == "__main__":
    main()
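
A side note on attention_rollout: the implementation above chains raw layer-averaged attention maps, whereas the original rollout formulation (Abnar & Zuidema, 2020) also folds the residual connection into each layer's map before multiplying. A hedged variant, assuming the same attention_matrices list computed inside the method:

import numpy as np

def rollout_with_residual(attention_matrices):
    """Attention rollout with the residual connection folded in:
    each layer's map is averaged with the identity and renormalized
    so rows stay stochastic before being chained across layers."""
    n = attention_matrices[0].shape[-1]
    result = np.eye(n)
    for a in attention_matrices:
        a = 0.5 * a + 0.5 * np.eye(n)          # account for the residual path
        a = a / a.sum(axis=-1, keepdims=True)  # renormalize rows
        result = a @ result
    return result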