Spaces:

cconsti
/

micrograph-training

Runtime error

App Files Files Community

cconsti commited on Mar 16

Commit

bd39841

verified ·

1 Parent(s): 2ec6283

Upload 5 files

Browse files

Files changed (5) hide show

AI Cancer Cell Types App.py +86 -0
AI Cancer Cell Types Dataset.py +109 -0
AI Cancer Cell Types Kaggle.json +1 -0
AI Cancer Cell Types Model.py +155 -0
AI Cancer Cell Types Requirements.txt +11 -0

AI Cancer Cell Types App.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import os
+import torch
+import pytorch_lightning as pl
+from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
+from pytorch_lightning.loggers import TensorBoardLogger
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from torch.utils.data import DataLoader
+from datasets import TrainMicrographDataset, ValidationMicrographDataset, InferenceMicrographDataset
+from model import MicrographCleaner, find_best_model, find_optimal_threshold, prepare_submission
+import gradio as gr
# Kaggle setup: install the API token, then download and unpack the
# competition data into the working directory.
# NOTE: the exit codes of these shell commands are not checked (best-effort,
# as before); a failed download is caught by the file checks below.
os.system("mkdir -p ~/.kaggle")
os.system("cp kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")
os.system("kaggle competitions download -c micrographs-competition")
os.system("unzip -n micrographs-competition.zip")

# Verify data. Explicit raises are used instead of `assert`, because assert
# statements are removed when Python runs with -O and the check would vanish.
if not os.path.isfile("train.csv"):
    raise FileNotFoundError("Error, train.csv not found")
if not os.path.isfile("test.csv"):
    raise FileNotFoundError("Error, test.csv not found")

# Hyperparameters
WINDOW_SIZE = 512  # window size handed to the datasets and to prepare_submission
BATCH_SIZE = 8     # batch size of the training and validation DataLoaders
N_EPOCHS = 20      # upper bound on training epochs (max_epochs of the Trainer)
def train_and_generate_submission():
    """Train the micrograph model and write ``submission.csv``.

    Reads ``train.csv`` and ``test.csv`` from the working directory, holds out
    20% of the training rows for validation, trains a ``MicrographCleaner``
    for at most ``N_EPOCHS`` epochs (early stopping on ``val_iou``), reloads
    the best checkpoint, tunes the binarisation threshold on the validation
    loader and writes the test predictions to ``submission.csv``.

    Returns:
        str: A status message for the Gradio text output.
    """
    # Load data and make a reproducible 80/20 train/validation split.
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

    train_dataset = TrainMicrographDataset(train_df, WINDOW_SIZE)
    val_dataset = ValidationMicrographDataset(val_df, WINDOW_SIZE)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=2)

    model = MicrographCleaner(n_hidden_layers=12, n_kernels=24, kernel_size=5)

    # Logger and callbacks: keep the checkpoint with the highest val_iou and
    # stop when val_iou has not improved for 5 validation rounds.
    logger = TensorBoardLogger('lightning_logs', name='micrograph_cleaner')
    checkpoint_callback = ModelCheckpoint(monitor='val_iou', mode='max', dirpath='checkpoints', filename='best-model')
    early_stop_callback = EarlyStopping(monitor='val_iou', patience=5, mode='max')

    trainer = pl.Trainer(max_epochs=N_EPOCHS, accelerator='auto', devices=1,
                         logger=logger, callbacks=[checkpoint_callback, early_stop_callback])
    trainer.fit(model, train_loader, val_loader)

    # Ask the checkpoint callback for the file it actually wrote. The files
    # are named from filename='best-model', which does not follow the
    # 'micrograph-epoch=..-val_loss=..' naming that find_best_model scans
    # for, so the directory scan is only kept as a fallback.
    best_model_path = checkpoint_callback.best_model_path or find_best_model('checkpoints')
    best_model = MicrographCleaner.load_from_checkpoint(best_model_path)

    optimal_threshold = find_optimal_threshold(best_model, val_loader)

    # prepare_submission accepts (model, test_df, window_size, threshold);
    # the previously passed overlap/post_process/use_tta keywords are not
    # part of that signature and raised a TypeError.
    submission_df = prepare_submission(
        best_model, test_df,
        window_size=WINDOW_SIZE,
        threshold=optimal_threshold,
    )
    submission_df.to_csv('submission.csv', index=False)
    return "submission.csv generated successfully!"
# Gradio front end: no input widgets, a single text output that shows the
# status string returned by train_and_generate_submission.
iface = gr.Interface(
    train_and_generate_submission,
    None,
    "text",
    title="Micrograph Model Trainer & Submission",
    description="Click submit to explicitly train your model and generate submission.csv",
)

# Start serving the interface as soon as the script is executed.
iface.launch()

AI Cancer Cell Types Dataset.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import base64
+import io
+import zlib
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+import torchvision.transforms.v2 as transforms
def decode_array(encoded_base64_str):
    """Turn a base64 string back into the array it encodes.

    The payload is base64-decoded, zlib-decompressed and then read with
    ``numpy.load`` from an in-memory buffer.
    """
    compressed_bytes = base64.b64decode(encoded_base64_str)
    buffer = io.BytesIO(zlib.decompress(compressed_bytes))
    return np.load(buffer)
class BaseMicrographDataset(Dataset):
    """Shared loading helpers for the micrograph datasets.

    Holds the dataframe and the window size, and provides decoding,
    normalisation and padding utilities used by the subclasses.
    """

    def __init__(self, df, window_size: int):
        """Store the dataframe of encoded samples and the window size."""
        self.df = df
        self.window_size = window_size

    def __len__(self):
        """Return one item per dataframe row."""
        return len(self.df)

    def load_and_normalize_image(self, encoded_image: str):
        """Decode an image and rescale it to [0, 1].

        Intensities are stretched between the 2nd and 98th percentile and
        clipped, so outliers do not dominate the range. A 2-D image gets a
        leading axis so the result is 3-D.

        Returns:
            torch.Tensor: the normalised float32 image.
        """
        image = decode_array(encoded_image).astype(np.float32)
        p_low, p_high = np.percentile(image, [2, 98])
        # The epsilon keeps the division finite for constant images.
        image = np.clip((image - p_low) / (p_high - p_low + 1e-8), 0, 1)
        if len(image.shape) == 2:
            image = image[np.newaxis, ...]
        return torch.from_numpy(image)

    def load_mask(self, encoded_mask: str):
        """Decode a mask as float32, adding a leading axis to 2-D masks."""
        mask = decode_array(encoded_mask).astype(np.float32)
        if len(mask.shape) == 2:
            mask = mask[np.newaxis, ...]
        return torch.from_numpy(mask)

    def pad_to_min_size(self, image: torch.Tensor, min_size: int):
        """Pad the bottom/right of a 3-D tensor so H and W are at least ``min_size``.

        Reflect padding is used when possible. Reflect padding requires the
        padding to be smaller than the padded dimension, so for inputs that
        are too small (e.g. height 100 with ``min_size`` 512) replicate
        padding is used instead of raising.

        Returns:
            tuple: ``(padded, (pad_h, pad_w))`` where ``pad_h``/``pad_w`` are
            the number of rows/columns that were appended.
        """
        _, h, w = image.shape
        pad_h = max(0, min_size - h)
        pad_w = max(0, min_size - w)
        mode = "reflect" if (pad_h < h and pad_w < w) else "replicate"
        padded = torch.nn.functional.pad(image, (0, pad_w, 0, pad_h), mode=mode)
        return padded, (pad_h, pad_w)
class TrainMicrographDataset(BaseMicrographDataset):
    """Training dataset yielding randomly augmented (image, mask) crops."""

    def __init__(self, df, window_size: int):
        """Build the geometric (shared) and photometric (image-only) pipelines."""
        super().__init__(df, window_size)
        # Geometric augmentations: applied to image and mask together so the
        # two stay aligned.
        self.shared_transform = transforms.Compose([
            transforms.RandomCrop(window_size),
            transforms.RandomVerticalFlip(p=0.5),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomAffine(degrees=45, translate=(0.15, 0.15), scale=(0.85, 1.15), fill=0)
        ])
        # Photometric augmentations: must only touch the image, never the mask.
        self.image_only_transforms = transforms.Compose([
            transforms.GaussianBlur(7, sigma=(0.1, 2.0)),
            transforms.ColorJitter(brightness=0.3, contrast=0.3),
            transforms.Lambda(lambda x: x + torch.randn_like(x) * 0.05 if np.random.random() < 0.3 else x)
        ])

    def __getitem__(self, idx):
        """Return one augmented ``(image, mask)`` pair for row ``idx``."""
        row = self.df.iloc[idx]
        image = self.load_and_normalize_image(row['image'])
        image, _ = self.pad_to_min_size(image, self.window_size)
        mask = self.load_mask(row['mask'])
        mask, _ = self.pad_to_min_size(mask, self.window_size)
        # Stack along the channel axis so one random draw of the geometric
        # transform is applied identically to image and mask.
        stacked = torch.cat([image, mask], dim=0)
        stacked = self.shared_transform(stacked)
        image, mask = torch.split(stacked, [1, 1], dim=0)
        # The image-only pipeline was constructed but never used; apply it
        # here, after the split, so the mask is left untouched.
        image = self.image_only_transforms(image)
        return image, mask
class ValidationMicrographDataset(BaseMicrographDataset):
    """Validation dataset with five fixed crops per image.

    Crop indices 0-3 are the top-left, top-right, bottom-left and
    bottom-right corners; index 4 is the centre crop.
    """

    def __init__(self, df, window_size: int):
        super().__init__(df, window_size)
        # Number of deterministic crops taken from every image.
        self.n_crops = 5

    def __len__(self):
        """Every dataframe row contributes ``n_crops`` items."""
        return self.n_crops * len(self.df)

    def get_crop_coordinates(self, image_shape, crop_idx):
        """Return the ``(row, column)`` of the top-left corner of a crop."""
        height, width = image_shape
        last_top = height - self.window_size
        last_left = width - self.window_size
        if crop_idx == 4:
            # Centre crop.
            return last_top // 2, last_left // 2
        # Corner crops: indices below 2 sit on the top edge, even indices on
        # the left edge.
        top = last_top if crop_idx >= 2 else 0
        left = last_left if crop_idx % 2 else 0
        return top, left

    def crop_tensors(self, image, mask, h_start, w_start):
        """Cut the same ``window_size`` square out of image and mask."""
        rows = slice(h_start, h_start + self.window_size)
        cols = slice(w_start, w_start + self.window_size)
        return (image[:, rows, cols], mask[:, rows, cols])

    def __getitem__(self, idx):
        """Return crop ``idx % n_crops`` of image ``idx // n_crops``."""
        image_idx, crop_idx = divmod(idx, self.n_crops)
        record = self.df.iloc[image_idx]
        image, _ = self.pad_to_min_size(
            self.load_and_normalize_image(record['image']), self.window_size
        )
        mask, _ = self.pad_to_min_size(
            self.load_mask(record['mask']), self.window_size
        )
        top, left = self.get_crop_coordinates(image.shape[1:], crop_idx)
        return self.crop_tensors(image, mask, top, left)
class InferenceMicrographDataset(BaseMicrographDataset):
    """Inference dataset: whole normalised images plus their id and padding."""

    def __getitem__(self, idx):
        """Return ``(image, id, (pad_h, pad_w))`` for row ``idx``.

        The padding amounts are returned so the caller can trim the
        prediction back to the unpadded size.
        """
        record = self.df.iloc[idx]
        normalized = self.load_and_normalize_image(record['image'])
        padded, padding = self.pad_to_min_size(normalized, self.window_size)
        return padded, record['Id'], padding

AI Cancer Cell Types Kaggle.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"username":"<REDACTED>","key":"<REDACTED>"}

AI Cancer Cell Types Model.py ADDED Viewed

	@@ -0,0 +1,155 @@

+import torch
+import torch.nn as nn
+import pytorch_lightning as pl
+import numpy as np
+import tqdm
+import os
+import re
+from pathlib import Path
+import pandas as pd
+from torch.utils.data import DataLoader
+from datasets import ValidationMicrographDataset, InferenceMicrographDataset
+from scipy import ndimage
+from utils import encode_array, decode_array
+# SimpleCNN explicitly defined
class SimpleCNN(nn.Module):
    """Plain convolutional stack mapping a 1-channel input to a 1-channel map.

    The network is one input stage plus ``n_hidden_layers`` hidden stages,
    each made of a 'same'-padded convolution, a 4-group GroupNorm and a
    PReLU, followed by a 1x1 convolution and a sigmoid.
    """

    def __init__(self, n_hidden_layers, n_kernels, kernel_size):
        super().__init__()

        def conv_stage(in_channels):
            # One conv -> norm -> activation stage with n_kernels outputs.
            return [
                nn.Conv2d(in_channels, n_kernels, kernel_size=kernel_size, padding='same'),
                nn.GroupNorm(4, n_kernels),
                nn.PReLU(),
            ]

        stages = conv_stage(1)
        for _ in range(n_hidden_layers):
            stages += conv_stage(n_kernels)
        # Output head: collapse to one channel and squash into [0, 1].
        head = [nn.Conv2d(n_kernels, 1, kernel_size=1), nn.Sigmoid()]
        self.conv_layers = nn.Sequential(*stages, *head)

    def forward(self, x):
        """Run the input through the whole stack."""
        return self.conv_layers(x)
+# Lightning module wrapper explicitly defined
class MicrographCleaner(pl.LightningModule):
    """Lightning module that trains ``SimpleCNN`` with a combined loss.

    The training objective is ``0.2 * BCE + 0.5 * Dice + 0.3 * focal``;
    validation additionally logs the IoU as ``val_iou``.
    """

    def __init__(self, n_hidden_layers=12, n_kernels=24, kernel_size=5, learning_rate=0.001):
        super().__init__()
        # Stores the constructor arguments in the checkpoint.
        self.save_hyperparameters()
        self.model = SimpleCNN(n_hidden_layers, n_kernels, kernel_size)
        self.lossF = nn.BCELoss()
        self.learning_rate = learning_rate

    def forward(self, x):
        """Return the network output (sigmoid-activated) for ``x``."""
        return self.model(x)

    def dice_loss(self, pred, target):
        """Return ``1 - Dice`` computed over the whole batch.

        ``smooth`` keeps the ratio defined when both tensors are empty.
        """
        smooth = 1.0
        # reshape (not view) so non-contiguous tensors are accepted too.
        pred_flat = pred.reshape(-1)
        target_flat = target.reshape(-1)
        intersection = (pred_flat * target_flat).sum()
        union = pred_flat.sum() + target_flat.sum()
        dice = (2.0 * intersection + smooth) / (union + smooth)
        return 1.0 - dice

    def focal_loss(self, pred, target, alpha=0.8, gamma=2.0):
        """Return the mean alpha-balanced focal loss.

        The BCE term is computed per element (``reduction='none'``) so the
        focal and alpha weights modulate each pixel's own error. Using the
        already-averaged ``self.lossF`` here would reduce the result to a
        scalar BCE times an average weight.
        """
        bce = nn.functional.binary_cross_entropy(pred, target, reduction='none')
        # pt is the probability the model assigns to the true class.
        pt = target * pred + (1 - target) * (1 - pred)
        focal_weight = (1 - pt) ** gamma
        alpha_weight = target * alpha + (1 - target) * (1 - alpha)
        return (focal_weight * alpha_weight * bce).mean()

    def iou_score(self, pred, target, threshold=0.5):
        """Return the IoU of ``pred`` and ``target`` binarised at ``threshold``."""
        pred_binary = (pred > threshold).float()
        target_binary = (target > threshold).float()
        intersection = (pred_binary * target_binary).sum()
        union = pred_binary.sum() + target_binary.sum() - intersection
        # The epsilon yields 1.0 instead of 0/0 when both masks are empty.
        return (intersection + 1e-6) / (union + 1e-6)

    def _combined_loss(self, outputs, masks):
        """Weighted sum of BCE, Dice and focal loss shared by train and val."""
        return (0.2 * self.lossF(outputs, masks) +
                0.5 * self.dice_loss(outputs, masks) +
                0.3 * self.focal_loss(outputs, masks))

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        images, masks = batch
        outputs = self(images)
        loss = self._combined_loss(outputs, masks)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and ``val_iou`` for one validation batch."""
        images, masks = batch
        outputs = self(images)
        loss = self._combined_loss(outputs, masks)
        iou = self.iou_score(outputs, masks)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_iou', iou, prog_bar=True)

    def configure_optimizers(self):
        """Use AdamW with the configured learning rate and 1e-4 weight decay."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=1e-4)
        return optimizer
+# explicitly helper functions related to the model:
def find_best_model(checkpoint_dir: str = "checkpoints") -> str:
    """Return the path of the best checkpoint in ``checkpoint_dir``.

    Checkpoints named ``micrograph-epoch=<n>-val_loss=<x>.ckpt`` are ranked by
    the validation loss in their name and the lowest one wins. If no file
    follows that naming (for example a ``ModelCheckpoint`` configured with
    ``filename='best-model'``), the most recently modified ``*.ckpt`` file is
    returned instead.

    Raises:
        ValueError: if the directory contains no checkpoint file at all.
    """
    pattern = re.compile(r"micrograph-epoch=(\d+)-val_loss=(\d+\.\d+)\.ckpt")
    best_loss = float('inf')
    best_checkpoint = None
    unscored_checkpoints = []
    for filename in os.listdir(checkpoint_dir):
        match = pattern.match(filename)
        if match:
            val_loss = float(match.group(2))
            if val_loss < best_loss:
                best_loss = val_loss
                best_checkpoint = filename
        elif filename.endswith(".ckpt"):
            unscored_checkpoints.append(filename)
    if best_checkpoint is None and unscored_checkpoints:
        # No loss is encoded in the names, so fall back to the newest file.
        best_checkpoint = max(
            unscored_checkpoints,
            key=lambda name: os.path.getmtime(os.path.join(checkpoint_dir, name)),
        )
    if best_checkpoint is None:
        raise ValueError("No valid checkpoint files found")
    return str(Path(checkpoint_dir) / best_checkpoint)
def find_optimal_threshold(model, val_loader, thresholds=np.arange(0.3, 0.7, 0.05)):
    """Pick the binarisation threshold with the highest validation IoU.

    Args:
        model: module whose call returns predictions and which provides
            ``iou_score(pred, target, threshold=...)``.
        val_loader: iterable of ``(images, masks)`` batches.
        thresholds: candidate thresholds; the default array is only read,
            never modified.

    Returns:
        float: the best threshold, or 0.5 if no candidate scores above 0.
    """
    best_iou = 0
    best_threshold = 0.5
    all_preds, all_targets = [], []
    # Run the batches on whatever device the model lives on; a model loaded
    # onto a GPU would otherwise be fed CPU tensors.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, masks in tqdm.tqdm(val_loader):
                if device is not None:
                    images = images.to(device)
                outputs = model(images)
                all_preds.append(outputs.cpu())
                all_targets.append(masks.cpu())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    all_preds = torch.cat(all_preds)
    all_targets = torch.cat(all_targets)
    for threshold in thresholds:
        iou = model.iou_score(all_preds, all_targets, threshold=threshold)
        if iou > best_iou:
            best_iou = iou
            best_threshold = float(threshold)
    return best_threshold
def prepare_submission(model, test_df, window_size, threshold=0.5):
    """Predict a binary mask for every test image and collect them in a frame.

    Each image is padded to at least ``window_size``, passed through the
    model in one piece, trimmed back to its unpadded size, binarised at
    ``threshold`` and encoded with ``encode_array``.

    Returns:
        pandas.DataFrame: one row per image with columns ``Id`` and ``mask``.
    """
    test_dataset = InferenceMicrographDataset(test_df, window_size=window_size)
    predictions = []
    model.eval()
    # Feed the inputs on the model's own device; the dataset yields CPU
    # tensors, which fails for a model that lives on a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    with torch.no_grad():
        for idx in tqdm.tqdm(range(len(test_dataset))):
            image, image_id, (pad_h, pad_w) = test_dataset[idx]
            batch = image.unsqueeze(0)
            if device is not None:
                batch = batch.to(device)
            # Index batch and channel explicitly: a bare squeeze() would also
            # drop a spatial dimension of size 1.
            pred = model(batch)[0, 0].cpu().numpy()
            # Remove the rows/columns that were added by padding.
            if pad_h > 0:
                pred = pred[:-pad_h, :]
            if pad_w > 0:
                pred = pred[:, :-pad_w]
            pred_mask = (pred > threshold).astype(np.uint8)
            encoded_pred = encode_array(pred_mask)
            predictions.append({'Id': image_id, 'mask': encoded_pred})
    submission_df = pd.DataFrame(predictions)
    return submission_df

AI Cancer Cell Types Requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+torch
+torchvision
+numpy
+pandas
+pytorch-lightning
+gradio
+matplotlib
+scipy
+scikit-learn
+tqdm
+kaggle