# preprocessing_all.py
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoModel

# ------------------------
# Load saved artifacts
# ------------------------

scaler = joblib.load("models\scaler_enrollment.pkl")  # StandardScaler for 'Enrollment'
label_encoders = joblib.load("models\label_encoders.pkl")  # Dict of LabelEncoders for categorical columns
unique_attributes = joblib.load("models\study_design_attributes.pkl")  # List of Study Design attributes

def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop_duplicates()

def select_required_columns(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
    return df[required_cols].copy()

def transform_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply a square-root transform to the 'Enrollment' column.
    """
    df['Enrollment'] = np.sqrt(df['Enrollment'] + 1e-6)
    return df

def fill_missing_numerical(df: pd.DataFrame, numerical_cols: list) -> pd.DataFrame:
    """
    Fill missing numerical values with the median of each column.
    """
    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())
    return df

def fill_missing_categorical(df: pd.DataFrame, columns_to_clean: list) -> pd.DataFrame:
    """
    Replace 'Unknown', 'NA', '', ' ' and NaN with 'Unknown' in given categorical columns.
    """
    for col in columns_to_clean:
        df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
        df[col] = df[col].fillna('Unknown')
    return df

def drop_irrelevant_columns(df, columns_to_drop):
    return df.drop(columns=columns_to_drop, errors='ignore')

# ------------------------
# Study Design Parsing
# ------------------------

def parse_study_design(study_design, all_attributes):
    attributes = {attr: "Unknown" for attr in all_attributes}
    if study_design != "Unknown" and pd.notna(study_design):
        for part in study_design.split('|'):
            if ':' in part:
                key, value = part.split(':', 1)
                attributes[key.strip()] = value.strip()
    return attributes
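
# Example (illustrative input, not taken from the dataset):
#   parse_study_design("Allocation: Randomized|Masking: Double",
#                      ["Allocation", "Masking", "Primary Purpose"])
#   -> {"Allocation": "Randomized", "Masking": "Double", "Primary Purpose": "Unknown"}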

def expand_study_design(df, unique_attributes):
    parsed = df['Study Design'].apply(lambda x: parse_study_design(x, unique_attributes))
    study_df = pd.DataFrame(parsed.tolist(), index=df.index)
    df = pd.concat([df, study_df], axis=1)
    df = df.drop(columns=['Study Design'], errors='ignore')
    return df

# ------------------------
# Encoding Categorical Columns
# ------------------------

def encode_categorical(df, label_encoders):
    for col, le in label_encoders.items():
        # Map labels not seen during training to "Unknown" before transforming.
        # Note: this assumes "Unknown" is one of the classes in every saved
        # encoder (i.e. it appeared in the training data for each column).
        df[col] = df[col].map(lambda x: x if x in le.classes_ else "Unknown")
        df[col] = le.transform(df[col])
    return df

def clean_categorical_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and standardize certain categorical columns for inference.
    
    Replaces missing or malformed values with 'Unknown' to match training preprocessing.
    
    Args:
        df (pd.DataFrame): Input dataframe with user data.
        
    Returns:
        pd.DataFrame: DataFrame with cleaned categorical columns.
    """
    columns_to_clean = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']
    
    for col in columns_to_clean:
        # Replace known missing/malformed values with 'Unknown'
        df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
        # Replace actual NaN values with 'Unknown'
        df[col] = df[col].fillna('Unknown')
    
    return df

# ------------------------
# Scaling numeric columns
# ------------------------

def scale_numeric(df, scaler):
    """
    Standardize the 'Enrollment' column using the fitted StandardScaler.
    """
    df['Enrollment'] = scaler.transform(df[['Enrollment']])
    return df

# ------------------------
# Text preprocessing
# ------------------------

def clean_text(text):
    if pd.isna(text):  # Handle missing values
        return ""
    text = text.lower()  # Convert to lowercase
    text = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove special characters
    return ' '.join(text.split())  # Remove extra whitespaces
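
# Example (illustrative):
#   clean_text("Phase-III, Double-Blind Study!")  ->  "phaseiii doubleblind study"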

def preprocess_text_columns(df, text_columns):
    for col in text_columns:
        df[col] = df[col].fillna("No info provided")
        df[col] = df[col].apply(clean_text)
    return df

# ------------------------
# Tokenization of textual Columns
# ------------------------

def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
    """
    Tokenizes multiple textual columns in batches for inference.

    Args:
        df (pd.DataFrame): DataFrame containing textual columns.
        textual_columns (list): List of column names to tokenize.
        tokenizer: HuggingFace tokenizer.
        batch_size (int): Number of samples per batch.
        max_length (int): Maximum token length per sequence.

    Returns:
        dict: Dictionary with column names as keys and tokenized tensors as values.
    """
    def tokenize_in_batches(column_texts):
        tokenized_batches = []
        for i in range(0, len(column_texts), batch_size):
            batch = column_texts[i:i + batch_size].tolist()
            tokenized_batch = tokenizer(
                batch,
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            tokenized_batches.append(tokenized_batch)
        # Combine batches
        return {
            "input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0),
            "attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0)
        }

    tokenized_data = {}
    for col in textual_columns:
        tokenized_data[col] = tokenize_in_batches(df[col])
    return tokenized_data
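
# Each value in the returned dict holds tensors of shape (len(df), max_length),
# e.g. tokenized_data["Brief Summary"]["input_ids"] ("Brief Summary" is an
# illustrative column name, not required by this module).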

# ------------------------
# Extract Embeddings
# ------------------------


def extract_text_embeddings(tokenized_data_dict, model, device=None, batch_size=32, save_to_disk=False):
    """
    Extract embeddings from tokenized textual data using BioBERT.

    Args:
        tokenized_data_dict (dict): Dictionary of tokenized columns (output of `tokenize_text_columns`).
        model (transformers.PreTrainedModel): BioBERT model (without classification head).
        device (torch.device, optional): Device to run the model on. Defaults to GPU if available.
        batch_size (int): Batch size for embedding extraction.
        save_to_disk (bool): Whether to save embeddings as .pt files for each column.

    Returns:
        dict: Dictionary of embeddings for each column.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # Ensure model is in evaluation mode

    embeddings_dict = {}

    for col, tokenized_data in tokenized_data_dict.items():
        print(f"Extracting embeddings for column: {col}")

        input_ids = tokenized_data["input_ids"]
        attention_mask = tokenized_data["attention_mask"]

        dataset = TensorDataset(input_ids, attention_mask)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        all_embeddings = []

        with torch.no_grad():
            for batch in dataloader:
                input_ids_batch, attention_mask_batch = batch
                input_ids_batch = input_ids_batch.to(device)
                attention_mask_batch = attention_mask_batch.to(device)

                outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
                hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_dim]

                # Mean pooling across sequence length
                embeddings = hidden_states.mean(dim=1)
                all_embeddings.append(embeddings.cpu())

        embeddings_col = torch.cat(all_embeddings, dim=0)
        embeddings_dict[col] = embeddings_col

        if save_to_disk:
            torch.save(embeddings_col, f"{col}_embeddings.pt")
            print(f"Saved embeddings for column: {col}")

        print(f"Shape of embeddings for column {col}: {embeddings_col.shape}")

    return embeddings_dict
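
# Note: with a BERT-base-sized encoder such as BioBERT (hidden size 768), each
# entry in embeddings_dict has shape (num_rows, 768) after mean pooling.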

# ------------------------
# Main preprocessing function
# ------------------------

def preprocess(df, required_cols, categorical_cols, columns_to_drop, text_columns,
               tokenizer=None, biobert_model=None, device='cpu'):
    """
    Full preprocessing pipeline.
    
    Args:
        df (pd.DataFrame): Input DataFrame (single row or batch).
        required_cols (list): Columns to select from df.
        categorical_cols (list): Categorical columns to encode.
        columns_to_drop (list): Columns to drop from df.
        text_columns (list): Textual columns to preprocess.
        tokenizer (transformers.AutoTokenizer, optional): BioBERT tokenizer for text.
        biobert_model (transformers.AutoModel, optional): BioBERT model (no classification head).
        device (str): 'cpu' or 'cuda'.
    
    Returns:
        df (pd.DataFrame): Preprocessed tabular DataFrame.
        embeddings (dict or None): Dict of embeddings for text columns, if model provided.
    """
    # Tabular preprocessing
    df = drop_duplicates(df)
    df = select_required_columns(df, required_cols)
    df = transform_numeric(df)
    df = fill_missing_numerical(df, ["Enrollment"])  # median fill for Enrollment
    df = fill_missing_categorical(df, categorical_cols)
    df = drop_irrelevant_columns(df, columns_to_drop)
    df = expand_study_design(df, unique_attributes)
    df = clean_categorical_columns(df)
    df = encode_categorical(df, label_encoders)
    df = scale_numeric(df, scaler)
    df = preprocess_text_columns(df, text_columns)

    embeddings = None
    if tokenizer is not None and biobert_model is not None:
        tokenized_dict = tokenize_text_columns(df, text_columns, tokenizer)
        embeddings = extract_text_embeddings(tokenized_dict, biobert_model, device=device)

    return df, embeddings
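
# ------------------------
# Example usage (illustrative sketch)
# ------------------------
# The column names, input file, and BioBERT checkpoint below are assumptions
# for illustration only; substitute the values used in your training pipeline.

if __name__ == "__main__":
    from transformers import AutoTokenizer

    checkpoint = "dmis-lab/biobert-base-cased-v1.1"  # assumed BioBERT checkpoint
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    biobert_model = AutoModel.from_pretrained(checkpoint)

    raw_df = pd.read_csv("data/new_trials.csv")  # hypothetical input file
    tabular_df, text_embeddings = preprocess(
        raw_df,
        required_cols=["Enrollment", "Study Design", "Sex", "Brief Summary"],  # assumed columns
        categorical_cols=["Sex"],
        columns_to_drop=[],
        text_columns=["Brief Summary"],
        tokenizer=tokenizer,
        biobert_model=biobert_model,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    print(tabular_df.head())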