# preprocessing_all.py
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoModel

# ------------------------
# Load saved artifacts
# ------------------------
scaler = joblib.load("models/scaler_enrollment.pkl")                   # StandardScaler for 'Enrollment'
label_encoders = joblib.load("models/label_encoders.pkl")              # Dict of LabelEncoders for categorical columns
unique_attributes = joblib.load("models/study_design_attributes.pkl")  # List of Study Design attributes


def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop_duplicates()


def select_required_columns(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
    return df[required_cols].copy()


def transform_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply a sqrt transform to the 'Enrollment' column.
    """
    df['Enrollment'] = np.sqrt(df['Enrollment'] + 1e-6)
    return df


def fill_missing_numerical(df: pd.DataFrame, numerical_cols: list) -> pd.DataFrame:
    """
    Fill missing numerical values with the median of each column.
    """
    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())
    return df


def fill_missing_categorical(df: pd.DataFrame, columns_to_clean: list) -> pd.DataFrame:
    """
    Replace 'Unknown', 'NA', '', ' ' and NaN with 'Unknown' in the given categorical columns.
    """
    for col in columns_to_clean:
        df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
        df[col] = df[col].fillna('Unknown')
    return df


def drop_irrelevant_columns(df, columns_to_drop):
    return df.drop(columns=columns_to_drop, errors='ignore')


# ------------------------
# Study Design Parsing
# ------------------------
def parse_study_design(study_design, all_attributes):
    attributes = {attr: "Unknown" for attr in all_attributes}
    if study_design != "Unknown" and pd.notna(study_design):
        for part in study_design.split('|'):
            if ':' in part:
                key, value = part.split(':', 1)
                attributes[key.strip()] = value.strip()
    return attributes


def expand_study_design(df, unique_attributes):
    parsed = df['Study Design'].apply(lambda x: parse_study_design(x, unique_attributes))
    study_df = pd.DataFrame(parsed.tolist(), index=df.index)
    df = pd.concat([df, study_df], axis=1)
    df = df.drop(columns=['Study Design'], errors='ignore')
    return df


# ------------------------
# Encoding Categorical Columns
# ------------------------
def encode_categorical(df, label_encoders):
    for col, le in label_encoders.items():
        # Transform using the saved encoder; map unseen labels to 'Unknown'
        df[col] = df[col].map(lambda x: x if x in le.classes_ else "Unknown")
        df[col] = le.transform(df[col])
    return df


def clean_categorical_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and standardize certain categorical columns for inference.
    Replaces missing or malformed values with 'Unknown' to match training preprocessing.

    Args:
        df (pd.DataFrame): Input dataframe with user data.

    Returns:
        pd.DataFrame: DataFrame with cleaned categorical columns.
    """
    columns_to_clean = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']
    for col in columns_to_clean:
        # Replace known missing/malformed values with 'Unknown'
        df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
        # Replace actual NaN values with 'Unknown'
        df[col] = df[col].fillna('Unknown')
    return df


# ------------------------
# Scaling numeric columns
# ------------------------
def scale_numeric(df, scaler):
    """
    Standardize numerical columns using the fitted StandardScaler.
    """
    df['Enrollment'] = scaler.transform(df[['Enrollment']])
    return df
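
# ------------------------
# Illustrative sketch (not part of the pipeline)
# ------------------------
# A minimal, hedged example of how encode_categorical handles labels the saved
# encoders never saw: they are mapped to "Unknown" before transforming, which
# assumes "Unknown" was among the classes when the encoder was fitted during
# training. The column name and values below are made up for illustration only.
def _example_encode_unseen_label():
    demo_encoder = LabelEncoder().fit(["Phase 1", "Phase 2", "Unknown"])
    demo_df = pd.DataFrame({"Phases": ["Phase 2", "Phase 9"]})  # "Phase 9" is unseen
    # "Phase 9" falls back to "Unknown", then both values become integer codes.
    return encode_categorical(demo_df, {"Phases": demo_encoder})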
""" df['Enrollment'] = scaler.transform(df[['Enrollment']]) return df # ------------------------ # Text preprocessing # ------------------------ def clean_text(text): if pd.isna(text): # Handle missing values return "" text = text.lower() # Convert to lowercase text = ''.join(char for char in text if char.isalnum() or char.isspace()) # Remove special characters return ' '.join(text.split()) # Remove extra whitespaces def preprocess_text_columns(df, text_columns): for col in text_columns: df[col] = df[col].fillna("No info provided") df[col] = df[col].apply(clean_text) return df # ------------------------ # Tokenization of textual Columns # ------------------------ def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256): """ Tokenizes multiple textual columns in batches for inference. Args: df (pd.DataFrame): DataFrame containing textual columns. textual_columns (list): List of column names to tokenize. tokenizer: HuggingFace tokenizer. batch_size (int): Number of samples per batch. max_length (int): Maximum token length per sequence. Returns: dict: Dictionary with column names as keys and tokenized tensors as values. """ def tokenize_in_batches(column_texts): tokenized_batches = [] for i in range(0, len(column_texts), batch_size): batch = column_texts[i:i + batch_size].tolist() tokenized_batch = tokenizer( batch, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt" ) tokenized_batches.append(tokenized_batch) # Combine batches return { "input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0), "attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0) } tokenized_data = {} for col in textual_columns: tokenized_data[col] = tokenize_in_batches(df[col]) return tokenized_data # ------------------------ # Extract Embeddings # ------------------------ def extract_text_embeddings(tokenized_data_dict, model, device=None, batch_size=32, save_to_disk=False): """ Extract embeddings from tokenized textual data using BioBERT. Args: tokenized_data_dict (dict): Dictionary of tokenized columns (output of `tokenize_text_columns`). model (transformers.PreTrainedModel): BioBERT model (without classification head). device (torch.device, optional): Device to run the model on. Defaults to GPU if available. batch_size (int): Batch size for embedding extraction. save_to_disk (bool): Whether to save embeddings as .pt files for each column. Returns: dict: Dictionary of embeddings for each column. 
""" if device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) model.eval() # Ensure model is in evaluation mode embeddings_dict = {} for col, tokenized_data in tokenized_data_dict.items(): print(f"Extracting embeddings for column: {col}") input_ids = tokenized_data["input_ids"] attention_mask = tokenized_data["attention_mask"] dataset = TensorDataset(input_ids, attention_mask) dataloader = DataLoader(dataset, batch_size=batch_size) all_embeddings = [] with torch.no_grad(): for batch in dataloader: input_ids_batch, attention_mask_batch = batch input_ids_batch = input_ids_batch.to(device) attention_mask_batch = attention_mask_batch.to(device) outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch) hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_dim] # Mean pooling across sequence length embeddings = hidden_states.mean(dim=1) all_embeddings.append(embeddings.cpu()) embeddings_col = torch.cat(all_embeddings, dim=0) embeddings_dict[col] = embeddings_col if save_to_disk: torch.save(embeddings_col, f"{col}_embeddings.pt") print(f"Saved embeddings for column: {col}") print(f"Shape of embeddings for column {col}: {embeddings_col.shape}") return embeddings_dict # ------------------------ # Main preprocessing function # ------------------------ def preprocess(df, required_cols, categorical_cols, columns_to_drop, text_columns, tokenizer=None, biobert_model=None, device='cpu'): """ Full preprocessing pipeline. Args: df (pd.DataFrame): Input DataFrame (single row or batch). required_cols (list): Columns to select from df. categorical_cols (list): Categorical columns to encode. columns_to_drop (list): Columns to drop from df. text_columns (list): Textual columns to preprocess. tokenizer (transformers.AutoTokenizer, optional): BioBERT tokenizer for text. biobert_model (transformers.AutoModel, optional): BioBERT model (no classification head). device (str): 'cpu' or 'cuda'. Returns: df (pd.DataFrame): Preprocessed tabular DataFrame. embeddings (dict or None): Dict of embeddings for text columns, if model provided. """ # Tabular preprocessing df = drop_duplicates(df) df = select_required_columns(df, required_cols) df = transform_numeric(df) df = fill_missing_numerical(df, ["Enrollment"]) # median fill for Enrollment df = fill_missing_categorical(df, categorical_cols) df = drop_irrelevant_columns(df, columns_to_drop) df = expand_study_design(df, unique_attributes) df = clean_categorical_columns(df) df = encode_categorical(df, label_encoders) df = scale_numeric(df, scaler) df = preprocess_text_columns(df, text_columns) embeddings = None if tokenizer is not None and biobert_model is not None: tokenized_dict = tokenize_text_columns(df, text_columns, tokenizer) embeddings = extract_text_embeddings(tokenized_dict, biobert_model, device=device) return df, embeddings