# preprocessing_all.py
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoModel

# ------------------------
# Load saved artifacts
# ------------------------

scaler = joblib.load("models\scaler_enrollment.pkl")  # StandardScaler for 'Enrollment'
label_encoders = joblib.load("models\label_encoders.pkl")  # Dict of LabelEncoders for categorical columns
unique_attributes = joblib.load("models\study_design_attributes.pkl")  # List of Study Design attributes

def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop_duplicates()

def select_required_columns(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
    return df[required_cols].copy()

def transform_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply a square-root transform to the 'Enrollment' column.
    """
    df['Enrollment'] = np.sqrt(df['Enrollment'] + 1e-6)
    return df

def fill_missing_numerical(df: pd.DataFrame, numerical_cols: list) -> pd.DataFrame:
    """
    Fill missing numerical values with the median of each column.
    """
    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())
    return df

def fill_missing_categorical(df: pd.DataFrame, columns_to_clean: list) -> pd.DataFrame:
    """
    Replace 'Unknown', 'NA', '', ' ' and NaN with 'Unknown' in given categorical columns.
    """
    for col in columns_to_clean:
        df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
        df[col] = df[col].fillna('Unknown')
    return df

def drop_irrelevant_columns(df, columns_to_drop):
    return df.drop(columns=columns_to_drop, errors='ignore')

# ------------------------
# Study Design Parsing
# ------------------------

def parse_study_design(study_design, all_attributes):
    attributes = {attr: "Unknown" for attr in all_attributes}
    if study_design != "Unknown" and pd.notna(study_design):
        for part in study_design.split('|'):
            if ':' in part:
                key, value = part.split(':', 1)
                attributes[key.strip()] = value.strip()
    return attributes
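
# Example (illustrative input, not taken from the dataset):
#   parse_study_design("Allocation: Randomized|Masking: Double",
#                      ["Allocation", "Masking", "Primary Purpose"])
#   -> {"Allocation": "Randomized", "Masking": "Double", "Primary Purpose": "Unknown"}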

def expand_study_design(df, unique_attributes):
    parsed = df['Study Design'].apply(lambda x: parse_study_design(x, unique_attributes))
    study_df = pd.DataFrame(parsed.tolist(), index=df.index)
    df = pd.concat([df, study_df], axis=1)
    df = df.drop(columns=['Study Design'], errors='ignore')
    return df

# ------------------------
# Encoding Categorical Columns
# ------------------------

def encode_categorical(df, label_encoders):
    for col, le in label_encoders.items():
        # Map labels not seen during training to "Unknown" before transforming.
        # Note: this assumes "Unknown" is one of the classes in every saved
        # encoder (i.e. it appeared in the training data for each column).
        df[col] = df[col].map(lambda x: x if x in le.classes_ else "Unknown")
        df[col] = le.transform(df[col])
    return df

def clean_categorical_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and standardize certain categorical columns for inference.
    
    Replaces missing or malformed values with 'Unknown' to match training preprocessing.
    
    Args:
        df (pd.DataFrame): Input dataframe with user data.
        
    Returns:
        pd.DataFrame: DataFrame with cleaned categorical columns.
    """
    columns_to_clean = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']
    
    for col in columns_to_clean:
        # Replace known missing/malformed values with 'Unknown'
        df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
        # Replace actual NaN values with 'Unknown'
        df[col] = df[col].fillna('Unknown')
    
    return df

# ------------------------
# Scaling numeric columns
# ------------------------

def scale_numeric(df, scaler):
    """
    Standardize the 'Enrollment' column using the fitted StandardScaler.
    """
    df['Enrollment'] = scaler.transform(df[['Enrollment']])
    return df

# ------------------------
# Text preprocessing
# ------------------------

def clean_text(text):
    if pd.isna(text):  # Handle missing values
        return ""
    text = text.lower()  # Convert to lowercase
    text = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove special characters
    return ' '.join(text.split())  # Remove extra whitespaces
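
# Example (illustrative):
#   clean_text("Phase-III, Double-Blind Study!")  ->  "phaseiii doubleblind study"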

def preprocess_text_columns(df, text_columns):
    for col in text_columns:
        df[col] = df[col].fillna("No info provided")
        df[col] = df[col].apply(clean_text)
    return df

# ------------------------
# Tokenization of textual Columns
# ------------------------

def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
    """
    Tokenizes multiple textual columns in batches for inference.

    Args:
        df (pd.DataFrame): DataFrame containing textual columns.
        textual_columns (list): List of column names to tokenize.
        tokenizer: HuggingFace tokenizer.
        batch_size (int): Number of samples per batch.
        max_length (int): Maximum token length per sequence.

    Returns:
        dict: Dictionary with column names as keys and tokenized tensors as values.
    """
    def tokenize_in_batches(column_texts):
        tokenized_batches = []
        for i in range(0, len(column_texts), batch_size):
            batch = column_texts[i:i + batch_size].tolist()
            tokenized_batch = tokenizer(
                batch,
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            tokenized_batches.append(tokenized_batch)
        # Combine batches
        return {
            "input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0),
            "attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0)
        }

    tokenized_data = {}
    for col in textual_columns:
        tokenized_data[col] = tokenize_in_batches(df[col])
    return tokenized_data
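
# Each value in the returned dict holds tensors of shape (len(df), max_length),
# e.g. tokenized_data["Brief Summary"]["input_ids"] ("Brief Summary" is an
# illustrative column name, not required by this module).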

# ------------------------
# Extract Embeddings
# ------------------------


def extract_text_embeddings(tokenized_data_dict, model, device=None, batch_size=32, save_to_disk=False):
    """
    Extract embeddings from tokenized textual data using BioBERT.

    Args:
        tokenized_data_dict (dict): Dictionary of tokenized columns (output of `tokenize_text_columns`).
        model (transformers.PreTrainedModel): BioBERT model (without classification head).
        device (torch.device, optional): Device to run the model on. Defaults to GPU if available.
        batch_size (int): Batch size for embedding extraction.
        save_to_disk (bool): Whether to save embeddings as .pt files for each column.

    Returns:
        dict: Dictionary of embeddings for each column.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # Ensure model is in evaluation mode

    embeddings_dict = {}

    for col, tokenized_data in tokenized_data_dict.items():
        print(f"Extracting embeddings for column: {col}")

        input_ids = tokenized_data["input_ids"]
        attention_mask = tokenized_data["attention_mask"]

        dataset = TensorDataset(input_ids, attention_mask)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        all_embeddings = []

        with torch.no_grad():
            for batch in dataloader:
                input_ids_batch, attention_mask_batch = batch
                input_ids_batch = input_ids_batch.to(device)
                attention_mask_batch = attention_mask_batch.to(device)

                outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
                hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_dim]

                # Mean pooling across sequence length
                embeddings = hidden_states.mean(dim=1)
                all_embeddings.append(embeddings.cpu())

        embeddings_col = torch.cat(all_embeddings, dim=0)
        embeddings_dict[col] = embeddings_col

        if save_to_disk:
            torch.save(embeddings_col, f"{col}_embeddings.pt")
            print(f"Saved embeddings for column: {col}")

        print(f"Shape of embeddings for column {col}: {embeddings_col.shape}")

    return embeddings_dict
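
# Note: with a BERT-base-sized encoder such as BioBERT (hidden size 768), each
# entry in embeddings_dict has shape (num_rows, 768) after mean pooling.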

# ------------------------
# Main preprocessing function
# ------------------------

def preprocess(df, required_cols, categorical_cols, columns_to_drop, text_columns,
               tokenizer=None, biobert_model=None, device='cpu'):
    """
    Full preprocessing pipeline.
    
    Args:
        df (pd.DataFrame): Input DataFrame (single row or batch).
        required_cols (list): Columns to select from df.
        categorical_cols (list): Categorical columns to encode.
        columns_to_drop (list): Columns to drop from df.
        text_columns (list): Textual columns to preprocess.
        tokenizer (transformers.AutoTokenizer, optional): BioBERT tokenizer for text.
        biobert_model (transformers.AutoModel, optional): BioBERT model (no classification head).
        device (str): 'cpu' or 'cuda'.
    
    Returns:
        df (pd.DataFrame): Preprocessed tabular DataFrame.
        embeddings (dict or None): Dict of embeddings for text columns, if model provided.
    """
    # Tabular preprocessing
    df = drop_duplicates(df)
    df = select_required_columns(df, required_cols)
    df = transform_numeric(df)
    df = fill_missing_numerical(df, ["Enrollment"])  # median fill for Enrollment
    df = fill_missing_categorical(df, categorical_cols)
    df = drop_irrelevant_columns(df, columns_to_drop)
    df = expand_study_design(df, unique_attributes)
    df = clean_categorical_columns(df)
    df = encode_categorical(df, label_encoders)
    df = scale_numeric(df, scaler)
    df = preprocess_text_columns(df, text_columns)

    embeddings = None
    if tokenizer is not None and biobert_model is not None:
        tokenized_dict = tokenize_text_columns(df, text_columns, tokenizer)
        embeddings = extract_text_embeddings(tokenized_dict, biobert_model, device=device)

    return df, embeddings
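
# ------------------------
# Example usage (illustrative sketch)
# ------------------------
# The column names, input file, and BioBERT checkpoint below are assumptions
# for illustration only; substitute the values used in your training pipeline.

if __name__ == "__main__":
    from transformers import AutoTokenizer

    checkpoint = "dmis-lab/biobert-base-cased-v1.1"  # assumed BioBERT checkpoint
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    biobert_model = AutoModel.from_pretrained(checkpoint)

    raw_df = pd.read_csv("data/new_trials.csv")  # hypothetical input file
    tabular_df, text_embeddings = preprocess(
        raw_df,
        required_cols=["Enrollment", "Study Design", "Sex", "Brief Summary"],  # assumed columns
        categorical_cols=["Sex"],
        columns_to_drop=[],
        text_columns=["Brief Summary"],
        tokenizer=tokenizer,
        biobert_model=biobert_model,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    print(tabular_df.head())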