|
|
|
|
|
import os

import joblib
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

from ..preprocessing.cleaning import (
    drop_duplicates, select_required_columns, transform_numeric,
    fill_missing_numerical, fill_missing_categorical, drop_irrelevant_columns
)
from ..preprocessing.categorical import expand_study_design, encode_categorical, clean_categorical_columns
from ..preprocessing.scaling import scale_numeric
from ..preprocessing.text_processing import preprocess_text_columns, tokenize_text_columns
from ..preprocessing.embeddings import extract_text_embeddings
from ..preprocessing.globals import scaler, label_encoders, unique_attributes
|
|
|
|
|
|
|
|
class Preprocessor:
    """Bundle the column configuration and run the preprocessing pipeline.

    The instance only stores *which* columns to operate on plus an optional
    tokenizer/model pair for text embeddings. The ``scaler``,
    ``label_encoders`` and ``unique_attributes`` objects used by
    :meth:`transform` are module-level globals imported from
    ``preprocessing.globals``; they are not attributes of the instance and
    therefore are not part of what :meth:`save` writes.
    """

    # Columns imputed by ``fill_missing_numerical`` when none are configured.
    _DEFAULT_NUMERICAL_COLS = ("Enrollment",)

    # Class-level fallback so instances unpickled from files written before
    # ``numerical_cols`` existed still resolve the attribute.
    numerical_cols = _DEFAULT_NUMERICAL_COLS

    def __init__(self, required_cols, categorical_cols, columns_to_drop, text_columns,
                 tokenizer=None, biobert_model=None, device="cpu",
                 numerical_cols=None):
        """Store the pipeline configuration.

        Args:
            required_cols: Passed to ``select_required_columns``.
            categorical_cols: Passed to ``fill_missing_categorical``.
            columns_to_drop: Passed to ``drop_irrelevant_columns``.
            text_columns: Passed to ``preprocess_text_columns`` and
                ``tokenize_text_columns``.
            tokenizer: Optional tokenizer; embeddings are only computed when
                both ``tokenizer`` and ``biobert_model`` are given.
            biobert_model: Optional model handed to ``extract_text_embeddings``.
            device: Device argument forwarded to ``extract_text_embeddings``.
            numerical_cols: Columns passed to ``fill_missing_numerical``.
                ``None`` keeps the previous behaviour of ``["Enrollment"]``.
                A single column name given as a string is also accepted.
        """
        self.required_cols = required_cols
        self.categorical_cols = categorical_cols
        self.columns_to_drop = columns_to_drop
        self.text_columns = text_columns
        self.tokenizer = tokenizer
        self.biobert_model = biobert_model
        self.device = device

        if numerical_cols is None:
            numerical_cols = self._DEFAULT_NUMERICAL_COLS
        elif isinstance(numerical_cols, str):
            # A bare string would otherwise be split into characters by list().
            numerical_cols = [numerical_cols]
        self.numerical_cols = list(numerical_cols)

    def transform(self, df: pd.DataFrame):
        """Run full preprocessing on a dataframe.

        The helper steps are applied in a fixed order; each receives the
        result of the previous one.

        Args:
            df: Input dataframe.

        Returns:
            A ``(df, embeddings)`` tuple. ``embeddings`` is the result of
            ``extract_text_embeddings`` when both a tokenizer and a model were
            configured, otherwise ``None``.
        """
        df = drop_duplicates(df)
        df = select_required_columns(df, self.required_cols)
        df = transform_numeric(df)
        # Hand over a fresh list on every call, as the original literal did.
        df = fill_missing_numerical(df, list(self.numerical_cols))
        df = fill_missing_categorical(df, self.categorical_cols)
        df = expand_study_design(df, unique_attributes)
        df = drop_irrelevant_columns(df, self.columns_to_drop)
        df = clean_categorical_columns(df)
        df = encode_categorical(df, label_encoders)
        df = scale_numeric(df, scaler)
        df = preprocess_text_columns(df, self.text_columns)

        embeddings = None
        if self.tokenizer is not None and self.biobert_model is not None:
            tokenized_dict = tokenize_text_columns(df, self.text_columns, self.tokenizer)
            embeddings = extract_text_embeddings(
                tokenized_dict,
                self.biobert_model,
                device=self.device,
            )

        return df, embeddings

    def save(self, path="models/preprocessor.pkl"):
        """Save preprocessor object.

        The whole instance is serialized with ``joblib.dump``, including any
        tokenizer/model attached to it.

        Args:
            path: Destination file name, path-like object, or open file
                object. For names and path-like objects the parent directory
                is created if it does not exist yet.
        """
        # File objects are passed straight through; only real paths have a
        # parent directory that may need creating.
        if isinstance(path, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(path))
            if parent:
                os.makedirs(parent, exist_ok=True)
        joblib.dump(self, path)

    @staticmethod
    def load(path="models/preprocessor.pkl"):
        """Load preprocessor object.

        Args:
            path: File previously written by :meth:`save`.

        Returns:
            Whatever object ``joblib.load`` reconstructs from ``path``.
        """
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code while doing so. Only load files from a trusted source.
        return joblib.load(path)
|
|
|