|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import joblib |
|
|
from sklearn.preprocessing import LabelEncoder, StandardScaler |
|
|
import torch |
|
|
from torch.utils.data import TensorDataset, DataLoader |
|
|
from transformers import AutoModel |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_study_design(study_design, all_attributes):
    """Parse a pipe-delimited ``key: value`` string into an attribute dict.

    The input is split on ``'|'``; each piece containing a ``':'`` is split
    once on the first colon into a key and a value, both stripped of
    surrounding whitespace. Only keys listed in ``all_attributes`` are kept.

    Args:
        study_design: The raw study-design value. Anything that is not a
            non-empty string other than ``"Unknown"`` (e.g. ``None``, NaN,
            ``pd.NA``, numbers) is treated as missing.
        all_attributes: Iterable of attribute names to extract.

    Returns:
        dict: One entry per name in ``all_attributes``; the value is the
        parsed text, or ``"Unknown"`` when the attribute was not found.
    """
    attributes = {attr: "Unknown" for attr in all_attributes}

    # Guard on the type first: truth-testing pd.NA raises TypeError and
    # non-string scalars have no .split(), so only real strings are parsed.
    if (
        not isinstance(study_design, str)
        or not study_design
        or study_design == "Unknown"
    ):
        return attributes

    for part in study_design.split('|'):
        key, sep, value = part.partition(':')
        if not sep:
            # No colon in this piece, so it is not a key/value pair.
            continue
        key = key.strip()
        # The dict keys are exactly the requested attributes, so this is the
        # same membership test as scanning all_attributes, but O(1).
        if key in attributes:
            attributes[key] = value.strip()

    return attributes
|
|
|
|
|
def expand_study_design(df, unique_attributes):
    """Expand the ``'Study Design'`` column into one column per attribute.

    Each value of ``df['Study Design']`` is parsed with
    ``parse_study_design``; the resulting attribute columns are appended to
    the frame and the original ``'Study Design'`` column is dropped.

    Args:
        df (pd.DataFrame): Input frame containing a ``'Study Design'`` column.
        unique_attributes: Iterable of attribute names to extract.

    Returns:
        pd.DataFrame: A new frame with the attribute columns appended, in
        the order given, and without ``'Study Design'``.

    Raises:
        KeyError: If ``df`` has no ``'Study Design'`` column.
    """
    # De-duplicate while keeping order so the explicit column list below
    # matches the keys of the dicts produced by parse_study_design.
    attribute_names = list(dict.fromkeys(unique_attributes))

    parsed_rows = [
        parse_study_design(value, attribute_names)
        for value in df['Study Design']
    ]

    # Passing columns= explicitly guarantees the attribute columns exist even
    # when df has no rows (an empty list of dicts carries no column names).
    study_df = pd.DataFrame(parsed_rows, index=df.index, columns=attribute_names)

    expanded = pd.concat([df, study_df], axis=1)
    return expanded.drop(columns=['Study Design'], errors='ignore')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def encode_categorical(df, label_encoders):
    """Encode categorical columns in place using pre-fitted encoders.

    For every ``(column, encoder)`` pair, values not present in
    ``encoder.classes_`` are replaced by the string ``"Unknown"`` and the
    column is then overwritten with ``encoder.transform(...)``.

    Args:
        df (pd.DataFrame): Frame to encode; modified in place.
        label_encoders (dict): Maps column name to an object exposing
            ``classes_`` and ``transform`` (the file imports sklearn's
            ``LabelEncoder``, presumably the intended type).

    Returns:
        pd.DataFrame: The same ``df`` object, with the listed columns encoded.
    """
    for col, le in label_encoders.items():
        # Build the lookup once per column; testing membership against the
        # classes_ array directly rescans it for every row.
        known_classes = set(le.classes_)

        df[col] = df[col].map(
            lambda value: value if value in known_classes else "Unknown"
        )
        # NOTE(review): transform is expected to fail if "Unknown" itself is
        # not one of the encoder's classes — confirm against training code.
        df[col] = le.transform(df[col])

    return df
|
|
|
|
|
def clean_categorical_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and standardize certain categorical columns for inference.

    For the columns 'Allocation', 'Intervention Model', 'Masking' and
    'Primary Purpose', the placeholder strings 'NA', '' and ' ' as well as
    missing values are replaced with 'Unknown'. A column that is absent
    from the frame is created and filled with 'Unknown'.

    Per the original author's note this is meant to match the training
    preprocessing; that cannot be verified from this file.

    Args:
        df (pd.DataFrame): Input dataframe with user data; modified in place.

    Returns:
        pd.DataFrame: The same dataframe with cleaned categorical columns.
    """
    columns_to_clean = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']
    # 'Unknown' itself needs no replacement, so only true placeholders are listed.
    placeholder_values = ['NA', '', ' ']

    for col in columns_to_clean:
        if col not in df.columns:
            # An absent column is the extreme case of missing data; treat it
            # like any other missing value instead of raising KeyError.
            df[col] = 'Unknown'
            continue

        df[col] = df[col].replace(placeholder_values, 'Unknown').fillna('Unknown')

    return df
|
|
|
|
|
|