import os
import pickle

import kagglehub
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler


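# Encoder: one-hot encodes the categorical feature columns and label-encodes the
# target column, returning a single DataFrame with the encoded target appended.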
class Encoder(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_columns, target_column):
        self.categorical_columns = categorical_columns
        self.target_column = target_column
        self.ohe = OneHotEncoder(sparse_output=False)
        self.le = LabelEncoder()
        self.encoded_feature_names = []

    def fit(self, X, y=None):
        self.ohe.fit(X[self.categorical_columns])
        self.le.fit(X[self.target_column])
        self.encoded_feature_names = self.ohe.get_feature_names_out(self.categorical_columns).tolist()
        return self

    def transform(self, X):
        encoded = self.ohe.transform(X[self.categorical_columns])

        encoded_df = pd.DataFrame(
            encoded,
            columns=self.encoded_feature_names,
            index=X.index
        )

        result = pd.concat([
            X.drop(self.categorical_columns + [self.target_column], axis=1),
            encoded_df
        ], axis=1)
        result[self.target_column] = self.le.transform(X[self.target_column])
        return result


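# FeatureSelector: keeps the num_k numeric features most correlated (in absolute
# value) with the target, plus the cat_k one-hot columns with the highest
# chi-squared scores against the target.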
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, numeric_features, encoded_features, target_column, num_k=5, cat_k=5):
        """
        :param numeric_features: List of numeric feature names
        :param encoded_features: List of encoded feature names
        :param target_column: Target column name
        :param num_k: Number of top numeric features to select
        :param cat_k: Number of top encoded features to select
        """
        self.numeric_features = numeric_features
        self.encoded_features = encoded_features
        self.target_column = target_column
        self.num_k = num_k
        self.cat_k = cat_k
        self.chi2_selector = None
        self.numeric_selector = None

    def fit(self, X, y=None):
        self.numeric_selector = (
            X[self.numeric_features]
            .corrwith(X[self.target_column])
            .abs()
            .nlargest(self.num_k)
            .index.tolist()
        )

        X_encoded = X[self.encoded_features]
        y = X[self.target_column]

        self.chi2_selector = SelectKBest(chi2, k=self.cat_k).fit(X_encoded, y)
        return self

    def transform(self, X):
        X_selected_num = X[self.numeric_selector]
        y = X[self.target_column]

        X_encoded = X[self.encoded_features]
        X_selected_cat = pd.DataFrame(
            self.chi2_selector.transform(X_encoded),
            columns=self.chi2_selector.get_feature_names_out(),
            index=X.index
        )

        return pd.concat([X_selected_num, X_selected_cat, y], axis=1)


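# Splitter: separates the target column and returns a train/test split as a
# 4-tuple (X_train, X_test, y_train, y_test).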
class Splitter(BaseEstimator, TransformerMixin):
    def __init__(self, target_column, test_size=0.3, random_state=42):
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        y = X[self.target_column]
        X = X.drop(self.target_column, axis=1)
        return tuple(train_test_split(X, y, test_size=self.test_size, random_state=self.random_state))


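# Scaler: fits the scaler on the training features only and applies the fitted
# scaler to the test features, avoiding leakage of test statistics.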
class Scaler(BaseEstimator, TransformerMixin):
    def __init__(self, scaler_type='standard'):
        self.scaler = StandardScaler() if scaler_type == 'standard' else MinMaxScaler()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, tuple) and len(X) == 4:
            X_train, X_test, y_train, y_test = X
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)
            return X_train_scaled, X_test_scaled, y_train, y_test
        else:
            return self.scaler.fit_transform(X)


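# FullPipeline: chains Encoder -> FeatureSelector -> Splitter -> Scaler and
# pickles each fitted step so it can be reused later for inference.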
class FullPipeline:
    def __init__(self, categorical_columns, target_column, numeric_features, num_k=5, cat_k=5):
        self.encoder = Encoder(categorical_columns, target_column)
        self.feature_selector = None
        self.splitter = Splitter(target_column)
        self.scaler = Scaler()
        self.numeric_features = numeric_features
        self.num_k = num_k
        self.cat_k = cat_k

    def fit_transform(self, X):
        X = self.encoder.fit_transform(X)
        with open("encoder.pkl", "wb") as f:
            pickle.dump(self.encoder, f)

        self.feature_selector = FeatureSelector(
            numeric_features=self.numeric_features,
            encoded_features=self.encoder.encoded_feature_names,
            target_column=self.encoder.target_column,
            num_k=self.num_k, cat_k=self.cat_k
        )
        X = self.feature_selector.fit_transform(X)
        with open("feature_selector.pkl", "wb") as f:
            pickle.dump(self.feature_selector, f)

        X_train, X_test, y_train, y_test = self.splitter.transform(X)
        with open("splitter.pkl", "wb") as f:
            pickle.dump(self.splitter, f)

        X_train_scaled, X_test_scaled, y_train, y_test = self.scaler.transform((X_train, X_test, y_train, y_test))
        with open("scaler.pkl", "wb") as f:
            pickle.dump(self.scaler, f)

        return (X_train_scaled, X_test_scaled, y_train, y_test)


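# Entry point: downloads the Kaggle heart-failure dataset, runs the full
# preprocessing pipeline, and pickles the resulting train/test arrays.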
def main():
    path = kagglehub.dataset_download("fedesoriano/heart-failure-prediction")
    # os.path.join keeps the dataset path portable across operating systems.
    df = pd.read_csv(os.path.join(path, "heart.csv"))

    df.drop_duplicates(inplace=True)

    pipeline = FullPipeline(
        categorical_columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
        target_column='HeartDisease',
        numeric_features=['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'],
        num_k=3,
        cat_k=3
    )

    X_train, X_test, y_train, y_test = pipeline.fit_transform(df)

    with open("transformed_data.pkl", "wb") as f:
        pickle.dump((X_train, X_test, y_train, y_test), f)


if __name__ == "__main__":
    main()