import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer


class XGBoostInsuranceModel:
    """
    A Gradient Boosting-based insurance model class with:
      1. Data loading & dropping of unnecessary columns
      2. Dummy-encoding for categorical variables
      3. SimpleImputer for missing data
      4. PolynomialFeatures for interaction terms
      5. Train/validation/test splits and final evaluation
      6. A consistent API with `preprocessing`, `predict`, `postprocessing`
    """

    def __init__(self, csv_path):
        # -----------------------------------------------------
        # 1. Load & prepare the data
        # -----------------------------------------------------
        df = pd.read_csv(csv_path)

        # Drop these columns if present (ignore errors if they're missing)
        df = df.drop(columns=['index', 'PatientID'], errors='ignore')

        # Separate features & target
        X = df.drop(columns=['claim'])
        y = df['claim'].values

        # Discover the categorical columns by running get_dummies once.
        # To replicate these transformations on new data, we must track
        # the full list of dummy columns seen here.
        X_dummies = pd.get_dummies(X, drop_first=True)
        self.all_dummy_cols = X_dummies.columns.tolist()

        # Create a SimpleImputer
        self.imputer = SimpleImputer(strategy='mean')

        # Create a PolynomialFeatures transformer (degree=2, interaction_only=True)
        self.poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

        # -----------------------------------------------------
        # 2. Fit transformations on the training set only
        #    (the initial get_dummies over the full data is used only
        #    to discover all possible dummy columns).
        # -----------------------------------------------------
        # Train/test split on the dummy-encoded features
        X_train_dummies, X_test_dummies, y_train, y_test = train_test_split(
            X_dummies, y, test_size=0.2, random_state=42
        )

        # A second split from the training set to hold out a validation set
        # (the validation split is created but not used by this basic fit)
        X_train_dummies, X_val_dummies, y_train, y_val = train_test_split(
            X_train_dummies, y_train, test_size=0.25, random_state=42
        )

        # Fit the imputer on the training set
        X_train_imputed = self.imputer.fit_transform(X_train_dummies)

        # Fit the polynomial-feature expansion on the training set
        X_train_poly = self.poly.fit_transform(X_train_imputed)

        # -----------------------------------------------------
        # 3. Initialize and train the GradientBoostingRegressor
        # -----------------------------------------------------
        self.model = GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42
        )
        self.model.fit(X_train_poly, y_train)

        # -----------------------------------------------------
        # 4. Evaluate on the test set
        # -----------------------------------------------------
        X_test_imputed = self.imputer.transform(X_test_dummies)
        X_test_poly = self.poly.transform(X_test_imputed)
        y_test_pred = self.model.predict(X_test_poly)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        self.__scores = [test_mae, test_mse, test_r2]
        print(f"[XGBoostInsuranceModel] MAE: {test_mae:.3f} | MSE: {test_mse:.3f} | R^2: {test_r2:.3f}")

    def preprocessing(self, raw_df):
        """
        Takes a new DataFrame with the same columns as the original CSV minus 'claim'.
        Then:
          1. Turns categorical features into dummy columns
          2. Ensures the dummy columns match those used during training
          3. Imputes missing values
          4. Applies the PolynomialFeatures transform

        Returns the transformed data (numpy array).
        """
        # 1. Convert raw_df to dummies; it may be missing some columns or carry extras
        temp_dummies = pd.get_dummies(raw_df, drop_first=True)

        # Ensure it has exactly the same dummy columns as the training data
        for col in self.all_dummy_cols:
            if col not in temp_dummies.columns:
                temp_dummies[col] = 0

        # Drop any extra columns not in self.all_dummy_cols and enforce training order
        temp_dummies = temp_dummies[self.all_dummy_cols]

        # 2. Imputation
        temp_imputed = self.imputer.transform(temp_dummies)

        # 3. Polynomial features
        temp_poly = self.poly.transform(temp_imputed)

        return temp_poly

    def predict(self, preprocessed_data):
        """
        Receives data already output by `preprocessing`.
        Returns predictions in the original scale (no inverse transform needed).
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Currently a pass-through, because 'claim' was never scaled.
        In a different scenario, you might invert-scale predictions here.
        """
        return preds

    def getScores(self):
        """
        Returns a string with the test metrics (MAE, MSE, R^2)
        from the last training run.
        """
        return f"MAE: {self.__scores[0]} | MSE: {self.__scores[1]} | R^2: {self.__scores[2]}"


if __name__ == "__main__":
    # Instantiate, train, and evaluate the model on "cleaned_insurance_data.csv"
    xgb_model = XGBoostInsuranceModel("cleaned_insurance_data.csv")

    # Export the entire class instance for future use
    joblib.dump(xgb_model, "XGBoostInsuranceModel.joblib")
    print("Exported XGBoostInsuranceModel to XGBoostInsuranceModel.joblib")
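    # ----------------------------------------------------------------
    # Optional round-trip sketch: reload the exported instance and score
    # a few rows to confirm the `preprocessing` -> `predict` API works on
    # "new" data. This is illustrative only; it reuses the training CSV
    # (with 'claim' dropped) as a stand-in for unseen records, since the
    # real scoring input would come from elsewhere.
    # ----------------------------------------------------------------
    loaded_model = joblib.load("XGBoostInsuranceModel.joblib")

    # Take a handful of rows and remove target/ID columns before scoring
    sample_df = (
        pd.read_csv("cleaned_insurance_data.csv")
        .drop(columns=['index', 'PatientID', 'claim'], errors='ignore')
        .head(5)
    )

    sample_features = loaded_model.preprocessing(sample_df)  # dummies -> impute -> poly
    sample_preds = loaded_model.predict(sample_features)     # claim amounts, original scale
    print("Sample predictions:", sample_preds)
    print(loaded_model.getScores())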