"""Train, evaluate and export a NuSVR regressor for the 'claim' target."""

import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)
from sklearn.svm import NuSVR


class NuSVRInsuranceModel:
    """Bundle feature preprocessing, a NuSVR regressor and target rescaling.

    The wrapper owns three collaborating objects:

    1. ``ct`` -- a ColumnTransformer that encodes/scales the raw columns.
    2. ``model`` -- the NuSVR regressor, trained on the *scaled* target.
    3. ``target_scaler`` -- a MinMaxScaler used to scale the target for
       training and to map predictions back to the original scale.
    """

    # The custom transformer is nested in the wrapper class and is referenced
    # below as ``NuSVRInsuranceModel.MultiplyScaler``.
    class MultiplyScaler(BaseEstimator, TransformerMixin):
        """Stateless transformer that multiplies its input by a constant."""

        def __init__(self, factor=2):
            self.factor = factor

        def fit(self, X, y=None):
            """Do nothing (there is no state to learn) and return ``self``."""
            return self

        def transform(self, X):
            """Return ``X`` multiplied by ``self.factor``."""
            return X * self.factor

    def __init__(self):
        """Build the (unfitted) column pipelines, target scaler and model."""
        # Example pipelines (adjust as needed).
        text_pipeline = Pipeline([
            ('one-hot', OneHotEncoder()),
        ])

        nums_pipeline = Pipeline([
            ('normalize', StandardScaler(with_mean=True)),
        ])

        # Same standardisation as above, then doubled by MultiplyScaler.
        nums_pipeline_strong = Pipeline([
            ('normalize', StandardScaler(with_mean=True)),
            ('scalarMultiply', NuSVRInsuranceModel.MultiplyScaler(factor=2)),
        ])

        # with_mean=False: no centering is applied after the one-hot step
        # (centering is not supported on sparse input, which OneHotEncoder
        # produces by default).
        smoke_pipeline = Pipeline([
            ('one-hot', OneHotEncoder()),
            ('normalize', StandardScaler(with_mean=False)),
            ('scalar-multiply', NuSVRInsuranceModel.MultiplyScaler(factor=5)),
        ])

        # Adjust columns to match your dataset's actual column names.
        # NOTE: columns not listed here are dropped, because ColumnTransformer
        # defaults to remainder='drop'.
        self.ct = ColumnTransformer([
            ('str_handler', text_pipeline, ['diabetic', 'gender']),
            ('smoke_handle', smoke_pipeline, ['smoker']),
            ('floats_ints_weak', nums_pipeline, ['children', 'age']),
            ('floats_ints_strong', nums_pipeline_strong,
             ['bmi', 'bloodpressure']),
        ])

        # Target scaler (for the 'claim' column).
        self.target_scaler = MinMaxScaler(feature_range=(-0.5, 0.5))

        # NuSVR model with the desired hyperparameters.
        self.model = NuSVR(C=10, gamma='scale', kernel='rbf', nu=0.80)

    def fit(self, X, y):
        """Fit the ColumnTransformer, the target scaler and the NuSVR model.

        Args:
            X: Raw feature dataframe containing the columns registered in
                ``self.ct``.
            y: Target values in their original scale (any 1-D array-like).

        Returns:
            ``self``, so calls can be chained.
        """
        features_t = self.ct.fit_transform(X)
        # MinMaxScaler expects a 2-D input; NuSVR expects a 1-D target.
        target_t = self.target_scaler.fit_transform(
            np.asarray(y).reshape(-1, 1)
        ).ravel()
        self.model.fit(features_t, target_t)
        return self

    def preprocessing(self, df):
        """Transform a raw dataframe with the already-fitted ColumnTransformer.

        Args:
            df: Raw dataframe holding the columns registered in ``self.ct``.

        Returns:
            The transformed feature matrix produced by ``self.ct.transform``.
        """
        return self.ct.transform(df)

    def predict(self, preprocessed_data):
        """Predict from preprocessed features, in the original target scale.

        Args:
            preprocessed_data: Feature matrix as returned by
                ``preprocessing``.

        Returns:
            1-D array of predictions mapped back through ``postprocessing``.
        """
        y_pred_scaled = self.model.predict(preprocessed_data)
        return self.postprocessing(y_pred_scaled)

    def postprocessing(self, y_pred_scaled):
        """Map scaled predictions back to the original target domain.

        Args:
            y_pred_scaled: Predictions in the ``target_scaler`` domain; any
                array-like is accepted.

        Returns:
            1-D array of predictions in the original target scale.
        """
        # np.asarray lets callers pass lists/Series as well as ndarrays.
        scaled_column = np.asarray(y_pred_scaled).reshape(-1, 1)
        return self.target_scaler.inverse_transform(scaled_column).ravel()


def main():
    """Train on the CSV, report test metrics and export the fitted wrapper."""
    # -------------------------------------------------
    # 1. Load data
    # -------------------------------------------------
    df = pd.read_csv('cleaned_insurance_data.csv')

    # Separate features and target.
    features = df.drop(columns=['claim', 'PatientID', 'index'])
    target = df['claim']

    # -------------------------------------------------
    # 2. Instantiate our NuSVRInsuranceModel
    # -------------------------------------------------
    nusvr_wrapper = NuSVRInsuranceModel()

    # -------------------------------------------------
    # 3. Train-test split
    # -------------------------------------------------
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    # -------------------------------------------------
    # 4-5. Fit the ColumnTransformer and target scaler on TRAIN data only,
    #      then train the NuSVR model on the transformed data
    # -------------------------------------------------
    nusvr_wrapper.fit(X_train_raw, y_train)

    # -------------------------------------------------
    # 6. Evaluate on test data
    # -------------------------------------------------
    # Preprocess the test features with the same fitted pipeline.
    X_test_t = nusvr_wrapper.preprocessing(X_test_raw)

    # Make predictions (in original scale).
    y_pred = nusvr_wrapper.predict(X_test_t)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MAE (original scale): {mae:.3f}")
    print(f"Test R^2 (original scale): {r2:.3f}")

    # -------------------------------------------------
    # 7. Export the fitted model
    # -------------------------------------------------
    # NOTE(review): pickle stores classes by reference. When this file is run
    # as a script the class is recorded as ``__main__.NuSVRInsuranceModel``,
    # so whatever loads the file must expose the class under that name.
    joblib.dump(nusvr_wrapper, "nusvr_insurance_model.joblib")
    print("Fitted NuSVRInsuranceModel saved to nusvr_insurance_model.joblib")


if __name__ == "__main__":
    main()