# polynomial_regression_model.py

import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score


class PolynomialRegressionInsuranceModel:
    """
    A Polynomial Regression-based insurance claim prediction model with:
    1. Data loading & cleaning
    2. Preprocessing (categorical encoding, numerical scaling, polynomial features)
    3. Model training and evaluation
    4. Consistent API: preprocessing, predict, postprocessing

    All preprocessing statistics (scaler mean/variance, one-hot categories)
    are learned from the training split only, so the reported test and
    cross-validation scores are not contaminated by held-out rows.
    """

    # Column configuration shared by __init__ and get_coefficients so the
    # two can never drift apart.
    TARGET_COLUMN = 'claim'
    CATEGORICAL_COLUMNS = ('gender', 'smoker', 'region', 'diabetic')
    NUMERICAL_COLUMNS = ('bmi', 'bloodpressure', 'children', 'age')

    # Rows whose target lies further than this many standard deviations
    # from the target mean are discarded as outliers.
    OUTLIER_STD_MULTIPLIER = 3.5

    def __init__(self, csv_path):
        """
        Initializes the model by loading data, preprocessing, training, and evaluating.

        Parameters:
            csv_path (str): Path to the cleaned insurance data CSV file.

        Raises:
            ValueError: If no rows remain after dropping missing values and
                target outliers.
        """
        # -----------------------------------------------------
        # 1. Load and clean the data
        # -----------------------------------------------------
        df = pd.read_csv(csv_path)

        # Drop irrelevant columns (if present) and rows with missing values
        df = df.drop(columns=['index', 'PatientID'], errors='ignore').dropna()

        # -----------------------------------------------------
        # 2. Handle outliers in the target variable 'claim'
        # -----------------------------------------------------
        target_column = self.TARGET_COLUMN
        mean_y = df[target_column].mean()
        std_y = df[target_column].std()
        threshold_low = mean_y - self.OUTLIER_STD_MULTIPLIER * std_y
        threshold_high = mean_y + self.OUTLIER_STD_MULTIPLIER * std_y
        df = df[(df[target_column] >= threshold_low) & (df[target_column] <= threshold_high)]

        if df.empty:
            # Fail early with a clear message instead of an opaque error
            # from deep inside scikit-learn.
            raise ValueError(
                f"No usable rows left in {csv_path!r} after removing missing "
                f"values and '{target_column}' outliers."
            )

        # -----------------------------------------------------
        # 3. Define features and target
        # -----------------------------------------------------
        self.features = df.drop(columns=[target_column])
        self.target = df[target_column].values

        # -----------------------------------------------------
        # 4. Define preprocessing pipelines
        # -----------------------------------------------------
        # Pipeline for categorical features
        categorical_pipeline = Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Pipeline for numerical features
        numerical_pipeline = Pipeline([
            ('scaler', StandardScaler())
        ])

        # Combine pipelines using ColumnTransformer
        self.preprocessor = ColumnTransformer([
            ('categorical', categorical_pipeline, list(self.CATEGORICAL_COLUMNS)),
            ('numerical', numerical_pipeline, list(self.NUMERICAL_COLUMNS))
        ])

        # Degree-2 polynomial expansion (squares and pairwise interactions)
        self.poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

        # -----------------------------------------------------
        # 5. Combine preprocessing and polynomial features
        # -----------------------------------------------------
        self.full_preprocessor = Pipeline([
            ('preprocessor', self.preprocessor),
            ('poly', self.poly)
        ])

        # -----------------------------------------------------
        # 6. Train-test split (on RAW features)
        # -----------------------------------------------------
        # Splitting before any transformer is fitted keeps test rows out of
        # the scaler statistics and the encoder's category list.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            self.features, self.target, test_size=0.2, random_state=42
        )

        # -----------------------------------------------------
        # 7. Initialize and train the Linear Regression model
        # -----------------------------------------------------
        self.model = LinearRegression()

        # Perform 5-fold cross-validation on training data. The preprocessing
        # steps are part of the cross-validated estimator, so they are
        # re-fitted on each fold's training portion. cross_val_score clones
        # the estimator, leaving self.full_preprocessor unfitted here.
        cv_estimator = Pipeline([
            ('features', self.full_preprocessor),
            ('regressor', LinearRegression())
        ])
        cv_scores = cross_val_score(cv_estimator, X_train_raw, y_train, cv=5, scoring='r2')
        print(f"[Polynomial Regression] Cross-Validation R2 Scores: {cv_scores}")
        print(f"[Polynomial Regression] Average CV R2 Score: {cv_scores.mean():.3f}")

        # Fit the preprocessing on the training split only, then train the
        # model on the full (transformed) training data.
        X_train = self.full_preprocessor.fit_transform(X_train_raw)
        self.model.fit(X_train, y_train)

        # -----------------------------------------------------
        # 8. Evaluate the model on the test set
        # -----------------------------------------------------
        X_test = self.full_preprocessor.transform(X_test_raw)
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        self.__scores = {
            'MAE': mae,
            'R2': r2,
            'Cross-Validation R2 Scores': cv_scores,
            'Average CV R2': cv_scores.mean()
        }

        print(f"[Polynomial Regression] Test MAE: {mae:.3f}")
        print(f"[Polynomial Regression] Test R^2: {r2:.3f}")

    def preprocessing(self, raw_df):
        """
        Preprocesses new raw data by applying the same transformations as training.

        Parameters:
            raw_df (pd.DataFrame): New data containing the categorical and
                numerical feature columns used in training (excluding 'claim').

        Returns:
            Transformed feature matrix ready for `predict`. This is whatever
            the fitted pipeline emits, which may be a dense array or a sparse
            matrix.
        """
        return self.full_preprocessor.transform(raw_df)

    def predict(self, preprocessed_data):
        """
        Makes predictions on preprocessed data.

        Parameters:
            preprocessed_data: Feature matrix as returned by `preprocessing`.

        Returns:
            np.ndarray: Predicted claim amounts, after `postprocessing`.
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Postprocesses predictions. Currently a pass-through, but can be extended.

        Parameters:
            preds (np.ndarray): Raw predictions from the model.

        Returns:
            np.ndarray: Final predictions (unchanged).
        """
        return preds

    def get_scores(self):
        """
        Retrieves the evaluation metrics computed during initialization.

        Returns:
            dict: Keys 'MAE', 'R2', 'Cross-Validation R2 Scores' and
                'Average CV R2'.
        """
        return self.__scores

    def get_coefficients(self):
        """
        Retrieves the model's coefficients.

        Returns:
            pd.DataFrame: Columns 'Feature' (name of each expanded polynomial
                feature) and 'Coefficient', sorted by coefficient in
                descending order.
        """
        # Feature names produced by the fitted column-wise transformers
        fitted_transformers = self.preprocessor.named_transformers_
        categorical_features = (
            fitted_transformers['categorical']
            .named_steps['onehot']
            .get_feature_names_out(list(self.CATEGORICAL_COLUMNS))
        )
        numerical_features = (
            fitted_transformers['numerical']
            .named_steps['scaler']
            .get_feature_names_out(list(self.NUMERICAL_COLUMNS))
        )
        # Same order as the ColumnTransformer output: categorical, then numerical
        all_features = np.concatenate([categorical_features, numerical_features])

        # Get feature names after polynomial transformation
        poly_feature_names = self.poly.get_feature_names_out(all_features)

        # Create DataFrame of coefficients
        coefficients = pd.DataFrame({
            'Feature': poly_feature_names,
            'Coefficient': self.model.coef_
        }).sort_values(by='Coefficient', ascending=False)

        return coefficients


if __name__ == "__main__":
    # -----------------------------------------------------
    # 9. Instantiate and train the model
    # -----------------------------------------------------
    model = PolynomialRegressionInsuranceModel("cleaned_insurance_data.csv")

    # -----------------------------------------------------
    # 10. Export the entire model class instance
    # -----------------------------------------------------
    # NOTE(review): when this file is run as a script the instance is pickled
    # with its class recorded under the `__main__` module, so the loading
    # process must expose the class under that name too -- confirm the
    # consumer handles this.
    joblib.dump(model, "PolynomialRegressionInsuranceModel.joblib")
    print("Exported PolynomialRegressionInsuranceModel to PolynomialRegressionInsuranceModel.joblib")