# linreg_model.py
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score


class LinearRegressionInsuranceModel:
    """
    A Linear Regression-based insurance claim prediction model with:
      1. Data loading & cleaning
      2. Preprocessing (categorical encoding, numerical scaling)
      3. Model training and evaluation
      4. Consistent API: preprocessing, predict, postprocessing

    Attributes set by ``__init__``:
        features (pd.DataFrame): Cleaned feature columns (everything but 'claim').
        target (np.ndarray): Cleaned 'claim' values, aligned with ``features``.
        preprocessor (ColumnTransformer): Fitted on the training split only.
        model (LinearRegression): Fitted on the preprocessed training split.
    """

    # Single source of truth for the column layout; used both when building
    # the preprocessor and when naming coefficients, so the two cannot drift.
    TARGET_COLUMN = 'claim'
    CATEGORICAL_COLUMNS = ('gender', 'smoker', 'region', 'diabetic')
    NUMERICAL_COLUMNS = ('bmi', 'bloodpressure', 'children', 'age')

    # Rows whose target lies further than this many standard deviations from
    # the target mean are dropped as outliers.
    OUTLIER_STD_MULTIPLIER = 3.5

    def __init__(self, csv_path: str):
        """
        Initializes the model by loading data, preprocessing, training, and evaluating.

        Parameters:
            csv_path (str): Path to the cleaned insurance data CSV file.

        Raises:
            ValueError: If no rows remain after dropping missing values and
                target outliers.
        """
        # -----------------------------------------------------
        # 1-2. Load, clean, and remove target outliers
        # -----------------------------------------------------
        df = self._load_and_clean(csv_path)

        # -----------------------------------------------------
        # 3. Define features and target
        # -----------------------------------------------------
        self.features = df.drop(columns=[self.TARGET_COLUMN])
        self.target = df[self.TARGET_COLUMN].values

        # -----------------------------------------------------
        # 4. Define preprocessing pipelines
        # -----------------------------------------------------
        self.preprocessor = self._build_preprocessor()

        # -----------------------------------------------------
        # 5. Train-test split (on the RAW features)
        # -----------------------------------------------------
        # Splitting before fitting the preprocessor keeps test-set statistics
        # (scaler mean/variance, encoder categories) out of the fitted
        # transformers, so the reported test metrics are not contaminated.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            self.features,
            self.target,
            test_size=0.2,
            random_state=42
        )

        # -----------------------------------------------------
        # 6. Fit the preprocessor on the training rows only, then transform
        # -----------------------------------------------------
        X_train = self.preprocessor.fit_transform(X_train_raw)
        X_test = self.preprocessor.transform(X_test_raw)

        # -----------------------------------------------------
        # 7. Initialize and train the Linear Regression model
        # -----------------------------------------------------
        self.model = LinearRegression()
        self.model.fit(X_train, y_train)

        # -----------------------------------------------------
        # 8. Evaluate the model on the test set
        # -----------------------------------------------------
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        self.__scores = {
            'MAE': mae,
            'R2': r2
        }

        print(f"[Linear Regression] Test MAE: {mae:.3f}")
        print(f"[Linear Regression] Test R^2: {r2:.3f}")

    @classmethod
    def _load_and_clean(cls, csv_path: str) -> pd.DataFrame:
        """Read the CSV, drop id columns and incomplete rows, and filter target outliers."""
        df = pd.read_csv(csv_path)

        # Drop irrelevant columns (if present) and rows with missing values
        df = df.drop(columns=['index', 'PatientID'], errors='ignore').dropna()

        # Keep only rows whose target is within mean +/- k * std
        claims = df[cls.TARGET_COLUMN]
        spread = cls.OUTLIER_STD_MULTIPLIER * claims.std()
        threshold_low = claims.mean() - spread
        threshold_high = claims.mean() + spread
        df = df[(claims >= threshold_low) & (claims <= threshold_high)]

        if df.empty:
            # Fail here with a clear message instead of deep inside scikit-learn.
            raise ValueError(
                f"No usable rows left in {csv_path!r} after removing missing "
                f"values and outliers in '{cls.TARGET_COLUMN}'."
            )
        return df

    @classmethod
    def _build_preprocessor(cls) -> ColumnTransformer:
        """Build the unfitted transformer: one-hot for categoricals, scaling for numericals."""
        # Pipeline for categorical features
        categorical_pipeline = Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Pipeline for numerical features
        numerical_pipeline = Pipeline([
            ('scaler', StandardScaler())
        ])

        # Combine pipelines using ColumnTransformer
        return ColumnTransformer([
            ('categorical', categorical_pipeline, list(cls.CATEGORICAL_COLUMNS)),
            ('numerical', numerical_pipeline, list(cls.NUMERICAL_COLUMNS))
        ])

    def preprocessing(self, raw_df: pd.DataFrame):
        """
        Preprocesses new raw data by applying the same transformations as training.

        Parameters:
            raw_df (pd.DataFrame): New data with the same feature columns as training
                                   (excluding 'claim').

        Returns:
            Transformed feature matrix ready for prediction (as returned by the
            fitted ColumnTransformer).
        """
        return self.preprocessor.transform(raw_df)

    def predict(self, preprocessed_data):
        """
        Makes predictions on preprocessed data.

        Parameters:
            preprocessed_data: Transformed feature matrix, i.e. the output of
                               ``preprocessing``.

        Returns:
            np.ndarray: Predicted claim amounts, after ``postprocessing``.
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Postprocesses predictions. Currently a pass-through, but can be extended if needed.

        Parameters:
            preds (np.ndarray): Raw predictions from the model.

        Returns:
            np.ndarray: Final predictions.
        """
        return preds

    def get_scores(self) -> dict:
        """
        Retrieves the evaluation metrics computed on the held-out test split.

        Returns:
            dict: Dictionary containing 'MAE' and 'R2'. A copy is returned so
                  callers cannot mutate the stored metrics.
        """
        return dict(self.__scores)

    def get_coefficients(self) -> pd.DataFrame:
        """
        Retrieves the model's coefficients.

        Returns:
            pd.DataFrame: Columns 'Feature' and 'Coefficient', sorted by
                          coefficient in descending order.
        """
        # Extract feature names after preprocessing, in the same order the
        # ColumnTransformer emits them (categorical block, then numerical).
        transformers = self.preprocessor.named_transformers_
        categorical_features = (
            transformers['categorical']
            .named_steps['onehot']
            .get_feature_names_out(list(self.CATEGORICAL_COLUMNS))
        )
        numerical_features = (
            transformers['numerical']
            .named_steps['scaler']
            .get_feature_names_out(list(self.NUMERICAL_COLUMNS))
        )
        all_features = np.concatenate([categorical_features, numerical_features])

        # Retrieve coefficients from the Linear Regression model
        coefficients = pd.DataFrame({
            'Feature': all_features,
            'Coefficient': self.model.coef_
        }).sort_values(by='Coefficient', ascending=False).reset_index(drop=True)

        return coefficients


if __name__ == "__main__":
    # -----------------------------------------------------
    # 9. Instantiate and train the model
    # -----------------------------------------------------
    # Replace the CSV path with your actual path
    model = LinearRegressionInsuranceModel("cleaned_insurance_data.csv")

    # -----------------------------------------------------
    # 10. Export the entire model class instance
    # -----------------------------------------------------
    joblib.dump(model, "LinearRegressionInsuranceModel.joblib")
    print("Exported LinearRegressionInsuranceModel to LinearRegressionInsuranceModel.joblib")