# linreg_model.py

import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score


class LinearRegressionInsuranceModel:
    """
    A Linear Regression-based insurance claim prediction model with:
      1. Data loading & cleaning
      2. Preprocessing (categorical encoding, numerical scaling)
      3. Model training and evaluation
      4. Consistent API: preprocessing, predict, postprocessing

    The preprocessor is fitted on the training split only, so the metrics
    returned by get_scores() are computed on rows that influenced neither the
    scaler statistics nor the one-hot categories.
    """

    # Column layout expected in the input CSV. Kept at class level so that
    # __init__ and get_coefficients share a single definition.
    TARGET_COLUMN = 'claim'
    CATEGORICAL_COLUMNS = ('gender', 'smoker', 'region', 'diabetic')
    NUMERICAL_COLUMNS = ('bmi', 'bloodpressure', 'children', 'age')

    def __init__(self, csv_path: str, *, test_size: float = 0.2,
                 random_state: int = 42, outlier_std_threshold: float = 3.5):
        """
        Initializes the model by loading data, preprocessing, training, and evaluating.

        Parameters:
            csv_path (str): Path to the cleaned insurance data CSV file.
            test_size (float): Fraction of rows held out for evaluation
                (keyword-only, passed to train_test_split).
            random_state (int): Seed for the train/test split (keyword-only).
            outlier_std_threshold (float): Rows whose target lies more than this
                many standard deviations from the target mean are dropped
                (keyword-only).
        """
        # -----------------------------------------------------
        # 1-2. Load, clean, and remove target outliers
        # -----------------------------------------------------
        df = self._load_and_clean(csv_path, outlier_std_threshold)

        # -----------------------------------------------------
        # 3. Define features and target
        # -----------------------------------------------------
        self.features = df.drop(columns=[self.TARGET_COLUMN])
        self.target = df[self.TARGET_COLUMN].values

        # -----------------------------------------------------
        # 4. Define the (still unfitted) preprocessing pipeline
        # -----------------------------------------------------
        self.preprocessor = self._build_preprocessor()

        # -----------------------------------------------------
        # 5. Train-test split on the RAW rows
        # -----------------------------------------------------
        # Splitting before fitting the preprocessor keeps test-set statistics
        # (means, variances, category sets) out of the training features.
        raw_train, raw_test, y_train, y_test = train_test_split(
            self.features,
            self.target,
            test_size=test_size,
            random_state=random_state
        )

        # -----------------------------------------------------
        # 6. Fit the preprocessor on the training rows only
        # -----------------------------------------------------
        X_train = self.preprocessor.fit_transform(raw_train)
        X_test = self.preprocessor.transform(raw_test)

        # -----------------------------------------------------
        # 7. Initialize and train the Linear Regression model
        # -----------------------------------------------------
        self.model = LinearRegression()
        self.model.fit(X_train, y_train)

        # -----------------------------------------------------
        # 8. Evaluate the model on the held-out test set
        # -----------------------------------------------------
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        self.__scores = {
            'MAE': mae,
            'R2': r2
        }

        print(f"[Linear Regression] Test MAE: {mae:.3f}")
        print(f"[Linear Regression] Test R^2: {r2:.3f}")

    @classmethod
    def _load_and_clean(cls, csv_path, outlier_std_threshold):
        """Read the CSV, drop id columns and NaN rows, and filter target outliers."""
        df = pd.read_csv(csv_path)
        # 'index' / 'PatientID' are dropped when present; errors='ignore'
        # makes their absence a no-op.
        df = df.drop(columns=['index', 'PatientID'], errors='ignore').dropna()

        target = df[cls.TARGET_COLUMN]
        mean_y = target.mean()
        std_y = target.std()
        threshold_low = mean_y - outlier_std_threshold * std_y
        threshold_high = mean_y + outlier_std_threshold * std_y
        return df[(target >= threshold_low) & (target <= threshold_high)]

    @classmethod
    def _build_preprocessor(cls):
        """Build the unfitted ColumnTransformer (one-hot encoding + standard scaling)."""
        categorical_pipeline = Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        numerical_pipeline = Pipeline([
            ('scaler', StandardScaler())
        ])
        # Transformer order determines the output column order, which
        # get_coefficients relies on: categorical first, then numerical.
        return ColumnTransformer([
            ('categorical', categorical_pipeline, list(cls.CATEGORICAL_COLUMNS)),
            ('numerical', numerical_pipeline, list(cls.NUMERICAL_COLUMNS))
        ])

    def preprocessing(self, raw_df: pd.DataFrame):
        """
        Preprocesses new raw data by applying the same transformations as training.

        Parameters:
            raw_df (pd.DataFrame): New data with the same feature columns as training (excluding 'claim').

        Returns:
            Transformed feature matrix ready for prediction, as produced by the
            fitted ColumnTransformer's transform().
        """
        return self.preprocessor.transform(raw_df)

    def predict(self, preprocessed_data):
        """
        Makes predictions on preprocessed data.

        Parameters:
            preprocessed_data: Transformed feature matrix, i.e. the output of
                preprocessing().

        Returns:
            Predicted claim amounts, after postprocessing().
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Postprocesses predictions. Currently a pass-through, but can be extended if needed.

        Parameters:
            preds: Raw predictions from the model.

        Returns:
            The same object that was passed in.
        """
        return preds

    def get_scores(self):
        """
        Retrieves the evaluation metrics.

        Returns:
            dict: Dictionary containing MAE and R2. A copy is returned so that
            callers cannot modify the stored metrics.
        """
        return dict(self.__scores)

    def get_coefficients(self):
        """
        Retrieves the model's coefficients.

        Returns:
            pd.DataFrame: One row per preprocessed feature with columns
            'Feature' and 'Coefficient', sorted by coefficient in descending order.
        """
        # Extract feature names after preprocessing, in the same order the
        # ColumnTransformer emits them (categorical block, then numerical block).
        transformers = self.preprocessor.named_transformers_
        categorical_features = transformers['categorical'].named_steps['onehot'].get_feature_names_out(
            list(self.CATEGORICAL_COLUMNS)
        )
        numerical_features = transformers['numerical'].named_steps['scaler'].get_feature_names_out(
            list(self.NUMERICAL_COLUMNS)
        )
        all_features = np.concatenate([categorical_features, numerical_features])

        # Retrieve coefficients from the Linear Regression model
        coefficients = pd.DataFrame({
            'Feature': all_features,
            'Coefficient': self.model.coef_
        }).sort_values(by='Coefficient', ascending=False).reset_index(drop=True)

        return coefficients


if __name__ == "__main__":
    # Script entry point: train on the CSV below and persist the fitted wrapper.
    # Replace the CSV path with your actual path.
    training_csv = "cleaned_insurance_data.csv"
    export_path = "LinearRegressionInsuranceModel.joblib"

    # Constructing the model loads the data, trains, and prints test metrics.
    model = LinearRegressionInsuranceModel(training_csv)

    # Serialize the whole instance (preprocessor, estimator, and stored data).
    # NOTE(review): when this file is run as a script the class lives in
    # `__main__`, so the dump references `__main__.LinearRegressionInsuranceModel`;
    # confirm that consumers loading the file can resolve that name.
    joblib.dump(model, export_path)
    print("Exported LinearRegressionInsuranceModel to LinearRegressionInsuranceModel.joblib")