import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer


class XGBoostInsuranceModel:
    """
    A Gradient Boosting-based insurance model class with:
      1. Data loading & dropping of unnecessary columns
      2. Dummy-encoding for categorical variables
      3. SimpleImputer for missing data
      4. PolynomialFeatures for interaction terms
      5. Train/validation/test splits and final evaluation
      6. A consistent API with `preprocessing`, `predict`, `postprocessing`
    """

    def __init__(self, csv_path):
        # -----------------------------------------------------
        # 1. Load & prepare the data
        # -----------------------------------------------------
        df = pd.read_csv(csv_path)

        # Drop these columns if present (ignore errors if they're missing)
        df = df.drop(columns=['index', 'PatientID'], errors='ignore')

        # Separate features & target
        X = df.drop(columns=['claim'])
        y = df['claim'].values

        # Discover the categorical columns by running get_dummies once.
        # To replicate these transformations on new data, we must track
        # the full list of dummy columns seen here.
        X_dummies = pd.get_dummies(X, drop_first=True)
        self.all_dummy_cols = X_dummies.columns.tolist()

        # Create a SimpleImputer
        self.imputer = SimpleImputer(strategy='mean')

        # Create a PolynomialFeatures transformer (degree=2, interaction_only=True)
        self.poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

        # -----------------------------------------------------
        # 2. Fit transformations on the training set only
        #    (the initial get_dummies over the full data is used only
        #    to discover all possible dummy columns).
        # -----------------------------------------------------
        # Train/test split on the dummy-encoded features
        X_train_dummies, X_test_dummies, y_train, y_test = train_test_split(
            X_dummies, y, test_size=0.2, random_state=42
        )

        # A second split from the training set to hold out a validation set
        # (the validation split is created but not used by this basic fit)
        X_train_dummies, X_val_dummies, y_train, y_val = train_test_split(
            X_train_dummies, y_train, test_size=0.25, random_state=42
        )

        # Fit the imputer on the training set
        X_train_imputed = self.imputer.fit_transform(X_train_dummies)

        # Fit the polynomial-feature expansion on the training set
        X_train_poly = self.poly.fit_transform(X_train_imputed)

        # -----------------------------------------------------
        # 3. Initialize and train the GradientBoostingRegressor
        # -----------------------------------------------------
        self.model = GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42
        )
        self.model.fit(X_train_poly, y_train)

        # -----------------------------------------------------
        # 4. Evaluate on the test set
        # -----------------------------------------------------
        X_test_imputed = self.imputer.transform(X_test_dummies)
        X_test_poly = self.poly.transform(X_test_imputed)
        y_test_pred = self.model.predict(X_test_poly)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        self.__scores = [test_mae, test_mse, test_r2]
        print(f"[XGBoostInsuranceModel] MAE: {test_mae:.3f} | MSE: {test_mse:.3f} | R^2: {test_r2:.3f}")

    def preprocessing(self, raw_df):
        """
        Takes a new DataFrame with the same columns as the original CSV minus 'claim'.
        Then:
          1. Turns categorical features into dummy columns
          2. Ensures the dummy columns match those used during training
          3. Imputes missing values
          4. Applies the PolynomialFeatures transform

        Returns the transformed data (numpy array).
        """
        # 1. Convert raw_df to dummies; it may be missing some columns or carry extras
        temp_dummies = pd.get_dummies(raw_df, drop_first=True)

        # Ensure it has exactly the same dummy columns as the training data
        for col in self.all_dummy_cols:
            if col not in temp_dummies.columns:
                temp_dummies[col] = 0

        # Drop any extra columns not in self.all_dummy_cols and enforce training order
        temp_dummies = temp_dummies[self.all_dummy_cols]

        # 2. Imputation
        temp_imputed = self.imputer.transform(temp_dummies)

        # 3. Polynomial features
        temp_poly = self.poly.transform(temp_imputed)

        return temp_poly

    def predict(self, preprocessed_data):
        """
        Receives data already output by `preprocessing`.
        Returns predictions in the original scale (no inverse transform needed).
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Currently a pass-through, because 'claim' was never scaled.
        In a different scenario, you might invert-scale predictions here.
        """
        return preds

    def getScores(self):
        """
        Returns a string with the test metrics (MAE, MSE, R^2)
        from the last training run.
        """
        return f"MAE: {self.__scores[0]} | MSE: {self.__scores[1]} | R^2: {self.__scores[2]}"


if __name__ == "__main__":
    # Instantiate, train, and evaluate the model on "cleaned_insurance_data.csv"
    xgb_model = XGBoostInsuranceModel("cleaned_insurance_data.csv")

    # Export the entire class instance for future use
    joblib.dump(xgb_model, "XGBoostInsuranceModel.joblib")
    print("Exported XGBoostInsuranceModel to XGBoostInsuranceModel.joblib")
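    # ----------------------------------------------------------------
    # Optional round-trip sketch: reload the exported instance and score
    # a few rows to confirm the `preprocessing` -> `predict` API works on
    # "new" data. This is illustrative only; it reuses the training CSV
    # (with 'claim' dropped) as a stand-in for unseen records, since the
    # real scoring input would come from elsewhere.
    # ----------------------------------------------------------------
    loaded_model = joblib.load("XGBoostInsuranceModel.joblib")

    # Take a handful of rows and remove target/ID columns before scoring
    sample_df = (
        pd.read_csv("cleaned_insurance_data.csv")
        .drop(columns=['index', 'PatientID', 'claim'], errors='ignore')
        .head(5)
    )

    sample_features = loaded_model.preprocessing(sample_df)  # dummies -> impute -> poly
    sample_preds = loaded_model.predict(sample_features)     # claim amounts, original scale
    print("Sample predictions:", sample_preds)
    print(loaded_model.getScores())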