diff --git "a/lib/shap/explainers/_tree.py" "b/lib/shap/explainers/_tree.py" new file mode 100644--- /dev/null +++ "b/lib/shap/explainers/_tree.py" @@ -0,0 +1,1979 @@ +import json +import os +import struct +import tempfile +import time +import warnings +from typing import Any, Dict + +import numpy as np +import pandas as pd +import scipy.sparse +import scipy.special +from packaging import version + +from .. import maskers +from .._explanation import Explanation +from ..utils import assert_import, record_import_error, safe_isinstance +from ..utils._exceptions import ( + DimensionError, + ExplainerError, + InvalidFeaturePerturbationError, + InvalidMaskerError, + InvalidModelError, +) +from ..utils._legacy import DenseData +from ._explainer import Explainer +from .other._ubjson import decode_ubjson_buffer + +warnings.formatwarning = lambda msg, *args, **kwargs: str(msg) + '\n' # ignore everything except the message + +try: + from .. import _cext +except ImportError as e: + record_import_error("cext", "C extension was not built during install!", e) + +try: + import pyspark # noqa +except ImportError as e: + record_import_error("pyspark", "PySpark could not be imported!", e) + +output_transform_codes = { + "identity": 0, + "logistic": 1, + "logistic_nlogloss": 2, + "squared_loss": 3, +} + +feature_perturbation_codes = { + "interventional": 0, + "tree_path_dependent": 1, + "global_path_dependent": 2, +} + + +class TreeExplainer(Explainer): + """ Uses Tree SHAP algorithms to explain the output of ensemble tree models. + + Tree SHAP is a fast and exact method to estimate SHAP values for tree models and ensembles of trees, + under several different possible assumptions about feature dependence. It depends on fast C++ + implementations either inside an external model package or in the local compiled C extension. + """ + + def __init__( + self, + model, + data=None, + model_output="raw", + feature_perturbation="interventional", + feature_names=None, + approximate=False, + **deprecated_options, + ): + """ Build a new Tree explainer for the passed model. + + Parameters + ---------- + model : model object + The tree based machine learning model that we want to explain. XGBoost, LightGBM, CatBoost, Pyspark + and most tree-based scikit-learn models are supported. + + data : numpy.array or pandas.DataFrame + The background dataset to use for integrating out features. This argument is optional when + ``feature_perturbation="tree_path_dependent"``, since in that case we can use the number of training + samples that went down each tree path as our background dataset (this is recorded in the ``model`` + object). + + feature_perturbation : "interventional" (default) or "tree_path_dependent" (default when data=None) + Since SHAP values rely on conditional expectations, we need to decide how to handle correlated + (or otherwise dependent) input features. + The "interventional" approach breaks the dependencies between features according to the rules + dictated by causal inference (Janzing et al. 2019). Note that the "interventional" option + requires a background dataset ``data``, and its runtime scales linearly with the size of the + background dataset you use. Anywhere from 100 to 1000 random background samples are good + sizes to use. + The "tree_path_dependent" approach is to just follow the trees and use the number of training + examples that went down each leaf to represent the background distribution. 
This approach + does not require a background dataset, and so is used by default when no background dataset + is provided. + + model_output : "raw", "probability", "log_loss", or model method name + What output of the model should be explained. + If "raw", then we explain the raw output of the trees, which varies by model. For regression models, + "raw" is the standard output. For binary classification in XGBoost, this is the log odds ratio. + If ``model_output`` is the name of a supported prediction method on the ``model`` object, then we + explain the output of that model method name. For example, ``model_output="predict_proba"`` + explains the result of calling ``model.predict_proba``. + If "probability", then we explain the output of the model transformed into probability space + (note that this means the SHAP values now sum to the probability output of the model). + If "log_loss", then we explain the log base e of the model loss function, so that the SHAP values + sum up to the log loss of the model for each sample. This is helpful for breaking down model + performance by feature. + Currently the "probability" and "log_loss" options are only supported when + ``feature_perturbation="interventional"``. + + Examples + -------- + See `Tree explainer examples `_ + """ + if feature_names is not None: + self.data_feature_names = feature_names + elif isinstance(data, pd.DataFrame): + self.data_feature_names = list(data.columns) + + masker = data + super().__init__(model, masker, feature_names=feature_names) + + if type(self.masker) is maskers.Independent: + data = self.masker.data + elif masker is not None: + raise InvalidMaskerError("Unsupported masker type: %s!" % str(type(self.masker))) + + if getattr(self.masker, "clustering", None) is not None: + raise ExplainerError("TreeExplainer does not support clustered data inputs! Please use shap.Explainer or pass an unclustered masker!") + + # check for deprecated options + if model_output == "margin": + warnings.warn("model_output = \"margin\" has been renamed to model_output = \"raw\"") + model_output = "raw" + if model_output == "logloss": + warnings.warn("model_output = \"logloss\" has been renamed to model_output = \"log_loss\"") + model_output = "log_loss" + if "feature_dependence" in deprecated_options: + dep_val = deprecated_options["feature_dependence"] + if dep_val == "independent" and feature_perturbation == "interventional": + warnings.warn("feature_dependence = \"independent\" has been renamed to feature_perturbation" \ + " = \"interventional\"! See GitHub issue #882.") + elif feature_perturbation != "interventional": + warnings.warn("feature_dependence = \"independent\" has been renamed to feature_perturbation" \ + " = \"interventional\", you can't supply both options! See GitHub issue #882.") + if dep_val == "tree_path_dependent" and feature_perturbation == "interventional": + raise ValueError("The feature_dependence option has been renamed to feature_perturbation! " \ + "Please update the option name before calling TreeExplainer. See GitHub issue #882.") + if feature_perturbation == "independent": + raise InvalidFeaturePerturbationError("feature_perturbation = \"independent\" is not a valid option value, please use " \ + "feature_perturbation = \"interventional\" instead. 
See GitHub issue #882.") + + if isinstance(data, pd.DataFrame): + self.data = data.values + elif isinstance(data, DenseData): + self.data = data.data + else: + self.data = data + if self.data is None: + feature_perturbation = "tree_path_dependent" + #warnings.warn("Setting feature_perturbation = \"tree_path_dependent\" because no background data was given.") + elif feature_perturbation == "interventional" and self.data.shape[0] > 1_000: + wmsg = ( + f"Passing {self.data.shape[0]} background samples may lead to slow runtimes. Consider " + "using shap.sample(data, 100) to create a smaller background data set." + ) + warnings.warn(wmsg) + self.data_missing = None if self.data is None else pd.isna(self.data) + self.feature_perturbation = feature_perturbation + self.expected_value = None + self.model = TreeEnsemble(model, self.data, self.data_missing, model_output) + self.model_output = model_output + #self.model_output = self.model.model_output # this allows the TreeEnsemble to translate model outputs types by how it loads the model + + self.approximate = approximate + + if feature_perturbation not in feature_perturbation_codes: + raise InvalidFeaturePerturbationError("Invalid feature_perturbation option!") + + # check for unsupported combinations of feature_perturbation and model_outputs + if feature_perturbation == "tree_path_dependent": + if self.model.model_output != "raw": + raise ValueError("Only model_output=\"raw\" is supported for feature_perturbation=\"tree_path_dependent\"") + elif data is None: + raise ValueError("A background dataset must be provided unless you are using feature_perturbation=\"tree_path_dependent\"!") + + if self.model.model_output != "raw": + if self.model.objective is None and self.model.tree_output is None: + emsg = ( + "Model does not have a known objective or output type! When model_output is " + "not \"raw\" then we need to know the model's objective or link function." + ) + raise Exception(emsg) + + # A change in the signature of `xgboost.Booster.predict()` method has been introduced in XGBoost v1.4: + # The introduced `iteration_range` parameter is used when obtaining SHAP (incl. interaction) values from XGBoost models. + if self.model.model_type == 'xgboost': + import xgboost + if version.parse(xgboost.__version__) < version.parse('1.4'): + raise RuntimeError(f"SHAP requires XGBoost >= v1.4 , but found version {xgboost.__version__}. Please upgrade XGBoost!") + + # compute the expected value if we have a parsed tree for the cext + if self.model.model_output == "log_loss": + self.expected_value = self.__dynamic_expected_value + elif data is not None: + try: + self.expected_value = self.model.predict(self.data).mean(0) + except ValueError: + raise ExplainerError("Currently TreeExplainer can only handle models with categorical splits when " \ + "feature_perturbation=\"tree_path_dependent\" and no background data is passed. Please try again using " \ + "shap.TreeExplainer(model, feature_perturbation=\"tree_path_dependent\").") + if hasattr(self.expected_value, '__len__') and len(self.expected_value) == 1: + self.expected_value = self.expected_value[0] + elif hasattr(self.model, "node_sample_weight"): + self.expected_value = self.model.values[:,0].sum(0) + if self.expected_value.size == 1: + self.expected_value = self.expected_value[0] + self.expected_value += self.model.base_offset + if self.model.model_output != "raw": + self.expected_value = None # we don't handle transforms in this case right now... 
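Review note: a minimal usage sketch of the two perturbation modes documented above. `model` (a fitted tree model) and `X_background` (a small background matrix) are hypothetical stand-ins, not part of this diff:

    import shap

    # interventional: needs background data; runtime scales with its size
    explainer_int = shap.TreeExplainer(model, data=X_background)
    # tree_path_dependent: no background data; uses recorded training counts
    explainer_path = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")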
+ + # if our output format requires binary classification to be represented as two outputs then we do that here + if self.model.model_output == "probability_doubled" and self.expected_value is not None: + self.expected_value = [1 - self.expected_value, self.expected_value] + + def __dynamic_expected_value(self, y): + """ This computes the expected value conditioned on the given label value. + """ + + return self.model.predict(self.data, np.ones(self.data.shape[0]) * y).mean(0) + + def __call__(self, X, y=None, interactions=False, check_additivity=True): + + start_time = time.time() + + if isinstance(X, pd.DataFrame): + feature_names = list(X.columns) + else: + feature_names = getattr(self, "data_feature_names", None) + + if not interactions: + v = self.shap_values(X, y=y, from_call=True, check_additivity=check_additivity, approximate=self.approximate) + if isinstance(v, list): + v = np.stack(v, axis=-1) # put outputs at the end + else: + assert not self.approximate, "Approximate computation not yet supported for interaction effects!" + v = self.shap_interaction_values(X) + + # the Explanation object expects an `expected_value` for each row + if hasattr(self.expected_value, "__len__") and len(self.expected_value) > 1: + # `expected_value` is a list / array of numbers, length k, e.g. for multi-output scenarios + # we repeat it N times along the first axis, so ev_tiled.shape == (N, k) + if isinstance(v, list): + num_rows = v[0].shape[0] + else: + num_rows = v.shape[0] + ev_tiled = np.tile(self.expected_value, (num_rows, 1)) + else: + # `expected_value` is a scalar / array of 1 number, so we simply repeat it for every row in `v` + # ev_tiled.shape == (N,) + ev_tiled = np.tile(self.expected_value, v.shape[0]) + + # cf. GH dsgibbons#66, this conversion to numpy array should be done AFTER + # calculation of shap values + if isinstance(X, pd.DataFrame): + X = X.values + elif safe_isinstance(X, "xgboost.core.DMatrix"): + import xgboost + + if version.parse(xgboost.__version__) < version.parse("1.7.0"): # pragma: no cover + # cf. GH #3357 + wmsg = ( + "`shap.Explanation` does not support `xgboost.DMatrix` objects for xgboost < 1.7, " + "so the `data` attribute of the `Explanation` object will be set to None. If " + "you require the `data` attribute (e.g. using `shap.plots`), then either " + "update your xgboost to >=1.7.0 or explicitly set `Explanation.data = X`, where " + "`X` is a numpy or scipy array." + ) + warnings.warn(wmsg) + X = None + else: + X: scipy.sparse.csr_matrix = X.get_data() + + return Explanation( + v, + base_values=ev_tiled, + data=X, + feature_names=feature_names, + compute_time=time.time() - start_time, + ) + + def _validate_inputs(self, X, y, tree_limit, check_additivity): + # see if we have a default tree_limit in place. + if tree_limit is None: + tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit + + if tree_limit < 0 or tree_limit > self.model.values.shape[0]: + tree_limit = self.model.values.shape[0] + # convert dataframes + if isinstance(X, (pd.Series, pd.DataFrame)): + X = X.values + flat_output = False + if len(X.shape) == 1: + flat_output = True + X = X.reshape(1, X.shape[0]) + if X.dtype != self.model.input_dtype: + X = X.astype(self.model.input_dtype) + X_missing = np.isnan(X, dtype=bool) + assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X)) + assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!" 
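Review note: a sketch of the base-value tiling performed in `__call__` above, with hypothetical values; a length-k `expected_value` is repeated for each of the N explained rows:

    import numpy as np

    expected_value = np.array([0.25, 0.75])     # k = 2 model outputs
    ev_tiled = np.tile(expected_value, (3, 1))  # N = 3 explained rows
    assert ev_tiled.shape == (3, 2)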
+ + if self.model.model_output == "log_loss": + if y is None: + emsg = ( + "Both samples and labels must be provided when model_output = \"log_loss\" " + "(i.e. `explainer.shap_values(X, y)`)!" + ) + raise ExplainerError(emsg) + if X.shape[0] != len(y): + emsg = ( + f"The number of labels ({len(y)}) does not match the number of samples " + f"to explain ({X.shape[0]})!" + ) + raise DimensionError(emsg) + + if self.feature_perturbation == "tree_path_dependent": + if not self.model.fully_defined_weighting: + emsg = ( + "The background dataset you provided does " + "not cover all the leaves in the model, " + "so TreeExplainer cannot run with the " + "feature_perturbation=\"tree_path_dependent\" option! " + "Try providing a larger background " + "dataset, no background dataset, or using " + "feature_perturbation=\"interventional\"." + ) + raise ExplainerError(emsg) + + if check_additivity and self.model.model_type == "pyspark": + warnings.warn( + "check_additivity requires us to run predictions which is not supported with " + "spark, " + "ignoring." + " Set check_additivity=False to remove this warning") + check_additivity = False + + return X, y, X_missing, flat_output, tree_limit, check_additivity + + def shap_values(self, X, y=None, tree_limit=None, approximate=False, check_additivity=True, from_call=False): + """ Estimate the SHAP values for a set of samples. + + Parameters + ---------- + X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost) + A matrix of samples (# samples x # features) on which to explain the model's output. + + y : numpy.array + An array of label values for each sample. Used when explaining loss functions. + + tree_limit : None (default) or int + Limit the number of trees used by the model. By default, the limit of the original model + is used (``None``). ``-1`` means no limit. + + approximate : bool + Run fast, but only roughly approximate the Tree SHAP values. This runs a method + previously proposed by Saabas which only considers a single feature ordering. Take care + since this does not have the consistency guarantees of Shapley values and places too + much weight on lower splits in the tree. + + check_additivity : bool + Run a validation check that the sum of the SHAP values equals the output of the model. This + check takes only a small amount of time, and will catch potential unforeseen errors. + Note that this check only runs right now when explaining the margin of the model. + + Returns + ------- + array or list + For models with a single output, this returns a matrix of SHAP values + (# samples x # features). Each row sums to the difference between the model output for that + sample and the expected value of the model output (which is stored in the ``expected_value`` + attribute of the explainer when it is constant). For models with vector outputs, this returns + a list of such matrices, one for each output. + """ + # see if we have a default tree_limit in place. 
+ if tree_limit is None: + tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit + + # shortcut using the C++ version of Tree SHAP in XGBoost, LightGBM, and CatBoost + if self.feature_perturbation == "tree_path_dependent" and self.model.model_type != "internal" and self.data is None: + model_output_vals = None + phi = None + if self.model.model_type == "xgboost": + import xgboost + if not isinstance(X, xgboost.core.DMatrix): + # Retrieve any DMatrix properties if they have been set on the TreeEnsemble Class + dmatrix_props = getattr(self.model, "_xgb_dmatrix_props", {}) + X = xgboost.DMatrix(X, **dmatrix_props) + if tree_limit == -1: + tree_limit = 0 + try: + phi = self.model.original_model.predict( + X, iteration_range=(0, tree_limit), pred_contribs=True, + approx_contribs=approximate, validate_features=False + ) + except ValueError as e: + emsg = ( + "This reshape error is often caused by passing a bad data matrix to SHAP. " + "See https://github.com/shap/shap/issues/580." + ) + raise ValueError(emsg) from e + + if check_additivity and self.model.model_output == "raw": + xgb_tree_limit = tree_limit // self.model.num_stacked_models + model_output_vals = self.model.original_model.predict( + X, iteration_range=(0, xgb_tree_limit), output_margin=True, + validate_features=False + ) + + elif self.model.model_type == "lightgbm": + assert not approximate, "approximate=True is not supported for LightGBM models!" + phi = self.model.original_model.predict(X, num_iteration=tree_limit, pred_contrib=True) + # Note: the data must be joined on the last axis + if self.model.original_model.params['objective'] == 'binary': + if not from_call: + warnings.warn('LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray') + phi = np.concatenate((0-phi, phi), axis=-1) + if phi.shape[1] != X.shape[1] + 1: + try: + phi = phi.reshape(X.shape[0], phi.shape[1]//(X.shape[1]+1), X.shape[1]+1) + except ValueError as e: + emsg = ( + "This reshape error is often caused by passing a bad data matrix to SHAP. " + "See https://github.com/shap/shap/issues/580." + ) + raise ValueError(emsg) from e + + elif self.model.model_type == "catboost": # thanks to the CatBoost team for implementing this... + assert not approximate, "approximate=True is not supported for CatBoost models!" + assert tree_limit == -1, "tree_limit is not yet supported for CatBoost models!" 
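Review note: the shape contract the XGBoost fast path above relies on, as a hedged sketch (`booster` and `dmat` are hypothetical). With `pred_contribs=True`, a single-output booster returns an (N, M+1) array whose last column is the bias term; the code just below strips it off and keeps it as `expected_value`:

    phi = booster.predict(dmat, pred_contribs=True)  # shape (N, M + 1)
    shap_values, expected_value = phi[:, :-1], phi[0, -1]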
+ import catboost + if type(X) != catboost.Pool: + X = catboost.Pool(X, cat_features=self.model.cat_feature_indices) + phi = self.model.original_model.get_feature_importance(data=X, fstr_type='ShapValues') + + # note we pull off the last column and keep it as our expected_value + if phi is not None: + if len(phi.shape) == 3: + self.expected_value = [phi[0, i, -1] for i in range(phi.shape[1])] + out = [phi[:, i, :-1] for i in range(phi.shape[1])] + else: + self.expected_value = phi[0, -1] + out = phi[:, :-1] + + if check_additivity and model_output_vals is not None: + self.assert_additivity(out, model_output_vals) + + return out + + X, y, X_missing, flat_output, tree_limit, check_additivity = self._validate_inputs( + X, y, tree_limit, check_additivity + ) + transform = self.model.get_transform() + + # run the core algorithm using the C extension + assert_import("cext") + phi = np.zeros((X.shape[0], X.shape[1]+1, self.model.num_outputs)) + if not approximate: + _cext.dense_tree_shap( + self.model.children_left, self.model.children_right, self.model.children_default, + self.model.features, self.model.thresholds, self.model.values, self.model.node_sample_weight, + self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit, + self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation], + output_transform_codes[transform], False + ) + else: + _cext.dense_tree_saabas( + self.model.children_left, self.model.children_right, self.model.children_default, + self.model.features, self.model.thresholds, self.model.values, + self.model.max_depth, tree_limit, self.model.base_offset, output_transform_codes[transform], + X, X_missing, y, phi + ) + + out = self._get_shap_output(phi, flat_output) + if check_additivity and self.model.model_output == "raw": + self.assert_additivity(out, self.model.predict(X)) + + return out + + def _get_shap_output(self, phi, flat_output): + """Pull off the last column of ``phi`` and keep it as our expected_value.""" + if self.model.num_outputs == 1: + if self.expected_value is None and self.model.model_output != "log_loss": + self.expected_value = phi[0, -1, 0] + if flat_output: + out = phi[0, :-1, 0] + else: + out = phi[:, :-1, 0] + else: + if self.expected_value is None and self.model.model_output != "log_loss": + self.expected_value = [phi[0, -1, i] for i in range(phi.shape[2])] + if flat_output: + out = [phi[0, :-1, i] for i in range(self.model.num_outputs)] + else: + out = [phi[:, :-1, i] for i in range(self.model.num_outputs)] + + # if our output format requires binary classification to be represented as two outputs then we do that here + if self.model.model_output == "probability_doubled": + out = [-out, out] + return out + + def shap_interaction_values(self, X, y=None, tree_limit=None): + """ Estimate the SHAP interaction values for a set of samples. + + Parameters + ---------- + X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost) + A matrix of samples (# samples x # features) on which to explain the model's output. + + y : numpy.array + An array of label values for each sample. Used when explaining loss functions (not yet supported). + + tree_limit : None (default) or int + Limit the number of trees used by the model. By default, the limit of the original model + is used (``None``). ``-1`` means no limit. + + Returns + ------- + array or list + For models with a single output, this returns a tensor of SHAP values + (# samples x # features x # features). 
The matrix (# features x # features) for each sample sums + to the difference between the model output for that sample and the expected value of the model output + (which is stored in the ``expected_value`` attribute of the explainer). Each row of this matrix sums to the + SHAP value for that feature for that sample. The diagonal entries of the matrix represent the + "main effect" of that feature on the prediction. The symmetric off-diagonal entries represent the + interaction effects between all pairs of features for that sample. + For models with vector outputs, this returns a list of tensors, one for each output. + """ + + assert self.model.model_output == "raw", "Only model_output = \"raw\" is supported for SHAP interaction values right now!" + #assert self.feature_perturbation == "tree_path_dependent", "Only feature_perturbation = \"tree_path_dependent\" is supported for SHAP interaction values right now!" + transform = "identity" + + # see if we have a default tree_limit in place. + if tree_limit is None: + tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit + + # shortcut using the C++ version of Tree SHAP in XGBoost + if ( + self.model.model_type == "xgboost" + and self.feature_perturbation == "tree_path_dependent" + ): + import xgboost + if not isinstance(X, xgboost.core.DMatrix): + X = xgboost.DMatrix(X) + if tree_limit == -1: + tree_limit = 0 + xgb_tree_limit = tree_limit // self.model.num_stacked_models + phi = self.model.original_model.predict(X, iteration_range=(0, xgb_tree_limit), pred_interactions=True, validate_features=False) + + # note we pull off the last column and keep it as our expected_value + if len(phi.shape) == 4: + self.expected_value = [phi[0, i, -1, -1] for i in range(phi.shape[1])] + return [phi[:, i, :-1, :-1] for i in range(phi.shape[1])] + else: + self.expected_value = phi[0, -1, -1] + return phi[:, :-1, :-1] + elif (self.model.model_type == "catboost") and (self.feature_perturbation == "tree_path_dependent"): # thanks again to the CatBoost team for implementing this... + assert tree_limit == -1, "tree_limit is not yet supported for CatBoost models!" 
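Review note: the properties stated in the docstring above, written as quick numpy checks for a single-output model (`explainer` and `X` are hypothetical):

    import numpy as np

    inter = explainer.shap_interaction_values(X)         # (N, M, M), symmetric in the last two axes
    shap_values = inter.sum(axis=2)                      # each row of the M x M matrix sums to that feature's SHAP value
    main_effects = np.diagonal(inter, axis1=1, axis2=2)  # diagonal entries are the main effects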
+ import catboost + if type(X) != catboost.Pool: + X = catboost.Pool(X, cat_features=self.model.cat_feature_indices) + phi = self.model.original_model.get_feature_importance(data=X, fstr_type='ShapInteractionValues') + # note we pull off the last column and keep it as our expected_value + if len(phi.shape) == 4: + self.expected_value = getattr(self, "expected_value", [phi[0, i, -1, -1] for i in range(phi.shape[1])]) + return [phi[:, i, :-1, :-1] for i in range(phi.shape[1])] + else: + self.expected_value = getattr(self, "expected_value", phi[0, -1, -1]) + return phi[:, :-1, :-1] + + X, y, X_missing, flat_output, tree_limit, _ = self._validate_inputs(X, y, tree_limit, False) + # run the core algorithm using the C extension + assert_import("cext") + phi = np.zeros((X.shape[0], X.shape[1]+1, X.shape[1]+1, self.model.num_outputs)) + _cext.dense_tree_shap( + self.model.children_left, self.model.children_right, self.model.children_default, + self.model.features, self.model.thresholds, self.model.values, self.model.node_sample_weight, + self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit, + self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation], + output_transform_codes[transform], True + ) + + return self._get_shap_interactions_output(phi,flat_output) + + def _get_shap_interactions_output(self, phi, flat_output): + """Pull off the last column and keep it as our expected_value""" + if self.model.num_outputs == 1: + # get expected value only if not already set + self.expected_value = getattr(self, "expected_value", phi[0, -1, -1, 0]) + if flat_output: + out = phi[0, :-1, :-1, 0] + else: + out = phi[:, :-1, :-1, 0] + else: + self.expected_value = [phi[0, -1, -1, i] for i in range(phi.shape[3])] + if flat_output: + out = [phi[0, :-1, :-1, i] for i in range(self.model.num_outputs)] + else: + out = [phi[:, :-1, :-1, i] for i in range(self.model.num_outputs)] + return out + + def assert_additivity(self, phi, model_output): + + def check_sum(sum_val, model_output): + diff = np.abs(sum_val - model_output) + if np.max(diff / (np.abs(sum_val) + 1e-2)) > 1e-2: + ind = np.argmax(diff) + err_msg = "Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the " \ + "explainer is the same shape that the model was trained on. If your data shape is correct " \ + "then please report this on GitHub." + if self.feature_perturbation != "interventional": + err_msg += " Consider retrying with the feature_perturbation='interventional' option." + err_msg += " This check failed because for one of the samples the sum of the SHAP values" \ + " was {:f}, while the model output was {:f}. If this difference is acceptable" \ + " you can set check_additivity=False to disable this check.".format(sum_val[ind], model_output[ind]) + raise ExplainerError(err_msg) + + if isinstance(phi, list): + for i in range(len(phi)): + check_sum(self.expected_value[i] + phi[i].sum(-1), model_output[:,i]) + else: + check_sum(self.expected_value + phi.sum(-1), model_output) + + @staticmethod + def supports_model_with_masker(model, masker): + """ Determines if this explainer can handle the given model. + + This is an abstract static method meant to be implemented by each subclass. + """ + + if not isinstance(masker, (maskers.Independent)) and masker is not None: + return False + + try: + TreeEnsemble(model) + except Exception: + return False + return True + + +class TreeEnsemble: + """ An ensemble of decision trees. 
+ + This object provides a common interface to many different types of models. + """ + + def __init__(self, model, data=None, data_missing=None, model_output=None): + self.model_type = "internal" + self.trees = None + self.base_offset = 0 + self.model_output = model_output + self.objective = None # what we explain when explaining the loss of the model + self.tree_output = None # what are the units of the values in the leaves of the trees + self.internal_dtype = np.float64 + self.input_dtype = np.float64 # for sklearn we need to use np.float32 to always get exact matches to their predictions + self.data = data + self.data_missing = data_missing + self.fully_defined_weighting = True # does the background dataset land in every leaf (making it valid for the tree_path_dependent method) + self.tree_limit = None # used for limiting the number of trees we use by default (like from early stopping) + self.num_stacked_models = 1 # If this is greater than 1 it means we have multiple stacked models with the same number of trees in each model (XGBoost multi-output style) + self.cat_feature_indices = None # If this is set it tells us which features are treated categorically + + # we use names like keras + objective_name_map = { + "mse": "squared_error", + "variance": "squared_error", + "friedman_mse": "squared_error", + "reg:linear": "squared_error", + "reg:squarederror": "squared_error", + "regression": "squared_error", + "regression_l2": "squared_error", + "mae": "absolute_error", + "gini": "binary_crossentropy", + "entropy": "binary_crossentropy", + "reg:logistic": "binary_crossentropy", + "binary:logistic": "binary_crossentropy", + "binary_logloss": "binary_crossentropy", + "binary": "binary_crossentropy", + } + + tree_output_name_map = { + "regression": "raw_value", + "regression_l2": "squared_error", + "reg:linear": "raw_value", + "reg:squarederror": "raw_value", + "reg:logistic": "log_odds", + "binary:logistic": "log_odds", + "binary_logloss": "log_odds", + "binary": "log_odds", + } + + if isinstance(model, dict) and "trees" in model: + # This allows a dictionary to be passed that represents the model. + # this dictionary has several numerical parameters and also a list of trees + # where each tree is a dictionary describing that tree + if "internal_dtype" in model: + self.internal_dtype = model["internal_dtype"] + if "input_dtype" in model: + self.input_dtype = model["input_dtype"] + if "objective" in model: + self.objective = model["objective"] + if "tree_output" in model: + self.tree_output = model["tree_output"] + if "base_offset" in model: + self.base_offset = model["base_offset"] + self.trees = [SingleTree(t, data=data, data_missing=data_missing) for t in model["trees"]] + elif isinstance(model, list) and isinstance(model[0], SingleTree): # old-style direct-load format + self.trees = model + elif safe_isinstance( + model, + [ + "sklearn.ensemble.RandomForestRegressor", + "sklearn.ensemble.forest.RandomForestRegressor", + "econml.grf._base_grf.BaseGRF", + ], + ): + assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?" 
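Review note: a minimal, hypothetical instance of the dictionary model format parsed above — a single stump splitting feature 0 at 0.5, using the keys the `SingleTree` dict branch expects (a sketch, assuming these keys suffice):

    import numpy as np
    import shap

    stump = {
        "children_left": np.array([1, -1, -1]),      # -1 marks a leaf
        "children_right": np.array([2, -1, -1]),
        "children_default": np.array([1, -1, -1]),   # missing values go left
        "features": np.array([0, -2, -2]),           # -2 marks "no split"
        "thresholds": np.array([0.5, -2.0, -2.0]),
        "values": np.array([[0.0], [-1.0], [1.0]]),  # one output per node
        "node_sample_weight": np.array([10.0, 5.0, 5.0]),
    }
    explainer = shap.TreeExplainer({"trees": [stump]})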
+ self.internal_dtype = model.estimators_[0].tree_.value.dtype.type + self.input_dtype = np.float32 + scaling = 1.0 / len(model.estimators_) # output is average of trees + self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_] + self.objective = objective_name_map.get(model.criterion, None) + self.tree_output = "raw_value" + elif safe_isinstance( + model, + [ + "sklearn.ensemble.IsolationForest", + "sklearn.ensemble._iforest.IsolationForest", + ], + ): + self.dtype = np.float32 + scaling = 1.0 / len(model.estimators_) # output is average of trees + self.trees = [IsoTree(e.tree_, f, scaling=scaling, data=data, data_missing=data_missing) for e, f in zip(model.estimators_, model.estimators_features_)] + self.tree_output = "raw_value" + elif safe_isinstance(model, ["pyod.models.iforest.IForest"]): + self.dtype = np.float32 + scaling = 1.0 / len(model.estimators_) # output is average of trees + self.trees = [IsoTree(e.tree_, f, scaling=scaling, data=data, data_missing=data_missing) for e, f in zip(model.detector_.estimators_, model.detector_.estimators_features_)] + self.tree_output = "raw_value" + elif safe_isinstance(model, "skopt.learning.forest.RandomForestRegressor"): + assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?" + self.internal_dtype = model.estimators_[0].tree_.value.dtype.type + self.input_dtype = np.float32 + scaling = 1.0 / len(model.estimators_) # output is average of trees + self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_] + self.objective = objective_name_map.get(model.criterion, None) + self.tree_output = "raw_value" + elif safe_isinstance( + model, + [ + "sklearn.ensemble.ExtraTreesRegressor", + "sklearn.ensemble.forest.ExtraTreesRegressor", + ], + ): + assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?" + self.internal_dtype = model.estimators_[0].tree_.value.dtype.type + self.input_dtype = np.float32 + scaling = 1.0 / len(model.estimators_) # output is average of trees + self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_] + self.objective = objective_name_map.get(model.criterion, None) + self.tree_output = "raw_value" + elif safe_isinstance(model, "skopt.learning.forest.ExtraTreesRegressor"): + assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?" 
+ self.internal_dtype = model.estimators_[0].tree_.value.dtype.type + self.input_dtype = np.float32 + scaling = 1.0 / len(model.estimators_) # output is average of trees + self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_] + self.objective = objective_name_map.get(model.criterion, None) + self.tree_output = "raw_value" + elif safe_isinstance( + model, + [ + "sklearn.tree.DecisionTreeRegressor", + "sklearn.tree.tree.DecisionTreeRegressor", + "econml.grf._base_grftree.GRFTree", + ], + ): + self.internal_dtype = model.tree_.value.dtype.type + self.input_dtype = np.float32 + self.trees = [SingleTree(model.tree_, data=data, data_missing=data_missing)] + self.objective = objective_name_map.get(model.criterion, None) + self.tree_output = "raw_value" + elif safe_isinstance( + model, + [ + "sklearn.tree.DecisionTreeClassifier", + "sklearn.tree.tree.DecisionTreeClassifier", + ], + ): + self.internal_dtype = model.tree_.value.dtype.type + self.input_dtype = np.float32 + self.trees = [SingleTree(model.tree_, normalize=True, data=data, data_missing=data_missing)] + self.objective = objective_name_map.get(model.criterion, None) + self.tree_output = "probability" + elif safe_isinstance( + model, + [ + "sklearn.ensemble.RandomForestClassifier", + "sklearn.ensemble.forest.RandomForestClassifier", + ], + ): + assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?" + self.internal_dtype = model.estimators_[0].tree_.value.dtype.type + self.input_dtype = np.float32 + scaling = 1.0 / len(model.estimators_) # output is average of trees + self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_] + self.objective = objective_name_map.get(model.criterion, None) + self.tree_output = "probability" + elif safe_isinstance( + model, + [ + "sklearn.ensemble.ExtraTreesClassifier", + "sklearn.ensemble.forest.ExtraTreesClassifier", + ], + ): # TODO: add unit test for this case + assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?" 
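Review note: what `normalize=True` does in the classifier branches above — each node's per-class counts are divided by their row sum so leaves hold probabilities, matching the normalization applied in `SingleTree` further down:

    import numpy as np

    values = np.array([[3.0, 1.0]])        # class counts at one node
    values = (values.T / values.sum(1)).T  # -> array([[0.75, 0.25]])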
+ self.internal_dtype = model.estimators_[0].tree_.value.dtype.type + self.input_dtype = np.float32 + scaling = 1.0 / len(model.estimators_) # output is average of trees + self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_] + self.objective = objective_name_map.get(model.criterion, None) + self.tree_output = "probability" + elif safe_isinstance( + model, + [ + "sklearn.ensemble.GradientBoostingRegressor", + "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor", + ], + ): + self.input_dtype = np.float32 + + # currently we only support the mean and quantile estimators + if safe_isinstance( + model.init_, + [ + "sklearn.ensemble.MeanEstimator", + "sklearn.ensemble.gradient_boosting.MeanEstimator", + ], + ): + self.base_offset = model.init_.mean + elif safe_isinstance( + model.init_, + [ + "sklearn.ensemble.QuantileEstimator", + "sklearn.ensemble.gradient_boosting.QuantileEstimator", + ], + ): + self.base_offset = model.init_.quantile + elif safe_isinstance(model.init_, "sklearn.dummy.DummyRegressor"): + self.base_offset = model.init_.constant_[0] + else: + emsg = f"Unsupported init model type: {type(model.init_)}" + raise InvalidModelError(emsg) + + self.trees = [SingleTree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e in model.estimators_[:,0]] + self.objective = objective_name_map.get(model.criterion, None) + self.tree_output = "raw_value" + elif safe_isinstance(model, ["sklearn.ensemble.HistGradientBoostingRegressor"]): + # cf. GH #1028 for implementation notes + import sklearn + if self.model_output == "predict": + self.model_output = "raw" + self.input_dtype = sklearn.ensemble._hist_gradient_boosting.common.X_DTYPE + self.base_offset = model._baseline_prediction + self.trees = [] + for p in model._predictors: + nodes = p[0].nodes + # each node has values: ('value', 'count', 'feature_idx', 'threshold', 'missing_go_to_left', 'left', 'right', 'gain', 'depth', 'is_leaf', 'bin_threshold') + tree = { + "children_left": np.array([-1 if n[9] else n[5] for n in nodes]), + "children_right": np.array([-1 if n[9] else n[6] for n in nodes]), + "children_default": np.array([-1 if n[9] else (n[5] if n[4] else n[6]) for n in nodes]), + "features": np.array([-2 if n[9] else n[2] for n in nodes]), + "thresholds": np.array([n[3] for n in nodes], dtype=np.float64), + "values": np.array([[n[0]] for n in nodes], dtype=np.float64), + "node_sample_weight": np.array([n[1] for n in nodes], dtype=np.float64), + } + self.trees.append(SingleTree(tree, data=data, data_missing=data_missing)) + self.objective = objective_name_map.get(model.loss, None) + self.tree_output = "raw_value" + elif safe_isinstance(model, ["sklearn.ensemble.HistGradientBoostingClassifier"]): + # cf. GH #1028 for implementation notes + import sklearn + self.base_offset = model._baseline_prediction + has_len = hasattr(self.base_offset, "__len__") + # Note for newer sklearn versions, the base_offset is an array even for binary classification + if has_len and self.base_offset.shape == (1, 1): + self.base_offset = self.base_offset[0, 0] + has_len = False + if has_len and self.model_output != "raw": + emsg = ( + "Multi-output HistGradientBoostingClassifier models are not yet supported unless " + "model_output=\"raw\". See GitHub issue #1028." 
+ ) + raise NotImplementedError(emsg) + self.input_dtype = sklearn.ensemble._hist_gradient_boosting.common.X_DTYPE + self.num_stacked_models = len(model._predictors[0]) + if self.model_output == "predict_proba": + if self.num_stacked_models == 1: + self.model_output = "probability_doubled" # with predict_proba we need to double the outputs to match + else: + self.model_output = "probability" + self.trees = [] + for p in model._predictors: + for i in range(self.num_stacked_models): + nodes = p[i].nodes + # each node has values: ('value', 'count', 'feature_idx', 'threshold', 'missing_go_to_left', 'left', 'right', 'gain', 'depth', 'is_leaf', 'bin_threshold') + tree = { + "children_left": np.array([-1 if n[9] else n[5] for n in nodes]), + "children_right": np.array([-1 if n[9] else n[6] for n in nodes]), + "children_default": np.array([-1 if n[9] else (n[5] if n[4] else n[6]) for n in nodes]), + "features": np.array([-2 if n[9] else n[2] for n in nodes]), + "thresholds": np.array([n[3] for n in nodes], dtype=np.float64), + "values": np.array([[n[0]] for n in nodes], dtype=np.float64), + "node_sample_weight": np.array([n[1] for n in nodes], dtype=np.float64), + } + self.trees.append(SingleTree(tree, data=data, data_missing=data_missing)) + self.objective = objective_name_map.get(model.loss, None) + self.tree_output = "log_odds" + elif safe_isinstance( + model, + [ + "sklearn.ensemble.GradientBoostingClassifier", + "sklearn.ensemble._gb.GradientBoostingClassifier", + "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier", + ], + ): + self.input_dtype = np.float32 + + # TODO: deal with estimators for each class + if model.estimators_.shape[1] > 1: + emsg = "GradientBoostingClassifier is only supported for binary classification right now!" + raise InvalidModelError(emsg) + + # currently we only support the logs odds estimator + if safe_isinstance( + model.init_, + [ + "sklearn.ensemble.LogOddsEstimator", + "sklearn.ensemble.gradient_boosting.LogOddsEstimator", + ], + ): + self.base_offset = model.init_.prior + self.tree_output = "log_odds" + elif safe_isinstance(model.init_, "sklearn.dummy.DummyClassifier"): + self.base_offset = scipy.special.logit(model.init_.class_prior_[1]) # with two classes the trees only model the second class. + self.tree_output = "log_odds" + else: + emsg = f"Unsupported init model type: {type(model.init_)}" + raise InvalidModelError(emsg) + + self.trees = [SingleTree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e in model.estimators_[:,0]] + self.objective = objective_name_map.get(model.criterion, None) + elif "pyspark.ml" in str(type(model)): + assert_import("pyspark") + self.model_type = "pyspark" + # model._java_obj.getImpurity() can be gini, entropy or variance. 
+ self.objective = objective_name_map.get(model._java_obj.getImpurity(), None) + if "Classification" in str(type(model)): + normalize = True + self.tree_output = "probability" + else: + normalize = False + self.tree_output = "raw_value" + # Spark Random forest, create 1 weighted (avg) tree per sub-model + if safe_isinstance( + model, + [ + "pyspark.ml.classification.RandomForestClassificationModel", + "pyspark.ml.regression.RandomForestRegressionModel", + ], + ): + sum_weight = sum(model.treeWeights) # output is average of trees + self.trees = [ + SingleTree(tree, normalize=normalize, scaling=model.treeWeights[i] / sum_weight) + for i, tree in enumerate(model.trees) + ] + # Spark GBT, create 1 weighted (learning rate) tree per sub-model + elif safe_isinstance( + model, + [ + "pyspark.ml.classification.GBTClassificationModel", + "pyspark.ml.regression.GBTRegressionModel", + ], + ): + self.objective = "squared_error" # GBT subtree use the variance + self.tree_output = "raw_value" + self.trees = [ + SingleTree(tree, normalize=False, scaling=model.treeWeights[i]) + for i, tree in enumerate(model.trees) + ] + # Spark Basic model (single tree) + elif safe_isinstance( + model, + [ + "pyspark.ml.classification.DecisionTreeClassificationModel", + "pyspark.ml.regression.DecisionTreeRegressionModel", + ], + ): + self.trees = [SingleTree(model, normalize=normalize, scaling=1)] + else: + emsg = f"Unsupported Spark model type: {type(model)}" + raise NotImplementedError(emsg) + elif safe_isinstance(model, "xgboost.core.Booster"): + self.original_model = model + self.model_type = "xgboost" + xgb_loader = XGBTreeModelLoader(self.original_model) + self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing) + self.base_offset = xgb_loader.base_score + self.objective = objective_name_map.get(xgb_loader.name_obj, None) + self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None) + if xgb_loader.num_class > 0: + self.num_stacked_models = xgb_loader.num_class + elif safe_isinstance(model, "xgboost.sklearn.XGBClassifier"): + self.input_dtype = np.float32 + self.model_type = "xgboost" + self.original_model = model.get_booster() + xgb_loader = XGBTreeModelLoader(self.original_model) + self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing) + self.base_offset = xgb_loader.base_score + self.objective = objective_name_map.get(xgb_loader.name_obj, None) + self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None) + # 'best_ntree_limit' is problematic + # https://github.com/dmlc/xgboost/issues/6615 + if hasattr(model, 'best_iteration'): + trees_per_iteration = xgb_loader.num_class if xgb_loader.num_class > 0 else 1 + self.tree_limit = (getattr(model, "best_iteration", None) + 1) * trees_per_iteration + else: + self.tree_limit = getattr(model, "best_ntree_limit", None) + if xgb_loader.num_class > 0: + self.num_stacked_models = xgb_loader.num_class + if self.model_output == "predict_proba": + if self.num_stacked_models == 1: + self.model_output = "probability_doubled" # with predict_proba we need to double the outputs to match + else: + self.model_output = "probability" + # Some properties of the sklearn API are passed to a DMatrix object in xgboost + # We need to make sure we do the same here - GH #3313 + self._xgb_dmatrix_props = get_xgboost_dmatrix_properties(model) + elif safe_isinstance(model, "xgboost.sklearn.XGBRegressor"): + self.original_model = model.get_booster() + self.model_type = "xgboost" + xgb_loader = XGBTreeModelLoader(self.original_model) + self.trees 
= xgb_loader.get_trees(data=data, data_missing=data_missing) + self.base_offset = xgb_loader.base_score + self.objective = objective_name_map.get(xgb_loader.name_obj, None) + self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None) + self.tree_limit = getattr(model, "best_ntree_limit", None) + if xgb_loader.num_class > 0: + self.num_stacked_models = xgb_loader.num_class + # Some properties of the sklearn API are passed to a DMatrix object in xgboost + # We need to make sure we do the same here - GH #3313 + self._xgb_dmatrix_props = get_xgboost_dmatrix_properties(model) + elif safe_isinstance(model, "xgboost.sklearn.XGBRanker"): + self.original_model = model.get_booster() + self.model_type = "xgboost" + xgb_loader = XGBTreeModelLoader(self.original_model) + self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing) + self.base_offset = xgb_loader.base_score + # Note: for ranker, leaving tree_output and objective as None as they + # are not implemented in native code yet + self.tree_limit = getattr(model, "best_ntree_limit", None) + if xgb_loader.num_class > 0: + self.num_stacked_models = xgb_loader.num_class + # Some properties of the sklearn API are passed to a DMatrix object in xgboost + # We need to make sure we do the same here - GH #3313 + self._xgb_dmatrix_props = get_xgboost_dmatrix_properties(model) + elif safe_isinstance(model, "lightgbm.basic.Booster"): + assert_import("lightgbm") + self.model_type = "lightgbm" + self.original_model = model + tree_info = self.original_model.dump_model()["tree_info"] + try: + self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info] + except Exception: + self.trees = None # we get here because the cext can't handle categorical splits yet + + self.objective = objective_name_map.get(model.params.get("objective", "regression"), None) + self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None) + + elif safe_isinstance(model, "gpboost.basic.Booster"): + assert_import("gpboost") + self.model_type = "gpboost" + self.original_model = model + tree_info = self.original_model.dump_model()["tree_info"] + try: + self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info] + except Exception: + self.trees = None # we get here because the cext can't handle categorical splits yet + + self.objective = objective_name_map.get(model.params.get("objective", "regression"), None) + self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None) + + elif safe_isinstance(model, "lightgbm.sklearn.LGBMRegressor"): + assert_import("lightgbm") + self.model_type = "lightgbm" + self.original_model = model.booster_ + tree_info = self.original_model.dump_model()["tree_info"] + try: + self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info] + except Exception: + self.trees = None # we get here because the cext can't handle categorical splits yet + self.objective = objective_name_map.get(model.objective, None) + self.tree_output = tree_output_name_map.get(model.objective, None) + if model.objective is None: + self.objective = "squared_error" + self.tree_output = "raw_value" + elif safe_isinstance(model, "lightgbm.sklearn.LGBMRanker"): + assert_import("lightgbm") + self.model_type = "lightgbm" + self.original_model = model.booster_ + tree_info = self.original_model.dump_model()["tree_info"] + try: + self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info] + except Exception: + 
self.trees = None # we get here because the cext can't handle categorical splits yet + # Note: for ranker, leaving tree_output and objective as None as they + # are not implemented in native code yet + elif safe_isinstance(model, "lightgbm.sklearn.LGBMClassifier"): + assert_import("lightgbm") + self.model_type = "lightgbm" + if model.n_classes_ > 2: + self.num_stacked_models = model.n_classes_ + self.original_model = model.booster_ + tree_info = self.original_model.dump_model()["tree_info"] + try: + self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info] + except Exception: + self.trees = None # we get here because the cext can't handle categorical splits yet + self.objective = objective_name_map.get(model.objective, None) + self.tree_output = tree_output_name_map.get(model.objective, None) + if model.objective is None: + self.objective = "binary_crossentropy" + self.tree_output = "log_odds" + elif safe_isinstance(model, "catboost.core.CatBoostRegressor"): + assert_import("catboost") + self.model_type = "catboost" + self.original_model = model + self.cat_feature_indices = model.get_cat_feature_indices() + try: + cb_loader = CatBoostTreeModelLoader(model) + self.trees = cb_loader.get_trees(data=data, data_missing=data_missing) + except Exception: + self.trees = None # we get here because the cext can't handle categorical splits yet + elif safe_isinstance(model, "catboost.core.CatBoostClassifier"): + assert_import("catboost") + self.model_type = "catboost" + self.original_model = model + self.input_dtype = np.float32 + try: + cb_loader = CatBoostTreeModelLoader(model) + self.trees = cb_loader.get_trees(data=data, data_missing=data_missing) + except Exception: + self.trees = None # we get here because the cext can't handle categorical splits yet + self.tree_output = "log_odds" + self.objective = "binary_crossentropy" + self.cat_feature_indices = model.get_cat_feature_indices() + elif safe_isinstance(model, "catboost.core.CatBoost"): + assert_import("catboost") + self.model_type = "catboost" + self.original_model = model + self.cat_feature_indices = model.get_cat_feature_indices() + elif safe_isinstance(model, "imblearn.ensemble._forest.BalancedRandomForestClassifier"): + self.input_dtype = np.float32 + scaling = 1.0 / len(model.estimators_) # output is average of trees + self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_] + self.objective = objective_name_map.get(model.criterion, None) + self.tree_output = "probability" + elif safe_isinstance( + model, + [ + "ngboost.ngboost.NGBoost", + "ngboost.api.NGBRegressor", + "ngboost.api.NGBClassifier", + ], + ): + assert model.base_models, "The NGBoost model has empty `base_models`! Have you called `model.fit`?" + if self.model_output == "raw": + param_idx = 0 # default to the first parameter of the output distribution + warnings.warn("Translating model_output=\"raw\" to model_output=0 for the 0-th parameter in the distribution. Use model_output=0 directly to avoid this warning.") + elif isinstance(self.model_output, int): + param_idx = self.model_output + self.model_output = "raw" # note that after loading we have a new model_output type + assert safe_isinstance(model.base_models[0][param_idx], ["sklearn.tree.DecisionTreeRegressor", "sklearn.tree.tree.DecisionTreeRegressor"]), "You must use default_tree_learner!" 
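Review note: a sketch of the column re-mapping built just below. ngboost fits each tree on a feature subset (`col_idxs`), so tree-local feature indices must be mapped back to the original columns (values here are hypothetical):

    n_features = 4
    col_idxs = [2, 0]  # subset used by one hypothetical tree
    missing = [i for i in range(n_features) if i not in col_idxs]
    mapping = dict(enumerate(list(col_idxs) + missing))
    # mapping == {0: 2, 1: 0, 2: 1, 3: 3}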
+ shap_trees = [trees[param_idx] for trees in model.base_models] + self.internal_dtype = shap_trees[0].tree_.value.dtype.type + self.input_dtype = np.float32 + scaling = - model.learning_rate * np.array(model.scalings) # output is weighted average of trees + # ngboost reorders the features, so we need to map them back to the original order + missing_col_idxs = [[i for i in range(model.n_features) if i not in col_idx] for col_idx in model.col_idxs] + feature_mapping = [{i: col_idx for i, col_idx in enumerate(list(col_idxs) + missing_col_idx)} + for col_idxs, missing_col_idx in zip(model.col_idxs, missing_col_idxs)] + self.trees = [] + for idx, shap_tree in enumerate(shap_trees): + tree_ = shap_tree.tree_ + values = tree_.value.reshape(tree_.value.shape[0], tree_.value.shape[1] * tree_.value.shape[2]) + values = values * scaling[idx] + tree = { + "children_left": tree_.children_left.astype(np.int32), + "children_right": tree_.children_right.astype(np.int32), + "children_default": tree_.children_left, + "features": np.array([feature_mapping[idx].get(i, i) for i in tree_.feature]), + "thresholds": tree_.threshold.astype(np.float64), + "values": values, + "node_sample_weight": tree_.weighted_n_node_samples.astype(np.float64) + } + self.trees.append(SingleTree(tree, data=data, data_missing=data_missing)) + self.objective = objective_name_map.get(shap_trees[0].criterion, None) + self.tree_output = "raw_value" + self.base_offset = model.init_params[param_idx] + else: + raise InvalidModelError("Model type not yet supported by TreeExplainer: " + str(type(model))) + + # build a dense numpy version of all the tree objects + if self.trees is not None and self.trees: + max_nodes = np.max([len(t.values) for t in self.trees]) + assert len(np.unique([t.values.shape[1] for t in self.trees])) == 1, "All trees in the ensemble must have the same output dimension!" + num_trees = len(self.trees) + if self.num_stacked_models > 1: + assert len(self.trees) % self.num_stacked_models == 0, "Only stacked models with equal numbers of trees are supported!" + assert self.trees[0].values.shape[1] == 1, "Only stacked models with single outputs per model are supported!" + self.num_outputs = self.num_stacked_models + else: + self.num_outputs = self.trees[0].values.shape[1] + + # important to be -1 in unused sections!! This way we can tell which entries are valid. 
+ self.children_left = -np.ones((num_trees, max_nodes), dtype=np.int32) + self.children_right = -np.ones((num_trees, max_nodes), dtype=np.int32) + self.children_default = -np.ones((num_trees, max_nodes), dtype=np.int32) + self.features = -np.ones((num_trees, max_nodes), dtype=np.int32) + + self.thresholds = np.zeros((num_trees, max_nodes), dtype=self.internal_dtype) + self.values = np.zeros((num_trees, max_nodes, self.num_outputs), dtype=self.internal_dtype) + self.node_sample_weight = np.zeros((num_trees, max_nodes), dtype=self.internal_dtype) + + for i in range(num_trees): + self.children_left[i,:len(self.trees[i].children_left)] = self.trees[i].children_left + self.children_right[i,:len(self.trees[i].children_right)] = self.trees[i].children_right + self.children_default[i,:len(self.trees[i].children_default)] = self.trees[i].children_default + self.features[i,:len(self.trees[i].features)] = self.trees[i].features + self.thresholds[i,:len(self.trees[i].thresholds)] = self.trees[i].thresholds + if self.num_stacked_models > 1: + # stack_pos = int(i // (num_trees / self.num_stacked_models)) + stack_pos = i % self.num_stacked_models + self.values[i,:len(self.trees[i].values[:,0]),stack_pos] = self.trees[i].values[:,0] + else: + self.values[i,:len(self.trees[i].values)] = self.trees[i].values + self.node_sample_weight[i,:len(self.trees[i].node_sample_weight)] = self.trees[i].node_sample_weight + + # ensure that the passed background dataset lands in every leaf + if np.min(self.trees[i].node_sample_weight) <= 0: + self.fully_defined_weighting = False + + self.num_nodes = np.array([len(t.values) for t in self.trees], dtype=np.int32) + self.max_depth = np.max([t.max_depth for t in self.trees]) + + # make sure the base offset is a 1D array + if not hasattr(self.base_offset, "__len__") or len(self.base_offset) == 0: + self.base_offset = (np.ones(self.num_outputs) * self.base_offset).astype(self.internal_dtype) + self.base_offset = self.base_offset.flatten() + assert len(self.base_offset) == self.num_outputs + + def get_transform(self): + """ A consistent interface to make predictions from this model. + """ + if self.model_output == "raw": + transform = "identity" + elif self.model_output in ("probability", "probability_doubled"): + if self.tree_output == "log_odds": + transform = "logistic" + elif self.tree_output == "probability": + transform = "identity" + else: + emsg = ( + "model_output = \"probability\" is not yet supported when model.tree_output = " + f"\"{self.tree_output}\"!" + ) + raise NotImplementedError(emsg) + elif self.model_output == "log_loss": + if self.objective == "squared_error": + transform = "squared_loss" + elif self.objective == "binary_crossentropy": + transform = "logistic_nlogloss" + else: + emsg = ( + "model_output = \"log_loss\" is not yet supported when model.objective = " + f"\"{self.objective}\"!" + ) + raise NotImplementedError(emsg) + else: + emsg = ( + f"Unrecognized model_output parameter value: {str(self.model_output)}! " + f"If `model.{str(self.model_output)}` is a valid function, open a Github issue to ask " + "that this method be supported. If you want 'predict_proba' just use 'probability' for now." + ) + raise ValueError(emsg) + + return transform + + def predict(self, X, y=None, output=None, tree_limit=None): + """ A consistent interface to make predictions from this model. + + Parameters + ---------- + tree_limit : None (default) or int + Limit the number of trees used by the model. 
+
+    def predict(self, X, y=None, output=None, tree_limit=None):
+        """ A consistent interface to make predictions from this model.
+
+        Parameters
+        ----------
+        tree_limit : None (default) or int
+            Limit the number of trees used by the model. By default, ``None`` means
+            use the tree limit stored in the original model, and -1 means no limit.
+        """
+        if output is None:
+            output = self.model_output
+
+        if self.model_type == "pyspark":
+            # TODO: support predict for pyspark
+            raise NotImplementedError(
+                "Predict is not yet implemented for pyspark models, so the 'interventional' "
+                "feature_perturbation option is unavailable for them."
+            )
+
+        # see if we have a default tree_limit in place
+        if tree_limit is None:
+            tree_limit = -1 if self.tree_limit is None else self.tree_limit
+
+        # convert dataframes
+        if isinstance(X, (pd.Series, pd.DataFrame)):
+            X = X.values
+        flat_output = False
+        if len(X.shape) == 1:
+            flat_output = True
+            X = X.reshape(1, X.shape[0])
+        if X.dtype.type != self.input_dtype:
+            X = X.astype(self.input_dtype)
+        X_missing = np.isnan(X, dtype=bool)
+        assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))
+        assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"
+
+        if tree_limit < 0 or tree_limit > self.values.shape[0]:
+            tree_limit = self.values.shape[0]
+
+        if output == "log_loss":
+            assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"
+            assert X.shape[0] == len(y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (len(y), X.shape[0])
+        transform = self.get_transform()
+        assert_import("cext")
+        output = np.zeros((X.shape[0], self.num_outputs))
+        _cext.dense_tree_predict(
+            self.children_left, self.children_right, self.children_default,
+            self.features, self.thresholds, self.values,
+            self.max_depth, tree_limit, self.base_offset, output_transform_codes[transform],
+            X, X_missing, y, output
+        )
+
+        # drop dimensions we don't need
+        if flat_output:
+            if self.num_outputs == 1:
+                return output.flatten()[0]
+            else:
+                return output.reshape(-1, self.num_outputs)
+        else:
+            if self.num_outputs == 1:
+                return output.flatten()
+            else:
+                return output
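+
+
+# Example usage of the ``predict`` method above (illustrative only; ``ensemble``
+# stands for an already-constructed instance of the ensemble wrapper class, and
+# ``X`` for a numpy array or DataFrame of samples):
+#
+#   margins = ensemble.predict(X)                    # all trees, raw output
+#   margins_50 = ensemble.predict(X, tree_limit=50)  # only the first 50 trees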
+
+
+class SingleTree:
+    """A single decision tree.
+
+    The primary point of this object is to parse many different tree types into a common format.
+
+    Attributes
+    ----------
+    children_left : numpy.array
+        A 1d array of length #nodes. The index ``i`` of this array contains the index of
+        the left-child of the ``i-th`` node in the tree. An index of -1 is used to
+        represent that the ``i-th`` node is a leaf/terminal node.
+
+    children_right : numpy.array
+        Same as ``children_left``, except it contains the index of the right child of
+        each ``i-th`` node in the tree.
+
+    children_default : numpy.array
+        A 1d numpy array of length #nodes. The index ``i`` of this array contains either
+        the index of the left-child / right-child of the ``i-th`` node in the tree,
+        depending on whether the default split (for handling missing values) is left /
+        right. An index of -1 is used to represent that the ``i-th`` node is a leaf
+        node.
+
+    features : numpy.array
+        A 1d numpy array of length #nodes. The value at the ``i-th`` position is the
+        index of the feature chosen for the split at node ``i``. Leaf nodes have no
+        splits, so their entry is a negative sentinel value.
+
+    thresholds : numpy.array
+        A 1d numpy array of length #nodes. The value at the ``i-th`` position is the
+        threshold used for the split at node ``i``. Leaf nodes have no thresholds, so
+        their entry is a sentinel value.
+
+    values : numpy.array
+        A numpy array of shape (#nodes, #outputs). Row ``i`` of this array contains the
+        raw predicted value(s) that would be produced by node ``i`` if it were a leaf node.
+
+    node_sample_weight : numpy.array
+        A 1d numpy array of length #nodes. The index ``i`` contains the number of
+        records (usually from the training data) that fall into node ``i``.
+
+    max_depth : int
+        The max depth of the tree.
+    """
+    def __init__(self, tree, normalize=False, scaling=1.0, data=None, data_missing=None):
+        assert_import("cext")
+
+        if safe_isinstance(tree, ["sklearn.tree._tree.Tree", "econml.tree._tree.Tree"]):
+            self.children_left = tree.children_left.astype(np.int32)
+            self.children_right = tree.children_right.astype(np.int32)
+            self.children_default = self.children_left  # missing values not supported in sklearn
+            self.features = tree.feature.astype(np.int32)
+            self.thresholds = tree.threshold.astype(np.float64)
+            self.values = tree.value.reshape(tree.value.shape[0], tree.value.shape[1] * tree.value.shape[2])
+            if normalize:
+                self.values = (self.values.T / self.values.sum(1)).T
+            self.values = self.values * scaling
+            self.node_sample_weight = tree.weighted_n_node_samples.astype(np.float64)
+
+        elif isinstance(tree, dict) and "features" in tree:
+            self.children_left = tree["children_left"].astype(np.int32)
+            self.children_right = tree["children_right"].astype(np.int32)
+            self.children_default = tree["children_default"].astype(np.int32)
+            self.features = tree["features"].astype(np.int32)
+            self.thresholds = tree["thresholds"]
+            self.values = tree["values"] * scaling
+            self.node_sample_weight = tree["node_sample_weight"]
+
+        # deprecated dictionary support (with sklearn singular style "feature" and "value" names)
+        elif isinstance(tree, dict) and "children_left" in tree:
+            self.children_left = tree["children_left"].astype(np.int32)
+            self.children_right = tree["children_right"].astype(np.int32)
+            self.children_default = tree["children_default"].astype(np.int32)
+            self.features = tree["feature"].astype(np.int32)
+            self.thresholds = tree["threshold"]
+            self.values = tree["value"] * scaling
+            self.node_sample_weight = tree["node_sample_weight"]
+
+        elif safe_isinstance(
+            tree,
+            [
+                "pyspark.ml.classification.DecisionTreeClassificationModel",
+                "pyspark.ml.regression.DecisionTreeRegressionModel",
+            ],
+        ):
+            # model._java_obj.numNodes() doesn't count the leaves, so we need to recompute the size
+            def getNumNodes(node, size):
+                size = size + 1
+                if node.subtreeDepth() == 0:
+                    return size
+                else:
+                    size = getNumNodes(node.leftChild(), size)
+                    return getNumNodes(node.rightChild(), size)
+
+            num_nodes = getNumNodes(tree._java_obj.rootNode(), 0)
+            self.children_left = np.full(num_nodes, -2, dtype=np.int32)
+            self.children_right = np.full(num_nodes, -2, dtype=np.int32)
+            self.children_default = np.full(num_nodes, -2, dtype=np.int32)
+            self.features = np.full(num_nodes, -2, dtype=np.int32)
+            self.thresholds = np.full(num_nodes, -2, dtype=np.float64)
+            self.values = [-2] * num_nodes
+            self.node_sample_weight = np.full(num_nodes, -2, dtype=np.float64)
+
+            def buildTree(index, node):
+                index = index + 1
+                if tree._java_obj.getImpurity() == 'variance':
+                    self.values[index] = [node.prediction()]  # prediction for the node
+                else:
+                    # for gini: one entry per label, the number of items of each label that passed through this node
+                    self.values[index] = [e for e in node.impurityStats().stats()]
+                self.node_sample_weight[index] = node.impurityStats().count()  # weighted count of samples passing through this node
+
+                if node.subtreeDepth() == 0:
+                    return index
+                else:
+                    self.features[index] = node.split().featureIndex()  # index (int) of the feature we split on; not available for leaves
+                    if str(node.split().getClass()).endswith('tree.CategoricalSplit'):
+                        # TODO: categorical splits could be faked by creating a node that splits on the exact value
+                        raise NotImplementedError('CategoricalSplit is not yet implemented')
+                    self.thresholds[index] = node.split().threshold()  # threshold (float) for the feature; not available for leaves
+
+                    self.children_left[index] = index + 1
+                    idx = buildTree(index, node.leftChild())
+                    self.children_right[index] = idx + 1
+                    idx = buildTree(idx, node.rightChild())
+                    return idx
+
+            buildTree(-1, tree._java_obj.rootNode())
+            # TODO: the default (missing-value) direction doesn't seem to be supported by MLlib, so fall back to the left child
+            self.children_default = self.children_left
+            self.values = np.asarray(self.values)
+            if normalize:
+                self.values = (self.values.T / self.values.sum(1)).T
+            self.values = self.values * scaling
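+
+        # The LightGBM `dump_model()` structure parsed below is a nested dict, roughly
+        # (abridged, illustrative values):
+        #
+        #   {"num_leaves": 3,
+        #    "tree_structure": {"split_index": 0, "split_feature": 2, "threshold": 0.5,
+        #                       "default_left": True, "internal_value": 0.1, "internal_count": 100,
+        #                       "left_child": {"leaf_index": 0, "leaf_value": -0.2, "leaf_count": 60},
+        #                       "right_child": {"split_index": 1, ...}}}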
+        # dictionary output from LightGBM `.dump_model()`
+        elif isinstance(tree, dict) and "tree_structure" in tree:
+            start = tree["tree_structure"]
+            num_parents = tree["num_leaves"] - 1
+            num_nodes = 2 * num_parents + 1
+            self.children_left = np.empty(num_nodes, dtype=np.int32)
+            self.children_right = np.empty(num_nodes, dtype=np.int32)
+            self.children_default = np.empty(num_nodes, dtype=np.int32)
+            self.features = np.empty(num_nodes, dtype=np.int32)
+            self.thresholds = np.empty(num_nodes, dtype=np.float64)
+            self.values = [-2 for _ in range(num_nodes)]
+            self.node_sample_weight = np.empty(num_nodes, dtype=np.float64)
+
+            # BFS traversal through the tree structure
+            visited, queue = [], [start]
+            while queue:
+                vertex = queue.pop(0)  # TODO(perf): benchmark this against deque.popleft()
+                is_branch_node = "split_index" in vertex
+                if is_branch_node:
+                    vsplit_idx: int = vertex["split_index"]
+                    if vsplit_idx in visited:
+                        continue
+
+                    left_child: dict = vertex["left_child"]
+                    right_child: dict = vertex["right_child"]
+                    left_is_branch_node = "split_index" in left_child
+                    if left_is_branch_node:
+                        self.children_left[vsplit_idx] = left_child["split_index"]
+                    else:
+                        # leaves are numbered separately, so place them after the internal nodes
+                        self.children_left[vsplit_idx] = left_child["leaf_index"] + num_parents
+                    right_is_branch_node = "split_index" in right_child
+                    if right_is_branch_node:
+                        self.children_right[vsplit_idx] = right_child["split_index"]
+                    else:
+                        self.children_right[vsplit_idx] = right_child["leaf_index"] + num_parents
+                    if vertex["default_left"]:
+                        self.children_default[vsplit_idx] = self.children_left[vsplit_idx]
+                    else:
+                        self.children_default[vsplit_idx] = self.children_right[vsplit_idx]
+
+                    self.features[vsplit_idx] = vertex["split_feature"]
+                    self.thresholds[vsplit_idx] = vertex["threshold"]
+                    self.values[vsplit_idx] = [vertex["internal_value"]]
+                    self.node_sample_weight[vsplit_idx] = vertex["internal_count"]
+                    visited.append(vsplit_idx)
+                    queue.append(left_child)
+                    queue.append(right_child)
+                else:
+                    # NOTE: If "leaf_index" is not present as a key, it means we have a
+                    # stump tree, i.e. num_nodes == 1.
+                    vleaf_idx: int = vertex.get("leaf_index", 0) + num_parents
+                    self.children_left[vleaf_idx] = -1
+                    self.children_right[vleaf_idx] = -1
+                    self.children_default[vleaf_idx] = -1
+                    self.features[vleaf_idx] = -1
+                    self.thresholds[vleaf_idx] = -1
+                    self.values[vleaf_idx] = [vertex["leaf_value"]]
+                    # FIXME: "leaf_count" currently doesn't exist if we have a stump tree.
+                    # Technically we should be assigning the number of samples used to
+                    # train the model as the weight here, but unfortunately this info is
+                    # currently unavailable in `tree`, so we set it to 0 for now.
+                    # cf. https://github.com/microsoft/LightGBM/issues/5962
+                    self.node_sample_weight[vleaf_idx] = vertex.get("leaf_count", 0)
+            self.values = np.asarray(self.values)
+            self.values = np.multiply(self.values, scaling)
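+
+        # The XGBoost JSON dump parsed below is a recursive node structure, roughly
+        # (abridged, illustrative values; feature names are already stripped to indices):
+        #
+        #   {"nodeid": 0, "split": 2, "split_condition": 0.5, "yes": 1, "no": 2,
+        #    "missing": 1, "cover": 100.0,
+        #    "children": [{"nodeid": 1, "leaf": 0.1, "cover": 60.0}, ...]}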
+        # directly create the tree from a JSON dump (with stats) of an XGBoost model
+        elif isinstance(tree, dict) and 'nodeid' in tree:
+            def max_id(node):
+                if "children" in node:
+                    return max(node["nodeid"], *[max_id(n) for n in node["children"]])
+                else:
+                    return node["nodeid"]
+
+            m = max_id(tree) + 1
+            self.children_left = -np.ones(m, dtype=np.int32)
+            self.children_right = -np.ones(m, dtype=np.int32)
+            self.children_default = -np.ones(m, dtype=np.int32)
+            self.features = -np.ones(m, dtype=np.int32)
+            self.thresholds = np.zeros(m, dtype=np.float64)
+            self.values = np.zeros((m, 1), dtype=np.float64)
+            self.node_sample_weight = np.empty(m, dtype=np.float64)
+
+            def extract_data(node, tree):
+                i = node["nodeid"]
+                tree.node_sample_weight[i] = node["cover"]
+
+                if "children" in node:
+                    tree.children_left[i] = node["yes"]
+                    tree.children_right[i] = node["no"]
+                    tree.children_default[i] = node["missing"]
+                    tree.features[i] = node["split"]
+                    tree.thresholds[i] = node["split_condition"]
+
+                    for n in node["children"]:
+                        extract_data(n, tree)
+                elif "leaf" in node:
+                    tree.values[i] = node["leaf"] * scaling
+
+            extract_data(tree, self)
+
+        # build the tree from a text dump (with stats) of an XGBoost model
+        elif isinstance(tree, str):
+            nodes = [t.lstrip() for t in tree[:-1].split("\n")]
+            nodes_dict = {}
+            for n in nodes:
+                nodes_dict[int(n.split(":")[0])] = n.split(":")[1]
+            m = max(nodes_dict.keys()) + 1
+            children_left = -1 * np.ones(m, dtype="int32")
+            children_right = -1 * np.ones(m, dtype="int32")
+            children_default = -1 * np.ones(m, dtype="int32")
+            features = -2 * np.ones(m, dtype="int32")
+            thresholds = -1 * np.ones(m, dtype="float64")
+            values = np.ones(m, dtype="float64")
+            node_sample_weight = np.zeros(m, dtype="float64")
+            for key, value in nodes_dict.items():
+                if "leaf" in value:
+                    # extract the leaf value and its cover
+                    values[key] = float(value.split("leaf=")[1].split(",")[0])
+                    node_sample_weight[key] = float(value.split("cover=")[1])
+                else:
+                    c_left = int(value.split("yes=")[1].split(",")[0])
+                    c_right = int(value.split("no=")[1].split(",")[0])
+                    c_default = int(value.split("missing=")[1].split(",")[0])
+                    feat_thres = value.split(" ")[0]
+                    if "<" in feat_thres:
+                        feature = int(feat_thres.split("<")[0][2:])
+                        threshold = float(feat_thres.split("<")[1][:-1])
+                    elif "=" in feat_thres:
+                        feature = int(feat_thres.split("=")[0][2:])
+                        threshold = float(feat_thres.split("=")[1][:-1])
+                    node_sample_weight_val = float(value.split("cover=")[1].split(",")[0])
+                    children_left[key] = c_left
+                    children_right[key] = c_right
+                    children_default[key] = c_default
+                    features[key] = feature
+                    thresholds[key] = threshold
+                    node_sample_weight[key] = node_sample_weight_val
+
+            self.children_left = children_left
+            self.children_right = children_right
+            self.children_default = children_default
+            self.features = features
+            self.thresholds = thresholds
+            self.values = values[:, np.newaxis] * scaling
+            self.node_sample_weight = node_sample_weight
+        else:
+            raise TypeError("Unknown input to SingleTree constructor: " + str(tree))
+
+        # re-compute the number of samples that pass through each node if we are given data
+        if data is not None and data_missing is not None:
+            self.node_sample_weight.fill(0.0)
+            _cext.dense_tree_update_weights(
+                self.children_left, self.children_right, self.children_default, self.features,
+                self.thresholds, self.values, 1, self.node_sample_weight, data, data_missing
+            )
+
+        # compute the expected values so they follow the SHAP logic (this also returns the max depth)
+        self.max_depth = _cext.compute_expectations(
+            self.children_left, self.children_right, self.node_sample_weight,
+            self.values
+        )
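+
+
+# A minimal sketch of building a SingleTree from the generic dictionary format
+# accepted above (illustrative values only; a single split with two leaves):
+#
+#   stump = SingleTree({
+#       "children_left": np.array([1, -1, -1]),
+#       "children_right": np.array([2, -1, -1]),
+#       "children_default": np.array([1, -1, -1]),
+#       "features": np.array([0, -2, -2]),
+#       "thresholds": np.array([0.5, -2.0, -2.0]),
+#       "values": np.array([[0.0], [-1.0], [1.0]]),
+#       "node_sample_weight": np.array([10.0, 6.0, 4.0]),
+#   })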
+
+
+class IsoTree(SingleTree):
+    """ In sklearn, the values stored in the trees of an Isolation Forest are not what
+    we need, so we recompute the expected path lengths here.
+    """
+    def __init__(self, tree, tree_features, normalize=False, scaling=1.0, data=None, data_missing=None):
+        super().__init__(tree, normalize, scaling, data, data_missing)
+        if safe_isinstance(tree, "sklearn.tree._tree.Tree"):
+            from sklearn.ensemble._iforest import _average_path_length
+
+            def _recalculate_value(tree, i, level):
+                if tree.children_left[i] == -1 and tree.children_right[i] == -1:
+                    value = level + _average_path_length(np.array([tree.n_node_samples[i]]))[0]
+                    self.values[i, 0] = value
+                    return value * tree.n_node_samples[i]
+                else:
+                    value_left = _recalculate_value(tree, tree.children_left[i], level + 1)
+                    value_right = _recalculate_value(tree, tree.children_right[i], level + 1)
+                    self.values[i, 0] = (value_left + value_right) / tree.n_node_samples[i]
+                    return value_left + value_right
+
+            _recalculate_value(tree, 0, 0)
+            if normalize:
+                self.values = (self.values.T / self.values.sum(1)).T
+            self.values = self.values * scaling
+            # re-number the features if each tree gets a different subset of the features
+            self.features = np.where(self.features >= 0, tree_features[self.features], self.features)
+
+
+def get_xgboost_dmatrix_properties(model):
+    """ Retrieve the properties of an xgboost.sklearn.XGBModel instance that should be
+    passed to the xgboost.core.DMatrix object before calling predict on the model.
+    """
+    properties_to_pass = ["missing", "n_jobs", "enable_categorical", "feature_types"]
+    dmatrix_attributes = {}
+    for attribute in properties_to_pass:
+        if hasattr(model, attribute):
+            dmatrix_attributes[attribute] = getattr(model, attribute)
+
+    # convert the sklearn-style n_jobs to the xgboost-style nthread
+    if "n_jobs" in dmatrix_attributes:
+        dmatrix_attributes["nthread"] = dmatrix_attributes.pop("n_jobs")
+    return dmatrix_attributes
+
+
+def get_xgboost_json(model):
+    """ Get a JSON dump of an XGBoost model while ensuring the feature names are their indexes.
+    """
+    fnames = model.feature_names
+    model.feature_names = None
+    json_trees = model.get_dump(with_stats=True, dump_format="json")
+    model.feature_names = fnames
+
+    # this fixes a bug where XGBoost can return invalid JSON: bare inf values are not
+    # valid JSON, so we replace them with large finite sentinels
+    json_trees = [t.replace(": inf,", ": 1000000000000.0,") for t in json_trees]
+    json_trees = [t.replace(": -inf,", ": -1000000000000.0,") for t in json_trees]
+
+    return json_trees
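+
+
+# Example (illustrative): for an xgboost.XGBRegressor created with n_jobs=4 and
+# missing=0.0, get_xgboost_dmatrix_properties(model) returns a dict like
+# {"missing": 0.0, "nthread": 4, ...}, which can be splatted into
+# xgboost.DMatrix(X, **attributes) before calling predict.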
+ """ + def __init__(self, xgb_model): + import xgboost + xgb_params = self.read_xgb_params(xgb_model) + + self.base_score = float(xgb_params["learner_model_param"]["base_score"]) + self.num_feature = int(xgb_params["learner_model_param"]["num_feature"]) + self.num_class = int(xgb_params["learner_model_param"]["num_class"]) + self.name_obj = xgb_params["objective"]["name"] + self.name_gbm = xgb_params["gradient_booster"]["name"] + + # new in XGBoost 1.0 is that the base_score is saved untransformed (https://github.com/dmlc/xgboost/pull/5101) + # so we have to transform it depending on the objective + if version.parse(xgboost.__version__).major >= 1: + if self.name_obj in ["binary:logistic", "reg:logistic"]: + self.base_score = scipy.special.logit(self.base_score) + + assert self.name_gbm == "gbtree", "Only the 'gbtree' model type is supported, not '%s'!" % self.name_gbm + + # load the gbtree specific parameters + self.num_trees = int(xgb_params["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + self.num_feature = int(xgb_params["learner_model_param"]["num_feature"]) + # load each tree + self.num_roots = np.zeros(self.num_trees, dtype=np.int32) + self.num_nodes = np.zeros(self.num_trees, dtype=np.int32) + self.num_deleted = np.zeros(self.num_trees, dtype=np.int32) + self.max_depth = np.zeros(self.num_trees, dtype=np.int32) + self.num_feature = np.zeros(self.num_trees, dtype=np.int32) + self.size_leaf_vector = np.zeros(self.num_trees, dtype=np.int32) + self.node_parents = [] + self.node_cleft = [] + self.node_cright = [] + self.node_sindex = [] + self.node_info = [] + self.loss_chg = [] + self.sum_hess = [] + self.base_weight = [] + self.leaf_child_cnt = [] + for i in range(self.num_trees): + tree_json = xgb_params["gradient_booster"]["model"]["trees"][i] + + # load the per-tree params + self.num_nodes[i] = tree_json["tree_param"]["num_nodes"] + self.num_deleted[i] = tree_json["tree_param"]["num_deleted"] + self.num_feature[i] = tree_json["tree_param"]["num_feature"] + self.size_leaf_vector[i] = tree_json["tree_param"]["size_leaf_vector"] + + # load the nodes + self.node_parents.append(np.array(tree_json["parents"], dtype=np.int32)) + self.node_cleft.append(np.array(tree_json["left_children"], dtype=np.int32)) + self.node_cright.append(np.array(tree_json["right_children"], dtype=np.int32)) + self.node_sindex.append(np.array(tree_json["split_indices"], dtype=np.uint32)) + self.node_info.append(np.array(tree_json["split_conditions"], dtype=np.float32)) + + # load the stat nodes + self.loss_chg.append(np.array(tree_json["loss_changes"], dtype=np.float32)) + self.sum_hess.append(np.array(tree_json["sum_hessian"], dtype=np.float64)) + self.base_weight.append(np.array(tree_json["base_weights"], dtype=np.float32)) + self.leaf_child_cnt.append(np.array(tree_json["default_left"], dtype=int)) + + @staticmethod + def read_xgb_params(xgb_model) -> Dict[str, Any]: + import xgboost + with tempfile.TemporaryDirectory() as tmp_dir: + if version.parse(xgboost.__version__) >= version.parse("1.6.0"): + tmp_file = os.path.join(tmp_dir, "model.ubj") + xgb_model.save_model(tmp_file) + xgb_params = decode_ubjson_buffer(open(tmp_file, 'rb')) + else: + warnings.warn("You are using an XGBoost version below 1.6.0 which is not fully supported by shap. " + "Shap falls back to encoding the model as JSON which can lead to numerical precision issues. 
" + "Please consider upgrading to XGBoost 1.6.0 or higher.") + tmp_file = os.path.join(tmp_dir, "model.json") + xgb_model.save_model(tmp_file) + with open(tmp_file) as fh: + xgb_params = json.load(fh) + return xgb_params["learner"] + + def get_trees(self, data=None, data_missing=None): + shape = (self.num_trees, self.num_nodes.max()) + self.children_default = np.zeros(shape, dtype=int) + self.features = np.zeros(shape, dtype=int) + self.thresholds = np.zeros(shape, dtype=np.float32) + self.values = np.zeros((shape[0], shape[1], 1), dtype=np.float32) + trees = [] + for i in range(self.num_trees): + for j in range(self.num_nodes[i]): + if np.right_shift(self.node_sindex[i][j], np.uint32(31)) != 0: + self.children_default[i,j] = self.node_cleft[i][j] + else: + self.children_default[i,j] = self.node_cright[i][j] + self.features[i,j] = self.node_sindex[i][j] & ((np.uint32(1) << np.uint32(31)) - np.uint32(1)) + if self.node_cleft[i][j] >= 0: + # Xgboost uses < for thresholds where shap uses <= + # Move the threshold down by the smallest possible increment + self.thresholds[i, j] = np.nextafter(self.node_info[i][j], - np.float32(np.inf)) + else: + self.values[i,j] = self.node_info[i][j] + + k = len(self.node_cleft[i]) + trees.append(SingleTree({ + "children_left": self.node_cleft[i], + "children_right": self.node_cright[i], + "children_default": self.children_default[i,:k], + "feature": self.features[i,:k], + "threshold": self.thresholds[i,:k], + "value": self.values[i,:k], + "node_sample_weight": self.sum_hess[i] + }, data=data, data_missing=data_missing)) + return trees + + def read(self, dtype): + size = struct.calcsize(dtype) + val = struct.unpack(dtype, self.buf[self.pos:self.pos+size])[0] + self.pos += size + return val + + def read_arr(self, dtype, n_items): + format = "%d%s" % (n_items, dtype) + size = struct.calcsize(format) + val = struct.unpack(format, self.buf[self.pos:self.pos+size])[0] + self.pos += size + return val + + def read_str(self, size): + val = self.buf[self.pos:self.pos+size].decode('utf-8') + self.pos += size + return val + + def print_info(self): + + print("--- global parameters ---") + print("base_score =", self.base_score) + print("num_feature =", self.num_feature) + print("num_class =", self.num_class) + print("name_obj =", self.name_obj) + print("name_gbm =", self.name_gbm) + print() + print("--- gbtree specific parameters ---") + print("num_trees =", self.num_trees) + print("num_roots =", self.num_roots) + print("num_feature =", self.num_feature) + print("size_leaf_vector =", self.size_leaf_vector) + + +class CatBoostTreeModelLoader: + def __init__(self, cb_model): + import tempfile + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_file = os.path.join(tmp_dir, "model.json") + cb_model.save_model(tmp_file, format="json") + self.loaded_cb_model = json.load(open(tmp_file)) + + # load the CatBoost oblivious trees specific parameters + self.num_trees = len(self.loaded_cb_model['oblivious_trees']) + self.max_depth = self.loaded_cb_model['model_info']['params']['tree_learner_options']['depth'] + + def get_trees(self, data=None, data_missing=None): + # load each tree + trees = [] + for tree_index in range(self.num_trees): + + # load the per-tree params + #depth = len(self.loaded_cb_model['oblivious_trees'][tree_index]['splits']) + + # load the nodes + + # Re-compute the number of samples that pass through each node if we are given data + leaf_weights = self.loaded_cb_model['oblivious_trees'][tree_index]['leaf_weights'] + leaf_weights_unraveled = [0] * 
+            leaf_weights_unraveled = [0] * (len(leaf_weights) - 1) + leaf_weights
+            leaf_weights_unraveled[0] = sum(leaf_weights)
+            for index in range(len(leaf_weights) - 2, 0, -1):
+                leaf_weights_unraveled[index] = leaf_weights_unraveled[2 * index + 1] + leaf_weights_unraveled[2 * index + 2]
+
+            leaf_values = self.loaded_cb_model['oblivious_trees'][tree_index]['leaf_values']
+            leaf_values_unraveled = [0] * (len(leaf_values) - 1) + leaf_values
+
+            children_left = [i * 2 + 1 for i in range(len(leaf_values) - 1)]
+            children_left += [-1] * len(leaf_values)
+
+            children_right = [i * 2 for i in range(1, len(leaf_values))]
+            children_right += [-1] * len(leaf_values)
+
+            children_default = [i * 2 + 1 for i in range(len(leaf_values) - 1)]
+            children_default += [-1] * len(leaf_values)
+
+            # load the split features and borders; they are stored from the leaves up to the root
+            split_features_index = []
+            borders = []
+            for elem in self.loaded_cb_model['oblivious_trees'][tree_index]['splits']:
+                split_type = elem.get('split_type')
+                if split_type == 'FloatFeature':
+                    split_feature_index = elem.get('float_feature_index')
+                    borders.append(elem['border'])
+                elif split_type == 'OneHotFeature':
+                    split_feature_index = elem.get('cat_feature_index')
+                    borders.append(elem['value'])
+                else:
+                    split_feature_index = elem.get('ctr_target_border_idx')
+                    borders.append(elem['border'])
+                split_features_index.append(split_feature_index)
+
+            split_features_index_unraveled = []
+            for counter, feature_index in enumerate(split_features_index[::-1]):
+                split_features_index_unraveled += [feature_index] * (2 ** counter)
+            split_features_index_unraveled += [0] * len(leaf_values)
+
+            borders_unraveled = []
+            for counter, border in enumerate(borders[::-1]):
+                borders_unraveled += [border] * (2 ** counter)
+            borders_unraveled += [0] * len(leaf_values)
+
+            trees.append(SingleTree({
+                "children_left": np.array(children_left),
+                "children_right": np.array(children_right),
+                "children_default": np.array(children_default),
+                "feature": np.array(split_features_index_unraveled),
+                "threshold": np.array(borders_unraveled),
+                "value": np.array(leaf_values_unraveled).reshape((-1, 1)),
+                "node_sample_weight": np.array(leaf_weights_unraveled),
+            }, data=data, data_missing=data_missing))
+
+        return trees