LLH committed on
Commit
c95b9af
·
1 Parent(s): b71863b

2024/02/14/10:51

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. analysis/shap_model.py +2 -1
  2. diagram/__init__.py +0 -0
  3. lib/__init__.py +0 -0
  4. lib/shap/__init__.py +144 -0
  5. lib/shap/_cext.cp310-win_amd64.pyd +0 -0
  6. lib/shap/_explanation.py +901 -0
  7. lib/shap/_serializable.py +204 -0
  8. lib/shap/_version.py +16 -0
  9. lib/shap/actions/__init__.py +3 -0
  10. lib/shap/actions/_action.py +8 -0
  11. lib/shap/actions/_optimizer.py +92 -0
  12. lib/shap/benchmark/__init__.py +9 -0
  13. lib/shap/benchmark/_compute.py +9 -0
  14. lib/shap/benchmark/_explanation_error.py +181 -0
  15. lib/shap/benchmark/_result.py +34 -0
  16. lib/shap/benchmark/_sequential.py +332 -0
  17. lib/shap/benchmark/experiments.py +414 -0
  18. lib/shap/benchmark/framework.py +113 -0
  19. lib/shap/benchmark/measures.py +424 -0
  20. lib/shap/benchmark/methods.py +148 -0
  21. lib/shap/benchmark/metrics.py +824 -0
  22. lib/shap/benchmark/models.py +230 -0
  23. lib/shap/benchmark/plots.py +566 -0
  24. lib/shap/cext/_cext.cc +560 -0
  25. lib/shap/cext/_cext_gpu.cc +187 -0
  26. lib/shap/cext/_cext_gpu.cu +353 -0
  27. lib/shap/cext/gpu_treeshap.h +1535 -0
  28. lib/shap/cext/tree_shap.h +1460 -0
  29. lib/shap/datasets.py +309 -0
  30. lib/shap/explainers/__init__.py +38 -0
  31. lib/shap/explainers/_additive.py +187 -0
  32. lib/shap/explainers/_deep/__init__.py +125 -0
  33. lib/shap/explainers/_deep/deep_pytorch.py +386 -0
  34. lib/shap/explainers/_deep/deep_tf.py +763 -0
  35. lib/shap/explainers/_deep/deep_utils.py +23 -0
  36. lib/shap/explainers/_exact.py +366 -0
  37. lib/shap/explainers/_explainer.py +457 -0
  38. lib/shap/explainers/_gpu_tree.py +179 -0
  39. lib/shap/explainers/_gradient.py +592 -0
  40. lib/shap/explainers/_kernel.py +696 -0
  41. lib/shap/explainers/_linear.py +406 -0
  42. lib/shap/explainers/_partition.py +681 -0
  43. lib/shap/explainers/_permutation.py +217 -0
  44. lib/shap/explainers/_sampling.py +199 -0
  45. lib/shap/explainers/_tree.py +0 -0
  46. lib/shap/explainers/other/__init__.py +26 -0
  47. lib/shap/explainers/other/_coefficient.py +17 -0
  48. lib/shap/explainers/other/_lime.py +73 -0
  49. lib/shap/explainers/other/_maple.py +306 -0
  50. lib/shap/explainers/other/_random.py +79 -0
analysis/shap_model.py CHANGED
@@ -1,6 +1,7 @@
1
- import shap
2
  import matplotlib.pyplot as plt
3
 
 
 
4
 
5
  def shap_calculate(model, x, feature_names):
6
  explainer = shap.Explainer(model.predict, x)
 
 
1
  import matplotlib.pyplot as plt
2
 
3
+ import lib.shap as shap
4
+
5
 
6
  def shap_calculate(model, x, feature_names):
7
  explainer = shap.Explainer(model.predict, x)
diagram/__init__.py ADDED
File without changes
lib/__init__.py ADDED
File without changes
lib/shap/__init__.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Top-level shap package namespace.

Re-exports the Explanation container, all explainer classes, the plotting
functions (only when matplotlib is importable), and assorted utility helpers.
"""
from ._explanation import Cohorts, Explanation

# explainers
from .explainers import other
from .explainers._additive import AdditiveExplainer
from .explainers._deep import DeepExplainer
from .explainers._exact import ExactExplainer
from .explainers._explainer import Explainer
from .explainers._gpu_tree import GPUTreeExplainer
from .explainers._gradient import GradientExplainer
from .explainers._kernel import KernelExplainer
from .explainers._linear import LinearExplainer
from .explainers._partition import PartitionExplainer
from .explainers._permutation import PermutationExplainer
from .explainers._sampling import SamplingExplainer
from .explainers._tree import TreeExplainer

try:
    # Version from setuptools-scm
    from ._version import version as __version__
except ImportError:
    # Expected when running locally without build
    __version__ = "0.0.0-not-built"

_no_matplotlib_warning = "matplotlib is not installed so plotting is not available! Run `pip install matplotlib` " \
                         "to fix this."


# plotting (only loaded if matplotlib is present)
def unsupported(*args, **kwargs):
    # Stand-in for every plotting function when matplotlib is missing.
    raise ImportError(_no_matplotlib_warning)


class UnsupportedModule:
    # Stand-in for the whole `plots` submodule: any attribute access raises
    # a helpful ImportError instead of a bare ModuleNotFound.
    def __getattribute__(self, item):
        raise ImportError(_no_matplotlib_warning)


try:
    import matplotlib  # noqa: F401
    have_matplotlib = True
except ImportError:
    have_matplotlib = False
if have_matplotlib:
    from . import plots
    from .plots._bar import bar_legacy as bar_plot
    from .plots._beeswarm import summary_legacy as summary_plot
    from .plots._decision import decision as decision_plot
    from .plots._decision import multioutput_decision as multioutput_decision_plot
    from .plots._embedding import embedding as embedding_plot
    from .plots._force import force as force_plot
    from .plots._force import getjs, initjs, save_html
    from .plots._group_difference import group_difference as group_difference_plot
    from .plots._heatmap import heatmap as heatmap_plot
    from .plots._image import image as image_plot
    from .plots._monitoring import monitoring as monitoring_plot
    from .plots._partial_dependence import partial_dependence as partial_dependence_plot
    from .plots._scatter import dependence_legacy as dependence_plot
    from .plots._text import text as text_plot
    from .plots._violin import violin as violin_plot
    from .plots._waterfall import waterfall as waterfall_plot
else:
    bar_plot = unsupported
    summary_plot = unsupported
    decision_plot = unsupported
    multioutput_decision_plot = unsupported
    embedding_plot = unsupported
    force_plot = unsupported
    getjs = unsupported
    initjs = unsupported
    save_html = unsupported
    group_difference_plot = unsupported
    heatmap_plot = unsupported
    image_plot = unsupported
    monitoring_plot = unsupported
    partial_dependence_plot = unsupported
    dependence_plot = unsupported
    text_plot = unsupported
    violin_plot = unsupported
    waterfall_plot = unsupported
    # If matplotlib is available, then the plots submodule will be directly available.
    # If not, we need to define something that will issue a meaningful warning message
    # (rather than ModuleNotFound).
    plots = UnsupportedModule()


# other stuff :)
from . import datasets, links, utils  # noqa: E402
from .actions._optimizer import ActionOptimizer  # noqa: E402
from .utils import approximate_interactions, sample  # noqa: E402

#from . import benchmark
from .utils._legacy import kmeans  # noqa: E402

# Use __all__ to let type checkers know what is part of the public API.
__all__ = [
    "Cohorts",
    "Explanation",

    # Explainers
    "other",
    "AdditiveExplainer",
    "DeepExplainer",
    "ExactExplainer",
    "Explainer",
    "GPUTreeExplainer",
    "GradientExplainer",
    "KernelExplainer",
    "LinearExplainer",
    "PartitionExplainer",
    "PermutationExplainer",
    "SamplingExplainer",
    "TreeExplainer",

    # Plots
    "plots",
    "bar_plot",
    "summary_plot",
    "decision_plot",
    "multioutput_decision_plot",
    "embedding_plot",
    "force_plot",
    "getjs",
    "initjs",
    "save_html",
    "group_difference_plot",
    "heatmap_plot",
    "image_plot",
    "monitoring_plot",
    "partial_dependence_plot",
    "dependence_plot",
    "text_plot",
    "violin_plot",
    "waterfall_plot",

    # Other stuff
    "datasets",
    "links",
    "utils",
    "ActionOptimizer",
    "approximate_interactions",
    "sample",
    "kmeans",
]
lib/shap/_cext.cp310-win_amd64.pyd ADDED
Binary file (44 kB). View file
 
lib/shap/_explanation.py ADDED
@@ -0,0 +1,901 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import copy
3
+ import operator
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import scipy.cluster
8
+ import scipy.sparse
9
+ import scipy.spatial
10
+ import sklearn
11
+ from slicer import Alias, Obj, Slicer
12
+
13
+ from .utils._exceptions import DimensionError
14
+ from .utils._general import OpChain
15
+
16
op_chain_root = OpChain("shap.Explanation")


def _template_op(attr_name, doc):
    """Build a class-level property that forwards to ``op_chain_root``."""
    def _get(cls):
        return getattr(op_chain_root, attr_name)
    _get.__doc__ = doc
    return property(_get)


class MetaExplanation(type):
    """This metaclass exposes the Explanation object's methods for creating
    template op chains (e.g. ``shap.Explanation.abs``) on the class itself.
    """

    def __getitem__(cls, item):
        return op_chain_root.__getitem__(item)

    abs = _template_op("abs", "Element-wise absolute value op.")
    identity = _template_op("identity", "A no-op.")
    argsort = _template_op("argsort", "Numpy style argsort.")
    sum = _template_op("sum", "Numpy style sum.")
    max = _template_op("max", "Numpy style max.")
    min = _template_op("min", "Numpy style min.")
    mean = _template_op("mean", "Numpy style mean.")
    sample = _template_op("sample", "Numpy style sample.")
    hclust = _template_op("hclust", "Hierarchical clustering op.")
79
class Explanation(metaclass=MetaExplanation):
    """ A sliceable set of parallel arrays representing a SHAP explanation.
    """
    def __init__(
        self,
        values,                     # SHAP values array, or another Explanation to clone
        base_values=None,           # expected model output(s) over the background
        data=None,                  # the input data the values explain
        display_data=None,          # human-readable version of `data` for plots
        instance_names=None,        # per-row names (aliased to axis 0)
        feature_names=None,         # per-column names
        output_names=None,          # names of the model outputs
        output_indexes=None,
        lower_bounds=None,
        upper_bounds=None,
        error_std=None,
        main_effects=None,
        hierarchical_values=None,
        clustering=None,
        compute_time=None
    ):
        # Record of the ops (slicing, abs, mean, ...) applied to this object.
        self.op_history = []

        self.compute_time = compute_time

        # cloning. TODOsomeday: better cloning :)
        if issubclass(type(values), Explanation):
            e = values
            values = e.values
            base_values = e.base_values
            data = e.data

        # Which axes of `values` are output dimensions (vs. instance/feature axes).
        self.output_dims = compute_output_dims(values, base_values, data, output_names)
        values_shape = _compute_shape(values)

        # Default output names ("Output 0", ...) when there is exactly one output axis.
        if output_names is None and len(self.output_dims) == 1:
            output_names = [f"Output {i}" for i in range(values_shape[self.output_dims[0]])]

        # Attach 1D feature names to whichever axis their length matches
        # (columns first, then rows).
        if len(_compute_shape(feature_names)) == 1:  # TODO: should always be an alias once slicer supports per-row aliases
            if len(values_shape) >= 2 and len(feature_names) == values_shape[1]:
                feature_names = Alias(list(feature_names), 1)
            elif len(values_shape) >= 1 and len(feature_names) == values_shape[0]:
                feature_names = Alias(list(feature_names), 0)

        if len(_compute_shape(output_names)) == 1:  # TODO: should always be an alias once slicer supports per-row aliases
            output_names = Alias(list(output_names), self.output_dims[0])
            # if len(values_shape) >= 1 and len(output_names) == values_shape[0]:
            #     output_names = Alias(list(output_names), 0)
            # elif len(values_shape) >= 2 and len(output_names) == values_shape[1]:
            #     output_names = Alias(list(output_names), 1)

        # Higher-order output names are wrapped as slicer Obj's bound to the
        # output dims (order-2 names also span the instance axis 0).
        if output_names is not None and not isinstance(output_names, Alias):
            output_names_order = len(_compute_shape(output_names))
            if output_names_order == 0:
                pass
            elif output_names_order == 1:
                output_names = Obj(output_names, self.output_dims)
            elif output_names_order == 2:
                output_names = Obj(output_names, [0] + list(self.output_dims))
            else:
                raise ValueError("shap.Explanation does not yet support output_names of order greater than 3!")

        # Bind base_values to the output dims (plus axis 0 when per-instance).
        if not hasattr(base_values, "__len__") or len(base_values) == 0:
            pass
        elif len(_compute_shape(base_values)) == len(self.output_dims):
            base_values = Obj(base_values, list(self.output_dims))
        else:
            base_values = Obj(base_values, [0] + list(self.output_dims))

        # All parallel arrays live in one Slicer so indexing slices them together.
        self._s = Slicer(
            values=values,
            base_values=base_values,
            data=list_wrap(data),
            display_data=list_wrap(display_data),
            instance_names=None if instance_names is None else Alias(instance_names, 0),
            feature_names=feature_names,
            output_names=output_names,
            output_indexes=None if output_indexes is None else (self.output_dims, output_indexes),
            lower_bounds=list_wrap(lower_bounds),
            upper_bounds=list_wrap(upper_bounds),
            error_std=list_wrap(error_std),
            main_effects=list_wrap(main_effects),
            hierarchical_values=list_wrap(hierarchical_values),
            clustering=None if clustering is None else Obj(clustering, [0])
        )
164
+
165
    @property
    def shape(self):
        """ Compute the shape over potentially complex data nesting.
        """
        return _compute_shape(self._s.values)

    @property
    def values(self):
        """ Pass-through from the underlying slicer object.
        """
        return self._s.values
    @values.setter
    def values(self, new_values):
        self._s.values = new_values

    @property
    def base_values(self):
        """ Pass-through from the underlying slicer object.
        """
        return self._s.base_values
    @base_values.setter
    def base_values(self, new_base_values):
        self._s.base_values = new_base_values

    @property
    def data(self):
        """ Pass-through from the underlying slicer object.
        """
        return self._s.data
    @data.setter
    def data(self, new_data):
        self._s.data = new_data

    @property
    def display_data(self):
        """ Pass-through from the underlying slicer object.
        """
        return self._s.display_data
    @display_data.setter
    def display_data(self, new_display_data):
        # DataFrames are stored as their underlying ndarray.
        if issubclass(type(new_display_data), pd.DataFrame):
            new_display_data = new_display_data.values
        self._s.display_data = new_display_data

    @property
    def instance_names(self):
        """ Pass-through from the underlying slicer object. (read-only)
        """
        return self._s.instance_names

    @property
    def output_names(self):
        """ Pass-through from the underlying slicer object.
        """
        return self._s.output_names
    @output_names.setter
    def output_names(self, new_output_names):
        self._s.output_names = new_output_names

    @property
    def output_indexes(self):
        """ Pass-through from the underlying slicer object. (read-only)
        """
        return self._s.output_indexes

    @property
    def feature_names(self):
        """ Pass-through from the underlying slicer object.
        """
        return self._s.feature_names
    @feature_names.setter
    def feature_names(self, new_feature_names):
        self._s.feature_names = new_feature_names

    @property
    def lower_bounds(self):
        """ Pass-through from the underlying slicer object. (read-only)
        """
        return self._s.lower_bounds

    @property
    def upper_bounds(self):
        """ Pass-through from the underlying slicer object. (read-only)
        """
        return self._s.upper_bounds

    @property
    def error_std(self):
        """ Pass-through from the underlying slicer object. (read-only)
        """
        return self._s.error_std

    @property
    def main_effects(self):
        """ Pass-through from the underlying slicer object.
        """
        return self._s.main_effects
    @main_effects.setter
    def main_effects(self, new_main_effects):
        self._s.main_effects = new_main_effects

    @property
    def hierarchical_values(self):
        """ Pass-through from the underlying slicer object.
        """
        return self._s.hierarchical_values
    @hierarchical_values.setter
    def hierarchical_values(self, new_hierarchical_values):
        self._s.hierarchical_values = new_hierarchical_values

    @property
    def clustering(self):
        """ Pass-through from the underlying slicer object.
        """
        return self._s.clustering
    @clustering.setter
    def clustering(self, new_clustering):
        self._s.clustering = new_clustering
284
+ def cohorts(self, cohorts):
285
+ """ Split this explanation into several cohorts.
286
+
287
+ Parameters
288
+ ----------
289
+ cohorts : int or array
290
+ If this is an integer then we auto build that many cohorts using a decision tree. If this is
291
+ an array then we treat that as an array of cohort names/ids for each instance.
292
+ """
293
+
294
+ if isinstance(cohorts, int):
295
+ return _auto_cohorts(self, max_cohorts=cohorts)
296
+ if isinstance(cohorts, (list, tuple, np.ndarray)):
297
+ cohorts = np.array(cohorts)
298
+ return Cohorts(**{name: self[cohorts == name] for name in np.unique(cohorts)})
299
+ raise TypeError("The given set of cohort indicators is not recognized! Please give an array or int.")
300
+
301
+ def __repr__(self):
302
+ """ Display some basic printable info, but not everything.
303
+ """
304
+ out = ".values =\n"+self.values.__repr__()
305
+ if self.base_values is not None:
306
+ out += "\n\n.base_values =\n"+self.base_values.__repr__()
307
+ if self.data is not None:
308
+ out += "\n\n.data =\n"+self.data.__repr__()
309
+ return out
310
+
311
    def __getitem__(self, item):
        """ This adds support for OpChain indexing.

        Each element of `item` may be a plain index, an OpChain template, an
        Explanation (its values are used), or a string naming an output or
        feature.  After normalization the real slicing is delegated to the
        underlying Slicer.
        """
        new_self = None
        if not isinstance(item, tuple):
            item = (item,)

        # convert any OpChains or magic strings
        pos = -1
        for t in item:
            pos += 1

            # skip over Ellipsis
            if t is Ellipsis:
                pos += len(self.shape) - len(item)
                continue

            orig_t = t
            if issubclass(type(t), OpChain):
                t = t.apply(self)
                if issubclass(type(t), (np.int64, np.int32)):  # because slicer does not like numpy indexes
                    t = int(t)
                elif issubclass(type(t), np.ndarray):
                    t = [int(v) for v in t]  # slicer wants lists not numpy arrays for indexing
            elif issubclass(type(t), Explanation):
                t = t.values
            elif isinstance(t, str):

                # work around for 2D output_names since they are not yet slicer supported
                output_names_dims = []
                if "output_names" in self._s._objects:
                    output_names_dims = self._s._objects["output_names"].dim
                elif "output_names" in self._s._aliases:
                    output_names_dims = self._s._aliases["output_names"].dim
                if pos != 0 and pos in output_names_dims:
                    if len(output_names_dims) == 1:
                        # simple case: translate the name into its column index
                        t = np.argwhere(np.array(self.output_names) == t)[0][0]
                    elif len(output_names_dims) == 2:
                        # ragged per-row output names: rebuild the arrays keeping
                        # only the entries whose name matches `t`
                        new_values = []
                        new_base_values = []
                        new_data = []
                        new_self = copy.deepcopy(self)
                        for i, v in enumerate(self.values):
                            for j, s in enumerate(self.output_names[i]):
                                if s == t:
                                    new_values.append(np.array(v[:,j]))
                                    new_data.append(np.array(self.data[i]))
                                    new_base_values.append(self.base_values[i][j])

                        # NOTE(review): new_data is passed both as `data` and as
                        # `feature_names` (positional arg 6) below — looks
                        # suspicious, but matches the original; confirm intent.
                        new_self = Explanation(
                            np.array(new_values),
                            np.array(new_base_values),
                            np.array(new_data),
                            self.display_data,
                            self.instance_names,
                            np.array(new_data),
                            t, # output_names
                            self.output_indexes,
                            self.lower_bounds,
                            self.upper_bounds,
                            self.error_std,
                            self.main_effects,
                            self.hierarchical_values,
                            self.clustering
                        )
                        new_self.op_history = copy.copy(self.op_history)
                        # new_self = copy.deepcopy(self)
                        # new_self.values = np.array(new_values)
                        # new_self.base_values = np.array(new_base_values)
                        # new_self.data = np.array(new_data)
                        # new_self.output_names = t
                        # new_self.feature_names = np.array(new_data)
                        # new_self.clustering = None

                # work around for 2D feature_names since they are not yet slicer supported
                feature_names_dims = []
                if "feature_names" in self._s._objects:
                    feature_names_dims = self._s._objects["feature_names"].dim
                if pos != 0 and pos in feature_names_dims and len(feature_names_dims) == 2:
                    # ragged per-row feature names: keep only matching entries
                    new_values = []
                    new_data = []
                    for i, val_i in enumerate(self.values):
                        for s,v,d in zip(self.feature_names[i], val_i, self.data[i]):
                            if s == t:
                                new_values.append(v)
                                new_data.append(d)
                    new_self = copy.deepcopy(self)
                    new_self.values = new_values
                    new_self.data = new_data
                    new_self.feature_names = t
                    new_self.clustering = None
                    # return new_self

            # slicer does not accept numpy integer scalars
            if issubclass(type(t), (np.int8, np.int16, np.int32, np.int64)):
                t = int(t)

            # write the converted element back into the item tuple
            if t is not orig_t:
                tmp = list(item)
                tmp[pos] = t
                item = tuple(tmp)

        # call slicer for the real work
        item = tuple(v for v in item) # SML I cut out: `if not isinstance(v, str)`
        if len(item) == 0:
            return new_self
        if new_self is None:
            new_self = copy.copy(self)
        new_self._s = new_self._s.__getitem__(item)
        new_self.op_history.append({
            "name": "__getitem__",
            "args": (item,),
            "prev_shape": self.shape
        })

        return new_self
426
+
427
    def __len__(self):
        """ The length is the size of the first axis of ``.values``. """
        return self.shape[0]
429
+
430
+ def __copy__(self):
431
+ new_exp = Explanation(
432
+ self.values,
433
+ self.base_values,
434
+ self.data,
435
+ self.display_data,
436
+ self.instance_names,
437
+ self.feature_names,
438
+ self.output_names,
439
+ self.output_indexes,
440
+ self.lower_bounds,
441
+ self.upper_bounds,
442
+ self.error_std,
443
+ self.main_effects,
444
+ self.hierarchical_values,
445
+ self.clustering
446
+ )
447
+ new_exp.op_history = copy.copy(self.op_history)
448
+ return new_exp
449
+
450
+ def _apply_binary_operator(self, other, binary_op, op_name):
451
+ new_exp = self.__copy__()
452
+ new_exp.op_history = copy.copy(self.op_history)
453
+ new_exp.op_history.append({
454
+ "name": op_name,
455
+ "args": (other,),
456
+ "prev_shape": self.shape
457
+ })
458
+ if isinstance(other, Explanation):
459
+ new_exp.values = binary_op(new_exp.values, other.values)
460
+ if new_exp.data is not None:
461
+ new_exp.data = binary_op(new_exp.data, other.data)
462
+ if new_exp.base_values is not None:
463
+ new_exp.base_values = binary_op(new_exp.base_values, other.base_values)
464
+ else:
465
+ new_exp.values = binary_op(new_exp.values, other)
466
+ if new_exp.data is not None:
467
+ new_exp.data = binary_op(new_exp.data, other)
468
+ if new_exp.base_values is not None:
469
+ new_exp.base_values = binary_op(new_exp.base_values, other)
470
+ return new_exp
471
+
472
+ def __add__(self, other):
473
+ return self._apply_binary_operator(other, operator.add, "__add__")
474
+
475
+ def __radd__(self, other):
476
+ return self._apply_binary_operator(other, operator.add, "__add__")
477
+
478
+ def __sub__(self, other):
479
+ return self._apply_binary_operator(other, operator.sub, "__sub__")
480
+
481
+ def __rsub__(self, other):
482
+ return self._apply_binary_operator(other, operator.sub, "__sub__")
483
+
484
+ def __mul__(self, other):
485
+ return self._apply_binary_operator(other, operator.mul, "__mul__")
486
+
487
+ def __rmul__(self, other):
488
+ return self._apply_binary_operator(other, operator.mul, "__mul__")
489
+
490
+ def __truediv__(self, other):
491
+ return self._apply_binary_operator(other, operator.truediv, "__truediv__")
492
+
493
+ # @property
494
+ # def abs(self):
495
+ # """ Element-size absolute value operator.
496
+ # """
497
+ # new_self = copy.copy(self)
498
+ # new_self.values = np.abs(new_self.values)
499
+ # new_self.op_history.append({
500
+ # "name": "abs",
501
+ # "prev_shape": self.shape
502
+ # })
503
+ # return new_self
504
+
505
    def _numpy_func(self, fname, **kwargs):
        """ Apply a numpy-style function to this Explanation.

        Parameters
        ----------
        fname : str
            Name of a numpy function (e.g. "mean", "abs") looked up via
            ``getattr(np, fname)`` and applied to values/data/base_values.
        **kwargs
            Forwarded to the numpy function; ``axis`` additionally controls
            how the slicer metadata is collapsed.
        """
        new_self = copy.copy(self)
        axis = kwargs.get("axis", None)

        # collapse the slicer to right shape: index the reduced axis so the
        # aliases/objects bound to it are dropped, then discard that slicing
        # op from the history (only the numpy op should be recorded)
        if axis == 0:
            new_self = new_self[0]
        elif axis == 1:
            new_self = new_self[1]
        elif axis == 2:
            new_self = new_self[2]
        if axis in [0,1,2]:
            new_self.op_history = new_self.op_history[:-1] # pop off the slicing operation we just used

        if self.feature_names is not None and not is_1d(self.feature_names) and axis == 0:
            # ragged per-row feature names: group values by name first, then
            # reduce each group with the numpy function
            new_values = self._flatten_feature_names()
            new_self.feature_names = np.array(list(new_values.keys()))
            new_self.values = np.array([getattr(np, fname)(v,0) for v in new_values.values()])
            new_self.clustering = None
        else:
            new_self.values = getattr(np, fname)(np.array(self.values), **kwargs)
            if new_self.data is not None:
                # best-effort: data may not support the op (e.g. strings)
                try:
                    new_self.data = getattr(np, fname)(np.array(self.data), **kwargs)
                except Exception:
                    new_self.data = None
            # only reduce base_values when the requested axis exists for it
            if new_self.base_values is not None and issubclass(type(axis), int) and len(self.base_values.shape) > axis:
                new_self.base_values = getattr(np, fname)(self.base_values, **kwargs)
            elif issubclass(type(axis), int):
                new_self.base_values = None

        # keep a shared clustering only if it is identical across all rows
        if axis == 0 and self.clustering is not None and len(self.clustering.shape) == 3:
            if self.clustering.std(0).sum() < 1e-8:
                new_self.clustering = self.clustering[0]
            else:
                new_self.clustering = None

        new_self.op_history.append({
            "name": fname,
            "kwargs": kwargs,
            "prev_shape": self.shape,
            "collapsed_instances": axis == 0
        })

        return new_self
552
+
553
    def mean(self, axis):
        """ Numpy-style mean function.
        """
        return self._numpy_func("mean", axis=axis)

    def max(self, axis):
        """ Numpy-style max function.
        """
        return self._numpy_func("max", axis=axis)

    def min(self, axis):
        """ Numpy-style min function.
        """
        return self._numpy_func("min", axis=axis)
567
+
568
+ def sum(self, axis=None, grouping=None):
569
+ """ Numpy-style mean function.
570
+ """
571
+ if grouping is None:
572
+ return self._numpy_func("sum", axis=axis)
573
+ elif axis == 1 or len(self.shape) == 1:
574
+ return group_features(self, grouping)
575
+ else:
576
+ raise DimensionError("Only axis = 1 is supported for grouping right now...")
577
+
578
+ def hstack(self, other):
579
+ """ Stack two explanations column-wise.
580
+ """
581
+ assert self.shape[0] == other.shape[0], "Can't hstack explanations with different numbers of rows!"
582
+ assert np.max(np.abs(self.base_values - other.base_values)) < 1e-6, "Can't hstack explanations with different base values!"
583
+
584
+ new_exp = Explanation(
585
+ values=np.hstack([self.values, other.values]),
586
+ base_values=self.base_values,
587
+ data=self.data,
588
+ display_data=self.display_data,
589
+ instance_names=self.instance_names,
590
+ feature_names=self.feature_names,
591
+ output_names=self.output_names,
592
+ output_indexes=self.output_indexes,
593
+ lower_bounds=self.lower_bounds,
594
+ upper_bounds=self.upper_bounds,
595
+ error_std=self.error_std,
596
+ main_effects=self.main_effects,
597
+ hierarchical_values=self.hierarchical_values,
598
+ clustering=self.clustering,
599
+ )
600
+ return new_exp
601
+
602
+ # def reshape(self, *args):
603
+ # return self._numpy_func("reshape", newshape=args)
604
+
605
    @property
    def abs(self):
        """ Element-wise absolute value, returned as a new Explanation. """
        return self._numpy_func("abs")

    @property
    def identity(self):
        """ A no-op: returns this Explanation unchanged. """
        return self

    @property
    def argsort(self):
        """ Numpy-style argsort applied via ``_numpy_func``. """
        return self._numpy_func("argsort")

    @property
    def flip(self):
        """ Numpy-style flip applied via ``_numpy_func``. """
        return self._numpy_func("flip")
620
+
621
+
622
    def hclust(self, metric="sqeuclidean", axis=0):
        """ Computes an optimal leaf ordering sort order using hclustering.

        hclust(metric="sqeuclidean")

        Parameters
        ----------
        metric : string
            A metric supported by scipy clustering.

        axis : int
            The axis to cluster along (0 clusters rows; 1 clusters columns
            by transposing first).

        Returns
        -------
        Array of leaf indices from complete-linkage clustering with
        optimal leaf ordering.
        """
        values = self.values

        if len(values.shape) != 2:
            raise DimensionError("The hclust order only supports 2D arrays right now!")

        if axis == 1:
            values = values.T

        # compute a hierarchical clustering and return the optimal leaf ordering
        D = scipy.spatial.distance.pdist(values, metric)
        cluster_matrix = scipy.cluster.hierarchy.complete(D)
        inds = scipy.cluster.hierarchy.leaves_list(scipy.cluster.hierarchy.optimal_leaf_ordering(cluster_matrix, D))
        return inds
648
+
649
+ def sample(self, max_samples, replace=False, random_state=0):
650
+ """ Randomly samples the instances (rows) of the Explanation object.
651
+
652
+ Parameters
653
+ ----------
654
+ max_samples : int
655
+ The number of rows to sample. Note that if replace=False then less than
656
+ fewer than max_samples will be drawn if explanation.shape[0] < max_samples.
657
+
658
+ replace : bool
659
+ Sample with or without replacement.
660
+ """
661
+ prev_seed = np.random.seed(random_state)
662
+ inds = np.random.choice(self.shape[0], min(max_samples, self.shape[0]), replace=replace)
663
+ np.random.seed(prev_seed)
664
+ return self[list(inds)]
665
+
666
+ def _flatten_feature_names(self):
667
+ new_values = {}
668
+ for i in range(len(self.values)):
669
+ for s,v in zip(self.feature_names[i], self.values[i]):
670
+ if s not in new_values:
671
+ new_values[s] = []
672
+ new_values[s].append(v)
673
+ return new_values
674
+
675
+ def _use_data_as_feature_names(self):
676
+ new_values = {}
677
+ for i in range(len(self.values)):
678
+ for s,v in zip(self.data[i], self.values[i]):
679
+ if s not in new_values:
680
+ new_values[s] = []
681
+ new_values[s].append(v)
682
+ return new_values
683
+
684
    def percentile(self, q, axis=None):
        """ Numpy-style percentile of the values (and data) along an axis.

        Parameters
        ----------
        q : float or array-like
            Percentile(s) in [0, 100], passed to ``np.percentile``.
        axis : int, optional
            Axis to reduce along.
        """
        new_self = copy.deepcopy(self)
        if self.feature_names is not None and not is_1d(self.feature_names) and axis == 0:
            # ragged per-row feature names: group values by name, then take
            # the percentile of each group
            new_values = self._flatten_feature_names()
            new_self.feature_names = np.array(list(new_values.keys()))
            new_self.values = np.array([np.percentile(v, q) for v in new_values.values()])
            new_self.clustering = None
        else:
            new_self.values = np.percentile(new_self.values, q, axis)
            # NOTE(review): this assumes data is not None — np.percentile
            # would raise on a None data attribute; confirm callers.
            new_self.data = np.percentile(new_self.data, q, axis)
        #new_self.data = None
        new_self.op_history.append({
            "name": "percentile",
            "args": (axis,),
            "prev_shape": self.shape,
            "collapsed_instances": axis == 0
        })
        return new_self
702
+
703
def group_features(shap_values, feature_map):
    """ Sum the SHAP values (and data) of features that map to the same group name.

    Parameters
    ----------
    shap_values : shap.Explanation
        The explanation whose feature columns should be grouped. May be rank 1
        (a single row) or rank 2 (samples x features).

    feature_map : dict
        Maps original feature names to new (group) names. Features sharing the
        same new name have their values and data summed into one column.

    Returns
    -------
    shap.Explanation
        A new explanation with one column per group, in first-seen order.
    """
    # TODOsomeday: support and deal with clusterings
    reverse_map = {}
    for name in feature_map:
        reverse_map[feature_map[name]] = reverse_map.get(feature_map[name], []) + [name]

    curr_names = shap_values.feature_names
    sv_new = copy.deepcopy(shap_values)
    found = {}
    i = 0
    rank1 = len(shap_values.shape) == 1
    for name in curr_names:
        new_name = feature_map.get(name, name)
        if new_name in found:
            continue
        found[new_name] = True

        # sum every original column that maps into this group
        cols_to_sum = reverse_map.get(new_name, [new_name])
        old_inds = [curr_names.index(v) for v in cols_to_sum]

        if rank1:
            sv_new.values[i] = shap_values.values[old_inds].sum()
            sv_new.data[i] = shap_values.data[old_inds].sum()
        else:
            sv_new.values[:,i] = shap_values.values[:,old_inds].sum(1)
            sv_new.data[:,i] = shap_values.data[:,old_inds].sum(1)
        sv_new.feature_names[i] = new_name
        i += 1

    return Explanation(
        sv_new.values[:i] if rank1 else sv_new.values[:,:i],
        base_values = sv_new.base_values,
        data = sv_new.data[:i] if rank1 else sv_new.data[:,:i],
        # bugfix: the rank1 branch previously sliced display_data with [:,:i]
        # in BOTH branches, which raises IndexError for 1-D display_data
        display_data = None if sv_new.display_data is None else (sv_new.display_data[:i] if rank1 else sv_new.display_data[:,:i]),
        instance_names = None,
        feature_names = None if sv_new.feature_names is None else sv_new.feature_names[:i],
        output_names = None,
        output_indexes = None,
        lower_bounds = None,
        upper_bounds = None,
        error_std = None,
        main_effects = None,
        hierarchical_values = None,
        clustering = None
    )
749
+
750
def compute_output_dims(values, base_values, data, output_names):
    """ Uses the passed data to infer which dimensions correspond to the model's output.

    Parameters
    ----------
    values : array-like
        The attribution values whose trailing dimensions we want to classify.

    base_values : array-like or None
        Expected model output(s); used to infer the output shape when
        output_names is not given.

    data : array-like or None
        The model input data; when None the data shape is assumed to equal
        the values shape.

    output_names : array-like or None
        Names of the model outputs, if known.

    Returns
    -------
    tuple of int
        Indices of the dimensions of `values` that correspond to model outputs.
    """
    values_shape = _compute_shape(values)

    # input shape matches the data shape
    if data is not None:
        data_shape = _compute_shape(data)

    # if we are not given any data we assume it would be the same shape as the given values
    else:
        data_shape = values_shape

    # output shape is known from the base values or output names
    if output_names is not None:
        output_shape = _compute_shape(output_names)

        # if our output_names are per sample then we need to drop the sample dimension here
        if values_shape[-len(output_shape):] != output_shape and \
            values_shape[-len(output_shape)+1:] == output_shape[1:] and values_shape[0] == output_shape[0]:
            output_shape = output_shape[1:]

    elif base_values is not None:
        # base_values are (samples, *outputs), so drop the sample dimension
        output_shape = _compute_shape(base_values)[1:]
    else:
        output_shape = tuple()

    # anything between the data dims and the trailing output dims is interaction order;
    # the output dims are always the last len(output_shape) dimensions of values
    interaction_order = len(values_shape) - len(data_shape) - len(output_shape)
    output_dims = range(len(data_shape) + interaction_order, len(values_shape))
    return tuple(output_dims)
780
+
781
def is_1d(val):
    """ Return True when the first element of val is not itself a list or ndarray. """
    first = val[0]
    return not isinstance(first, (list, np.ndarray))
783
+
784
class Op:
    """ Base class for operations recorded on Explanation objects (see Percentile). """
    pass


class Percentile(Op):
    """ Records a percentile operation, used when building an Explanation's repr. """

    def __init__(self, percentile):
        # the percentile value (e.g. 95) this op represents
        self.percentile = percentile

    def add_repr(self, s, verbose=False):
        """ Wrap the string s in a percentile(...) description. """
        return f"percentile({s}, {self.percentile})"
793
+
794
+ def _first_item(x):
795
+ for item in x:
796
+ return item
797
+ return None
798
+
799
def _compute_shape(x):
    """ Recursively infer the shape of a (possibly ragged) nested structure.

    Returns a tuple like numpy's .shape, except that None marks a dimension
    whose size is inconsistent across elements (ragged), and (None,) is
    returned for sequences of strings, whose length carries no structural
    meaning here.
    """
    # scalars (and strings) have no dimensions
    if not hasattr(x, "__len__") or isinstance(x, str):
        return tuple()
    elif not scipy.sparse.issparse(x) and len(x) > 0 and isinstance(_first_item(x), str):
        # a sequence of strings is treated as one unsized dimension
        return (None,)
    else:
        if isinstance(x, dict):
            # a dict contributes its length plus the shape of one of its values
            return (len(x),) + _compute_shape(x[next(iter(x))])

        # 2D arrays we just take their shape as-is
        if len(getattr(x, "shape", tuple())) > 1:
            return x.shape

        # 1D arrays we need to look inside
        if len(x) == 0:
            return (0,)
        elif len(x) == 1:
            return (1,) + _compute_shape(_first_item(x))
        else:
            first_shape = _compute_shape(_first_item(x))
            if first_shape == tuple():
                return (len(x),)
            else: # we have an array of arrays...
                # track which inner dimensions agree across all elements
                matches = np.ones(len(first_shape), dtype=bool)
                for i in range(1, len(x)):
                    shape = _compute_shape(x[i])
                    assert len(shape) == len(first_shape), "Arrays in Explanation objects must have consistent inner dimensions!"
                    for j in range(0, len(shape)):
                        matches[j] &= shape[j] == first_shape[j]
                # dimensions that disagree anywhere are reported as None (ragged)
                return (len(x),) + tuple(first_shape[j] if match else None for j, match in enumerate(matches))
829
+
830
class Cohorts:
    """ A named collection of Explanation objects, one per cohort.

    Indexing, attribute access, and calls are broadcast to every cohort and
    returned as a new Cohorts object.
    """

    def __init__(self, **kwargs):
        self.cohorts = kwargs
        for cohort in self.cohorts.values():
            assert isinstance(cohort, Explanation), "All the arguments to a Cohorts set must be Explanation objects!"

    def __getitem__(self, item):
        # slice every cohort the same way
        sliced = Cohorts()
        sliced.cohorts = {key: exp[item] for key, exp in self.cohorts.items()}
        return sliced

    def __getattr__(self, name):
        # forward attribute access to each cohort
        forwarded = Cohorts()
        forwarded.cohorts = {key: getattr(exp, name) for key, exp in self.cohorts.items()}
        return forwarded

    def __call__(self, *args, **kwargs):
        # call each cohort with the same arguments
        called = Cohorts()
        called.cohorts = {key: exp(*args, **kwargs) for key, exp in self.cohorts.items()}
        return called

    def __repr__(self):
        return f"<shap._explanation.Cohorts object with {len(self.cohorts)} cohorts of sizes: {[v.shape for v in self.cohorts.values()]}>"
856
+
857
+
858
def _auto_cohorts(shap_values, max_cohorts):
    """ This uses a DecisionTreeRegressor to build a group of cohorts with similar SHAP values.

    Each leaf of the fitted tree becomes one cohort, named by the conjunction
    of the decision-rule thresholds on the path to that leaf.
    """

    # fit a decision tree that well separates the SHAP values
    m = sklearn.tree.DecisionTreeRegressor(max_leaf_nodes=max_cohorts)
    m.fit(shap_values.data, shap_values.values)

    # group instances by their decision paths
    paths = m.decision_path(shap_values.data).toarray()
    path_names = []

    # mark each instance with a path name
    for i in range(shap_values.shape[0]):
        name = ""
        for j in range(len(paths[i])):
            if paths[i,j] > 0:
                feature = m.tree_.feature[j]
                threshold = m.tree_.threshold[j]
                # NOTE(review): data is indexed before the `feature >= 0` leaf check
                # below; leaf nodes have a negative feature index so this reads an
                # arbitrary column — harmless since the value is then unused, but
                # worth confirming/reordering upstream
                val = shap_values.data[i,feature]
                if feature >= 0:
                    name += str(shap_values.feature_names[feature])
                    if val < threshold:
                        name += " < "
                    else:
                        name += " >= "
                    name += str(threshold) + " & "
        path_names.append(name[:-3]) # the -3 strips off the last unneeded ' & '
    path_names = np.array(path_names)

    # split the instances into cohorts by their path names
    cohorts = {}
    for name in np.unique(path_names):
        cohorts[name] = shap_values[path_names == name]

    return Cohorts(**cohorts)
894
+
895
def list_wrap(x):
    """ A helper to patch things since slicer doesn't handle arrays of arrays (it does handle lists of arrays)
    """
    is_ragged_array = (
        isinstance(x, np.ndarray)
        and len(x.shape) == 1
        and isinstance(x[0], np.ndarray)
    )
    return list(x) if is_ragged_array else x
lib/shap/_serializable.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import inspect
3
+ import logging
4
+ import pickle
5
+
6
+ import cloudpickle
7
+ import numpy as np
8
+
9
+ log = logging.getLogger('shap')
10
+
11
class Serializable:
    """ This is the superclass of all serializable objects.
    """

    def save(self, out_file):
        """ Save the model to the given file stream.
        """
        # record the concrete type so load() can dispatch to the right subclass
        pickle.dump(type(self), out_file)

    @classmethod
    def load(cls, in_file, instantiate=True):
        """ This is meant to be overridden by subclasses and called with super.

        We return constructor argument values when not being instantiated. Since there are no
        constructor arguments for the Serializable class we just return an empty dictionary.
        """
        return cls._instantiated_load(in_file) if instantiate else {}

    @classmethod
    def _instantiated_load(cls, in_file, **kwargs):
        """ This is meant to be overridden by subclasses and called with super.

        We return constructor argument values (we have no values to load in this abstract class).
        """
        loaded_type = pickle.load(in_file)
        if loaded_type is None:
            return None

        valid_type = inspect.isclass(loaded_type) and (issubclass(loaded_type, cls) or loaded_type is cls)
        if not valid_type:
            raise Exception(f"Invalid object type loaded from file. {loaded_type} is not a subclass of {cls}.")

        # gather the saved constructor arguments, then keep only the ones the
        # concrete type's __init__ actually accepts
        constructor_args = loaded_type.load(in_file, instantiate=False, **kwargs)
        accepted = inspect.getfullargspec(loaded_type.__init__)[0]
        return loaded_type(**{k: v for k, v in constructor_args.items() if k in accepted})
48
+
49
+
50
class Serializer:
    """ Save data items to an input stream.

    Used as a context manager: __enter__ writes a block header (serializer
    version, block name, block version) and __exit__ writes an end-of-block
    token, so Deserializer can validate the stream. The pickle.dump order here
    defines the on-disk format and must mirror Deserializer exactly.
    """
    def __init__(self, out_stream, block_name, version):
        # binary file-like stream we pickle into
        self.out_stream = out_stream
        # name/version identifying the data block being written
        self.block_name = block_name
        self.block_version = version
        self.serializer_version = 0 # update this when the serializer changes

    def __enter__(self):
        # write the block header: serializer version, block name, block version
        log.debug("serializer_version = %d", self.serializer_version)
        pickle.dump(self.serializer_version, self.out_stream)
        log.debug("block_name = %s", self.block_name)
        pickle.dump(self.block_name, self.out_stream)
        log.debug("block_version = %d", self.block_version)
        pickle.dump(self.block_version, self.out_stream)
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        # terminate the block so Deserializer knows where it ends
        log.debug("END_BLOCK___")
        pickle.dump("END_BLOCK___", self.out_stream)

    def save(self, name, value, encoder="auto"):
        """ Dump a data item to the current input stream.

        Each item is written as (name, encoder_name, payload) so the matching
        Deserializer.load can validate the name and pick the right decoder.
        """
        log.debug("name = %s", name)
        pickle.dump(name, self.out_stream)
        if encoder is None or encoder is False:
            # no payload is written; Deserializer will return None for this item
            log.debug("encoder_name = %s", "no_encoder")
            pickle.dump("no_encoder", self.out_stream)
        elif callable(encoder):
            # caller must supply the matching custom decoder at load time
            log.debug("encoder_name = %s", "custom_encoder")
            pickle.dump("custom_encoder", self.out_stream)
            encoder(value, self.out_stream)
        elif encoder == ".save" or (isinstance(value, Serializable) and encoder == "auto"):
            log.debug("encoder_name = %s", "serializable.save")
            pickle.dump("serializable.save", self.out_stream)
            if len(inspect.getfullargspec(value.save)[0]) == 3: # backward compat for MLflow, can remove 4/1/2021
                value.save(self.out_stream, value)
            else:
                value.save(self.out_stream)
        elif encoder == "auto":
            # primitives go through pickle; everything else through cloudpickle
            if isinstance(value, (int, float, str)):
                log.debug("encoder_name = %s", "pickle.dump")
                pickle.dump("pickle.dump", self.out_stream)
                pickle.dump(value, self.out_stream)
            else:
                log.debug("encoder_name = %s", "cloudpickle.dump")
                pickle.dump("cloudpickle.dump", self.out_stream)
                cloudpickle.dump(value, self.out_stream)
        else:
            raise ValueError(f"Unknown encoder type '{encoder}' given for serialization!")
        log.debug("value = %s", str(value))
103
+
104
class Deserializer:
    """ Load data items from an input stream.

    The mirror of Serializer: used as a context manager, __enter__ validates
    the block header (serializer version, block name, block version) and
    __exit__ consumes items until the end-of-block token is found.
    """

    def __init__(self, in_stream, block_name, min_version, max_version):
        self.in_stream = in_stream
        self.block_name = block_name
        # range of block versions this reader understands
        self.block_min_version = min_version
        self.block_max_version = max_version

        # update these when the serializer changes
        self.serializer_min_version = 0
        self.serializer_max_version = 0

    def __enter__(self):

        # confirm the serializer version
        serializer_version = pickle.load(self.in_stream)
        log.debug("serializer_version = %d", serializer_version)
        if serializer_version < self.serializer_min_version:
            raise ValueError(
                f"The file being loaded was saved with a serializer version of {serializer_version}, " + \
                f"but the current deserializer in SHAP requires at least version {self.serializer_min_version}."
            )
        if serializer_version > self.serializer_max_version:
            raise ValueError(
                f"The file being loaded was saved with a serializer version of {serializer_version}, " + \
                f"but the current deserializer in SHAP only support up to version {self.serializer_max_version}."
            )

        # confirm the block name
        block_name = pickle.load(self.in_stream)
        log.debug("block_name = %s", block_name)
        if block_name != self.block_name:
            raise ValueError(
                f"The next data block in the file being loaded was supposed to be {self.block_name}, " + \
                f"but the next block found was {block_name}."
            )

        # confirm the block version
        block_version = pickle.load(self.in_stream)
        log.debug("block_version = %d", block_version)
        if block_version < self.block_min_version:
            raise ValueError(
                f"The file being loaded was saved with a block version of {block_version}, " + \
                f"but the current deserializer in SHAP requires at least version {self.block_min_version}."
            )
        if block_version > self.block_max_version:
            raise ValueError(
                f"The file being loaded was saved with a block version of {block_version}, " + \
                f"but the current deserializer in SHAP only support up to version {self.block_max_version}."
            )
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        # confirm the block end token, skipping over (bounded) unread items
        for _ in range(100):
            end_token = pickle.load(self.in_stream)
            log.debug("end_token = %s", end_token)
            if end_token == "END_BLOCK___":
                return
            self._load_data_value()
        # bugfix: typo "wsa" -> "was" in the error message
        raise ValueError(
            f"The data block end token was not found for the block {self.block_name}."
        )

    def load(self, name, decoder=None):
        """ Load a data item from the current input stream.
        """
        # confirm the block name
        loaded_name = pickle.load(self.in_stream)
        log.debug("loaded_name = %s", loaded_name)
        # bugfix: removed stray debug print("loaded_name", loaded_name) left in library code
        if loaded_name != name:
            raise ValueError(
                f"The next data item in the file being loaded was supposed to be {name}, " + \
                f"but the next block found was {loaded_name}."
            ) # We should eventually add support for skipping over unused data items in old formats...

        value = self._load_data_value(decoder)
        log.debug("value = %s", str(value))
        return value

    def _load_data_value(self, decoder=None):
        # dispatch on the encoder name recorded by Serializer.save
        encoder_name = pickle.load(self.in_stream)
        log.debug("encoder_name = %s", encoder_name)
        if encoder_name == "custom_encoder" or callable(decoder):
            assert callable(decoder), "You must provide a callable custom decoder for the data item {name}!"
            return decoder(self.in_stream)
        if encoder_name == "no_encoder":
            return None
        if encoder_name == "serializable.save":
            return Serializable.load(self.in_stream)
        if encoder_name == "numpy.save":
            return np.load(self.in_stream)
        if encoder_name == "pickle.dump":
            return pickle.load(self.in_stream)
        if encoder_name == "cloudpickle.dump":
            return cloudpickle.load(self.in_stream)

        raise ValueError(f"Unsupported encoder type found: {encoder_name}")
lib/shap/_version.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file generated by setuptools_scm
2
+ # don't change, don't track in version control
3
+ TYPE_CHECKING = False
4
+ if TYPE_CHECKING:
5
+ from typing import Tuple, Union
6
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
7
+ else:
8
+ VERSION_TUPLE = object
9
+
10
+ version: str
11
+ __version__: str
12
+ __version_tuple__: VERSION_TUPLE
13
+ version_tuple: VERSION_TUPLE
14
+
15
+ __version__ = version = '0.44.1'
16
+ __version_tuple__ = version_tuple = (0, 44, 1)
lib/shap/actions/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from ._action import Action
2
+
3
+ __all__ = ["Action"]
lib/shap/actions/_action.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ class Action:
2
+ """ Abstract action class.
3
+ """
4
+ def __lt__(self, other_action):
5
+ return self.cost < other_action.cost
6
+
7
+ def __repr__(self):
8
+ return f"<Action '{self.__str__()}'>"
lib/shap/actions/_optimizer.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import queue
3
+ import warnings
4
+
5
+ from ..utils._exceptions import ConvergenceError, InvalidAction
6
+ from ._action import Action
7
+
8
+
9
class ActionOptimizer:
    """ Best-first search for the cheapest set of actions that satisfies a model.

    Actions are arranged into mutually exclusive groups: a bare Action forms a
    singleton group, and a list of Actions forms a group of alternatives
    (sorted cheapest-first). Candidate action sets are expanded from a
    priority queue ordered by total cost, so the first satisfying set
    returned is the cheapest one explored.
    """

    def __init__(self, model, actions):
        # model: callable returning truthy once the (mutated) args are acceptable
        self.model = model
        # bugfix: typo "subjust" -> "subject" in the user-facing warning
        warnings.warn(
            "Note that ActionOptimizer is still in an alpha state and is subject to API changes."
        )
        # actions go into mutually exclusive groups
        self.action_groups = []
        for group in actions:

            if issubclass(type(group), Action):
                # a single action becomes its own singleton group
                group._group_index = len(self.action_groups)
                group._grouped_index = 0
                self.action_groups.append([copy.copy(group)])
            elif issubclass(type(group), list):
                # a list of alternatives is sorted cheapest-first
                group = sorted([copy.copy(v) for v in group], key=lambda a: a.cost)
                for i, v in enumerate(group):
                    v._group_index = len(self.action_groups)
                    v._grouped_index = i
                self.action_groups.append(group)
            else:
                raise InvalidAction(
                    "A passed action was not an Action or list of actions!"
                )

    def __call__(self, *args, max_evals=10000):
        """ Return the cheapest list of actions that makes self.model(*args) truthy.

        Raises
        ------
        ConvergenceError
            If no satisfying action set is found within max_evals expansions.
        """

        # init our queue with all the least costly actions
        q = queue.PriorityQueue()
        for i in range(len(self.action_groups)):
            group = self.action_groups[i]
            q.put((group[0].cost, [group[0]]))

        nevals = 0
        while not q.empty():

            # see if we have exceeded our runtime budget
            nevals += 1
            if nevals > max_evals:
                raise ConvergenceError(
                    f"Failed to find a solution with max_evals={max_evals}! Try reducing the number of actions or increasing max_evals."
                )

            # get the next cheapest set of actions we can do
            cost, actions = q.get()

            # apply those actions on a deep copy so the caller's args are untouched
            args_tmp = copy.deepcopy(args)
            for a in actions:
                a(*args_tmp)

            # if the model is now satisfied we are done!!
            v = self.model(*args_tmp)
            if v:
                return actions

            # if not then we add all possible follow-on actions to our queue
            else:
                for i in range(len(self.action_groups)):
                    group = self.action_groups[i]

                    # look to see if we already have an action from this group, if so we need to
                    # move to a more expensive action in the same group
                    next_ind = 0
                    prev_in_group = -1
                    for j, a in enumerate(actions):
                        if a._group_index == i:
                            next_ind = max(next_ind, a._grouped_index + 1)
                            prev_in_group = j

                    # we are adding a new action type
                    if prev_in_group == -1:
                        new_actions = actions + [group[next_ind]]
                    # we are moving from one action to a more expensive one in the same group
                    elif next_ind < len(group):
                        new_actions = copy.copy(actions)
                        new_actions[prev_in_group] = group[next_ind]
                    # we don't have a more expensive action left in this group
                    else:
                        new_actions = None

                    # add the new option to our queue
                    if new_actions is not None:
                        q.put((sum(a.cost for a in new_actions), new_actions))
lib/shap/benchmark/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ._compute import ComputeTime
2
+ from ._explanation_error import ExplanationError
3
+ from ._result import BenchmarkResult
4
+ from ._sequential import SequentialMasker
5
+
6
+ # from . import framework
7
+ # from .. import datasets
8
+
9
+ __all__ = ["ComputeTime", "ExplanationError", "BenchmarkResult", "SequentialMasker"]
lib/shap/benchmark/_compute.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ._result import BenchmarkResult
2
+
3
+
4
class ComputeTime:
    """ Extracts a runtime benchmark result from the passed Explanation.
    """

    def __call__(self, explanation, name):
        # normalize total compute time to a per-sample figure
        per_sample_time = explanation.compute_time / explanation.shape[0]
        return BenchmarkResult("compute time", name, value=per_sample_time)
lib/shap/benchmark/_explanation_error.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ import numpy as np
4
+ from tqdm.auto import tqdm
5
+
6
+ from shap import Explanation, links
7
+ from shap.maskers import FixedComposite, Image, Text
8
+ from shap.utils import MaskedModel, partition_tree_shuffle
9
+ from shap.utils._exceptions import DimensionError
10
+
11
+ from ._result import BenchmarkResult
12
+
13
+
14
class ExplanationError:
    """ A measure of the explanation error relative to a model's actual output.

    This benchmark metric measures the discrepancy between the output of the model predicted by an
    attribution explanation vs. the actual output of the model. This discrepancy is measured over
    many masking patterns drawn from permutations of the input features.

    For explanations (like Shapley values) that explain the difference between one alternative and another
    (for example a current sample and typical background feature values) there is possible explanation error
    for every pattern of mixing foreground and background, or other words every possible masking pattern.
    In this class we compute the standard deviation over these explanation errors where masking patterns
    are drawn from prefixes of random feature permutations. This seems natural, and aligns with Shapley value
    computations, but of course you could choose to summarize explanation errors in others ways as well.
    """

    def __init__(self, masker, model, *model_args, batch_size=500, num_permutations=10, link=links.identity, linearize_link=True, seed=38923):
        """ Build a new explanation error benchmarker with the given masker, model, and model args.

        Parameters
        ----------
        masker : function or shap.Masker
            The masker defines how we hide features during the perturbation process.

        model : function or shap.Model
            The model we want to evaluate explanations against.

        model_args : ...
            The list of arguments we will give to the model that we will have explained. When we later call this benchmark
            object we should pass explanations that have been computed on this same data.

        batch_size : int
            The maximum batch size we should use when calling the model. For some large NLP models this needs to be set
            lower (at say 1) to avoid running out of GPU memory.

        num_permutations : int
            How many permutations we will use to estimate the average explanation error for each sample. If you are running
            this benchmark on a large dataset with many samples then you can reduce this value since the final result is
            averaged over samples as well and the averages of both directly combine to reduce variance. So for 10k samples
            num_permutations=1 is appropreiate.

        link : function
            Allows for a non-linear link function to be used to bringe between the model output space and the explanation
            space.

        linearize_link : bool
            Non-linear links can destroy additive separation in generalized linear models, so by linearizing the link we can
            retain additive separation. See upcoming paper/doc for details.
        """

        self.masker = masker
        self.model = model
        self.model_args = model_args
        self.num_permutations = num_permutations
        self.link = link
        self.linearize_link = linearize_link
        # NOTE(review): model_args is assigned twice (also a few lines above) — redundant but harmless
        self.model_args = model_args
        self.batch_size = batch_size
        self.seed = seed

        # user must give valid masker
        # classify the data modality from the (possibly wrapped) masker type
        underlying_masker = masker.masker if isinstance(masker, FixedComposite) else masker
        if isinstance(underlying_masker, Text):
            self.data_type = "text"
        elif isinstance(underlying_masker, Image):
            self.data_type = "image"
        else:
            self.data_type = "tabular"

    # NOTE(review): indices=[] is a mutable default argument (it is never mutated
    # or even read here, but it should be None by convention)
    def __call__(self, explanation, name, step_fraction=0.01, indices=[], silent=False):
        """ Run this benchmark on the given explanation.

        Parameters
        ----------
        explanation : numpy.ndarray or shap.Explanation
            Attribution values for the rows of self.model_args.

        name : str
            Label used in the returned BenchmarkResult and the progress bar.

        step_fraction : float
            Fraction of features unmasked per step along each permutation.

        silent : bool
            Disable the progress bar when True.
        """

        if isinstance(explanation, np.ndarray):
            attributions = explanation
        elif isinstance(explanation, Explanation):
            attributions = explanation.values
        else:
            raise ValueError("The passed explanation must be either of type numpy.ndarray or shap.Explanation!")

        if len(attributions) != len(self.model_args[0]):
            emsg = (
                "The explanation passed must have the same number of rows as "
                "the self.model_args that were passed!"
            )
            raise DimensionError(emsg)

        # it is important that we choose the same permutations for the different explanations we are comparing
        # so as to avoid needless noise
        # NOTE(review): np.random.seed() returns None, so np.random.seed(old_seed)
        # below re-seeds the global RNG from entropy rather than restoring the
        # caller's state — confirm and consider a local RandomState instead
        old_seed = np.random.seed()
        np.random.seed(self.seed)

        pbar = None
        start_time = time.time()
        svals = []
        mask_vals = []

        for i, args in enumerate(zip(*self.model_args)):

            if len(args[0].shape) != len(attributions[i].shape):
                raise ValueError("The passed explanation must have the same dim as the model_args and must not have a vector output!")

            feature_size = np.prod(attributions[i].shape)
            sample_attributions = attributions[i].flatten()

            # compute any custom clustering for this row
            row_clustering = None
            if getattr(self.masker, "clustering", None) is not None:
                if isinstance(self.masker.clustering, np.ndarray):
                    row_clustering = self.masker.clustering
                elif callable(self.masker.clustering):
                    row_clustering = self.masker.clustering(*args)
                else:
                    raise NotImplementedError("The masker passed has a .clustering attribute that is not yet supported by the ExplanationError benchmark!")

            masked_model = MaskedModel(self.model, self.masker, self.link, self.linearize_link, *args)

            total_values = None
            for _ in range(self.num_permutations):
                masks = []
                mask = np.zeros(feature_size, dtype=bool)
                masks.append(mask.copy())
                ordered_inds = np.arange(feature_size)

                # shuffle the indexes so we get a random permutation ordering
                if row_clustering is not None:
                    inds_mask = np.ones(feature_size, dtype=bool)
                    partition_tree_shuffle(ordered_inds, inds_mask, row_clustering)
                else:
                    np.random.shuffle(ordered_inds)

                # unmask the features in permutation order, step_fraction at a time,
                # recording each intermediate mask
                increment = max(1, int(feature_size * step_fraction))
                for j in range(0, feature_size, increment):
                    mask[ordered_inds[np.arange(j, min(feature_size, j+increment))]] = True
                    masks.append(mask.copy())
                mask_vals.append(masks)

                # evaluate the masked model in batches over all recorded masks
                values = []
                masks_arr = np.array(masks)
                for j in range(0, len(masks_arr), self.batch_size):
                    values.append(masked_model(masks_arr[j:j + self.batch_size]))
                values = np.concatenate(values)
                base_value = values[0]
                # squared error between the model output and the additive prediction
                # implied by the attributions of the unmasked features
                for j, v in enumerate(values):
                    values[j] = (v - (base_value + np.sum(sample_attributions[masks_arr[j]])))**2

                if total_values is None:
                    total_values = values
                else:
                    total_values += values
            total_values /= self.num_permutations

            svals.append(total_values)

            if pbar is None and time.time() - start_time > 5:
                pbar = tqdm(total=len(self.model_args[0]), disable=silent, leave=False, desc=f"ExplanationError for {name}")
                pbar.update(i+1)
            if pbar is not None:
                pbar.update(1)

        if pbar is not None:
            pbar.close()

        svals = np.array(svals)

        # reset the random seed so we don't mess up the caller
        np.random.seed(old_seed)

        # NOTE(review): the final value is computed from total_values, which at this
        # point only holds the LAST sample's averaged errors; svals (all samples) is
        # built above but unused — confirm whether svals was intended here
        return BenchmarkResult("explanation error", name, value=np.sqrt(np.sum(total_values)/len(total_values)))
lib/shap/benchmark/_result.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import sklearn
3
+
4
# Default orientation of each metric's scalar value: +1 means larger is better,
# -1 means smaller is better (consumed by BenchmarkResult when value_sign is
# not given explicitly).
sign_defaults = {
    "keep positive": 1,
    "keep negative": -1,
    "remove positive": -1,
    "remove negative": 1,
    "compute time": -1,
    "keep absolute": -1, # the absolute signs are defaults that make sense when scoring losses
    "remove absolute": 1,
    "explanation error": -1
}
14
+
15
class BenchmarkResult:
    """ The result of a benchmark run.

    Holds either a scalar `value` or a curve (`curve_x`/`curve_y`); when only a
    curve is given, the scalar is derived as the area under the shifted curve.
    """

    def __init__(self, metric, method, value=None, curve_x=None, curve_y=None, curve_y_std=None, value_sign=None):
        self.metric = metric
        self.method = method
        self.value = value
        self.curve_x = curve_x
        self.curve_y = curve_y
        self.curve_y_std = curve_y_std
        self.value_sign = value_sign
        # fall back to the conventional orientation for known metrics
        if self.value_sign is None:
            self.value_sign = sign_defaults.get(self.metric)
        # summarize a curve into one scalar: area under the curve shifted to start at 0
        if self.value is None:
            self.value = sklearn.metrics.auc(curve_x, (np.array(curve_y) - curve_y[0]))

    @property
    def full_name(self):
        return f"{self.method} {self.metric}"
lib/shap/benchmark/_sequential.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ import matplotlib.pyplot as pl
4
+ import numpy as np
5
+ import pandas as pd
6
+ import sklearn
7
+ from tqdm.auto import tqdm
8
+
9
+ from shap import Explanation, links
10
+ from shap.maskers import FixedComposite, Image, Text
11
+ from shap.utils import MaskedModel
12
+
13
+ from ._result import BenchmarkResult
14
+
15
+
16
class SequentialMasker:
    """ Bind a model, masker and evaluation data to a SequentialPerturbation benchmark.

    Parameters
    ----------
    mask_type : str
        Passed through as the perturbation type ("keep" or "remove").

    sort_order : str
        "positive", "negative" or "absolute" attribution ordering.

    masker : shap masker
        The masker used to hide features from the model.

    model : callable or object with a ``predict`` method
        The model being evaluated.

    *model_args : numpy arrays
        The evaluation samples; DataFrames are rejected (see below).

    batch_size : int
        How many masked samples are sent to the model per call.
    """

    def __init__(self, mask_type, sort_order, masker, model, *model_args, batch_size=500):

        # DataFrames iterate over column labels rather than rows, which would
        # silently break the per-sample loop inside SequentialPerturbation
        for arg in model_args:
            if isinstance(arg, pd.DataFrame):
                raise TypeError("DataFrame arguments dont iterate correctly, pass numpy arrays instead!")

        self.inner = SequentialPerturbation(
            model, masker, sort_order, mask_type
        )
        self.model_args = model_args
        self.batch_size = batch_size

    def __call__(self, explanation, name, **kwargs):
        """ Score `explanation` against the bound data, returning a BenchmarkResult. """
        return self.inner(name, explanation, *self.model_args, batch_size=self.batch_size, **kwargs)
56
+
57
class SequentialPerturbation:
    """ Score an explanation by perturbing features in attribution order.

    Starting from either a fully masked input (perturbation="keep") or a fully
    unmasked input (perturbation="remove"), features are flipped one increment
    at a time in the order implied by the attribution values, and the model
    output is recorded after each step to build a performance curve.
    """

    def __init__(self, model, masker, sort_order, perturbation, linearize_link=False):
        # self.f = lambda masked, x, index: model.predict(masked)
        # accept either a plain callable or an object exposing .predict
        self.model = model if callable(model) else model.predict
        self.masker = masker
        self.sort_order = sort_order
        self.perturbation = perturbation
        self.linearize_link = linearize_link

        # define our sort order: each map returns feature indices ordered by
        # most-positive, most-negative, or largest-magnitude attribution first
        if self.sort_order == "positive":
            self.sort_order_map = lambda x: np.argsort(-x)
        elif self.sort_order == "negative":
            self.sort_order_map = lambda x: np.argsort(x)
        elif self.sort_order == "absolute":
            self.sort_order_map = lambda x: np.argsort(-abs(x))
        else:
            raise ValueError("sort_order must be either \"positive\", \"negative\", or \"absolute\"!")

        # user must give valid masker; FixedComposite wraps the real masker
        underlying_masker = masker.masker if isinstance(masker, FixedComposite) else masker
        if isinstance(underlying_masker, Text):
            self.data_type = "text"
        elif isinstance(underlying_masker, Image):
            self.data_type = "image"
        else:
            # any other masker type falls back to tabular handling
            self.data_type = "tabular"
            #raise ValueError("masker must be for \"tabular\", \"text\", or \"image\"!")

        # results accumulated across every __call__ / score invocation
        self.score_values = []
        self.score_aucs = []
        self.labels = []

    def __call__(self, name, explanation, *model_args, percent=0.01, indices=[], y=None, label=None, silent=False, debug_mode=False, batch_size=10):
        """ Run the perturbation benchmark over `model_args` using `explanation`.

        Returns a BenchmarkResult, or (mask_vals, curves, aucs) when debug_mode.
        NOTE(review): `indices=[]` is a mutable default argument and appears
        unused in this method — confirm before removing.
        """
        # if explainer is already the attributions
        if isinstance(explanation, np.ndarray):
            attributions = explanation
        elif isinstance(explanation, Explanation):
            attributions = explanation.values
        else:
            raise ValueError("The passed explanation must be either of type numpy.ndarray or shap.Explanation!")

        assert len(attributions) == len(model_args[0]), "The explanation passed must have the same number of rows as the model_args that were passed!"

        if label is None:
            label = "Score %d" % len(self.score_values)

        # convert dataframes
        # if isinstance(X, (pd.Series, pd.DataFrame)):
        #     X = X.values

        # convert all single-sample vectors to matrices
        # if not hasattr(attributions[0], "__len__"):
        #     attributions = np.array([attributions])
        # if not hasattr(X[0], "__len__") and self.data_type == "tabular":
        #     X = np.array([X])

        pbar = None
        start_time = time.time()
        svals = []      # per-sample model-output curves
        mask_vals = []  # per-sample list of the masks that were evaluated

        for i, args in enumerate(zip(*model_args)):
            # if self.data_type == "image":
            #     x_shape, y_shape = attributions[i].shape[0], attributions[i].shape[1]
            #     feature_size = np.prod([x_shape, y_shape])
            #     sample_attributions = attributions[i].mean(2).reshape(feature_size, -1)
            #     data = X[i].flatten()
            #     mask_shape = X[i].shape
            # else:
            feature_size = np.prod(attributions[i].shape)
            sample_attributions = attributions[i].flatten()
            # data = X[i]
            # mask_shape = feature_size

            self.masked_model = MaskedModel(self.model, self.masker, links.identity, self.linearize_link, *args)

            masks = []

            # "remove" starts from an all-True mask, "keep" from all-False
            mask = np.ones(feature_size, dtype=bool) * (self.perturbation == "remove")
            masks.append(mask.copy())

            ordered_inds = self.sort_order_map(sample_attributions)
            # flip `increment` features per step (at least one)
            increment = max(1,int(feature_size*percent))
            for j in range(0, feature_size, increment):
                oind_list = [ordered_inds[t] for t in range(j, min(feature_size, j+increment))]

                for oind in oind_list:
                    # skip features whose attribution sign disagrees with the sort order
                    if not ((self.sort_order == "positive" and sample_attributions[oind] <= 0) or \
                            (self.sort_order == "negative" and sample_attributions[oind] >= 0)):
                        mask[oind] = self.perturbation == "keep"

                masks.append(mask.copy())

            mask_vals.append(masks)

            # mask_size = len(range(0, feature_size, increment)) + 1
            # evaluate all masks for this sample in batches
            values = []
            masks_arr = np.array(masks)
            for j in range(0, len(masks_arr), batch_size):
                values.append(self.masked_model(masks_arr[j:j + batch_size]))
            values = np.concatenate(values)

            svals.append(values)

            # only show a progress bar once the run has taken more than 5s
            if pbar is None and time.time() - start_time > 5:
                pbar = tqdm(total=len(model_args[0]), disable=silent, leave=False, desc="SequentialMasker")
                pbar.update(i+1)
            if pbar is not None:
                pbar.update(1)

        if pbar is not None:
            pbar.close()

        self.score_values.append(np.array(svals))

        # if self.sort_order == "negative":
        #     curve_sign = -1
        # else:
        curve_sign = 1

        self.labels.append(label)

        # resample every per-sample curve onto a common 100-point grid,
        # then average them into one mean curve with standard errors
        xs = np.linspace(0, 1, 100)
        curves = np.zeros((len(self.score_values[-1]), len(xs)))
        for j in range(len(self.score_values[-1])):
            xp = np.linspace(0, 1, len(self.score_values[-1][j]))
            yp = self.score_values[-1][j]
            curves[j,:] = np.interp(xs, xp, yp)
        ys = curves.mean(0)
        std = curves.std(0) / np.sqrt(curves.shape[0])
        auc = sklearn.metrics.auc(np.linspace(0, 1, len(ys)), curve_sign*(ys-ys[0]))

        if not debug_mode:
            return BenchmarkResult(self.perturbation + " " + self.sort_order, name, curve_x=xs, curve_y=ys, curve_y_std=std)
        else:
            # debug mode returns the raw masks and per-sample curves/aucs
            aucs = []
            for j in range(len(self.score_values[-1])):
                curve = curves[j,:]
                auc = sklearn.metrics.auc(np.linspace(0, 1, len(curve)), curve_sign*(curve-curve[0]))
                aucs.append(auc)
            return mask_vals, curves, aucs

    def score(self, explanation, X, percent=0.01, y=None, label=None, silent=False, debug_mode=False):
        '''
        Will be deprecated once MaskedModel is in complete support

        Legacy scoring path that calls the masker/model directly rather than
        through MaskedModel.
        NOTE(review): this method uses `self.f`, whose assignment in __init__
        is commented out, so as written it raises AttributeError — confirm
        whether this path is intentionally dead.
        '''
        # if explainer is already the attributions
        if isinstance(explanation, np.ndarray):
            attributions = explanation
        elif isinstance(explanation, Explanation):
            attributions = explanation.values

        if label is None:
            label = "Score %d" % len(self.score_values)

        # convert dataframes
        if isinstance(X, (pd.Series, pd.DataFrame)):
            X = X.values

        # convert all single-sample vectors to matrices
        if not hasattr(attributions[0], "__len__"):
            attributions = np.array([attributions])
        if not hasattr(X[0], "__len__") and self.data_type == "tabular":
            X = np.array([X])

        pbar = None
        start_time = time.time()
        svals = []
        mask_vals = []

        for i in range(len(X)):
            if self.data_type == "image":
                # treat each pixel as one feature, averaging over channels
                x_shape, y_shape = attributions[i].shape[0], attributions[i].shape[1]
                feature_size = np.prod([x_shape, y_shape])
                sample_attributions = attributions[i].mean(2).reshape(feature_size, -1)
            else:
                feature_size = attributions[i].shape[0]
                sample_attributions = attributions[i]

            # multi-output attributions are scored one output at a time
            if len(attributions[i].shape) == 1 or self.data_type == "tabular":
                output_size = 1
            else:
                output_size = attributions[i].shape[-1]

            for k in range(output_size):
                if self.data_type == "image":
                    mask_shape = X[i].shape
                else:
                    mask_shape = feature_size

                mask = np.ones(mask_shape, dtype=bool) * (self.perturbation == "remove")
                masks = [mask.copy()]

                values = np.zeros(feature_size+1)
                # masked, data = self.masker(mask, X[i])
                masked = self.masker(mask, X[i])
                data = None
                curr_val = self.f(masked, data, k).mean(0)

                values[0] = curr_val

                if output_size != 1:
                    test_attributions = sample_attributions[:,k]
                else:
                    test_attributions = sample_attributions

                ordered_inds = self.sort_order_map(test_attributions)
                increment = max(1,int(feature_size*percent))
                for j in range(0, feature_size, increment):
                    oind_list = [ordered_inds[t] for t in range(j, min(feature_size, j+increment))]

                    for oind in oind_list:
                        if not ((self.sort_order == "positive" and test_attributions[oind] <= 0) or \
                                (self.sort_order == "negative" and test_attributions[oind] >= 0)):
                            if self.data_type == "image":
                                # map the flat feature index back to pixel coordinates
                                xoind, yoind = oind // attributions[i].shape[1], oind % attributions[i].shape[1]
                                mask[xoind][yoind] = self.perturbation == "keep"
                            else:
                                mask[oind] = self.perturbation == "keep"

                    masks.append(mask.copy())
                    # masked, data = self.masker(mask, X[i])
                    masked = self.masker(mask, X[i])
                    curr_val = self.f(masked, data, k).mean(0)

                    # every feature flipped in this step gets the same output value
                    for t in range(j, min(feature_size, j+increment)):
                        values[t+1] = curr_val

                svals.append(values)
                mask_vals.append(masks)

            if pbar is None and time.time() - start_time > 5:
                pbar = tqdm(total=len(X), disable=silent, leave=False)
                pbar.update(i+1)
            if pbar is not None:
                pbar.update(1)

        if pbar is not None:
            pbar.close()

        self.score_values.append(np.array(svals))

        if self.sort_order == "negative":
            curve_sign = -1
        else:
            curve_sign = 1

        self.labels.append(label)

        # resample all curves onto a common grid and average them
        xs = np.linspace(0, 1, 100)
        curves = np.zeros((len(self.score_values[-1]), len(xs)))
        for j in range(len(self.score_values[-1])):
            xp = np.linspace(0, 1, len(self.score_values[-1][j]))
            yp = self.score_values[-1][j]
            curves[j,:] = np.interp(xs, xp, yp)
        ys = curves.mean(0)

        if debug_mode:
            aucs = []
            for j in range(len(self.score_values[-1])):
                curve = curves[j,:]
                auc = sklearn.metrics.auc(np.linspace(0, 1, len(curve)), curve_sign*(curve-curve[0]))
                aucs.append(auc)
            return mask_vals, curves, aucs
        else:
            auc = sklearn.metrics.auc(np.linspace(0, 1, len(ys)), curve_sign*(ys-ys[0]))
            return xs, ys, auc

    def plot(self, xs, ys, auc):
        """ Plot a single benchmark curve with its AUC in the legend. """
        pl.plot(xs, ys, label="AUC %0.4f" % auc)
        pl.legend()
        xlabel = "Percent Unmasked" if self.perturbation == "keep" else "Percent Masked"
        pl.xlabel(xlabel)
        pl.ylabel("Model Output")
        pl.show()
lib/shap/benchmark/experiments.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import itertools
3
+ import os
4
+ import pickle
5
+ import random
6
+ import subprocess
7
+ import sys
8
+ import time
9
+ from multiprocessing import Pool
10
+
11
+ from .. import __version__, datasets
12
+ from . import metrics, models
13
+
14
+ try:
15
+ from queue import Queue
16
+ except ImportError:
17
+ from Queue import Queue
18
+ from threading import Lock, Thread
19
+
20
# Metric suites: which benchmark metrics are run for each task type.
# Entries commented out are known configurations that are currently disabled.
regression_metrics = [
    "local_accuracy",
    "consistency_guarantees",
    "keep_positive_mask",
    "keep_positive_resample",
    #"keep_positive_impute",
    "keep_negative_mask",
    "keep_negative_resample",
    #"keep_negative_impute",
    "keep_absolute_mask__r2",
    "keep_absolute_resample__r2",
    #"keep_absolute_impute__r2",
    "remove_positive_mask",
    "remove_positive_resample",
    #"remove_positive_impute",
    "remove_negative_mask",
    "remove_negative_resample",
    #"remove_negative_impute",
    "remove_absolute_mask__r2",
    "remove_absolute_resample__r2",
    #"remove_absolute_impute__r2"
    "runtime",
]

binary_classification_metrics = [
    "local_accuracy",
    "consistency_guarantees",
    "keep_positive_mask",
    "keep_positive_resample",
    #"keep_positive_impute",
    "keep_negative_mask",
    "keep_negative_resample",
    #"keep_negative_impute",
    "keep_absolute_mask__roc_auc",
    "keep_absolute_resample__roc_auc",
    #"keep_absolute_impute__roc_auc",
    "remove_positive_mask",
    "remove_positive_resample",
    #"remove_positive_impute",
    "remove_negative_mask",
    "remove_negative_resample",
    #"remove_negative_impute",
    "remove_absolute_mask__roc_auc",
    "remove_absolute_resample__roc_auc",
    #"remove_absolute_impute__roc_auc"
    "runtime",
]

# Human-agreement metrics over small synthetic AND/OR/XOR/SUM functions.
human_metrics = [
    "human_and_00",
    "human_and_01",
    "human_and_11",
    "human_or_00",
    "human_or_01",
    "human_or_11",
    "human_xor_00",
    "human_xor_01",
    "human_xor_11",
    "human_sum_00",
    "human_sum_01",
    "human_sum_11"
]

# Method suites: which explanation methods are applicable to each model family.
linear_regress_methods = [
    "linear_shap_corr",
    "linear_shap_ind",
    "coef",
    "random",
    "kernel_shap_1000_meanref",
    #"kernel_shap_100_meanref",
    #"sampling_shap_10000",
    "sampling_shap_1000",
    "lime_tabular_regression_1000"
    #"sampling_shap_100"
]

linear_classify_methods = [
    # NEED LIME
    "linear_shap_corr",
    "linear_shap_ind",
    "coef",
    "random",
    "kernel_shap_1000_meanref",
    #"kernel_shap_100_meanref",
    #"sampling_shap_10000",
    "sampling_shap_1000",
    #"lime_tabular_regression_1000"
    #"sampling_shap_100"
]

tree_regress_methods = [
    # NEED tree_shap_ind
    # NEED split_count?
    "tree_shap_tree_path_dependent",
    "tree_shap_independent_200",
    "saabas",
    "random",
    "tree_gain",
    "kernel_shap_1000_meanref",
    "mean_abs_tree_shap",
    #"kernel_shap_100_meanref",
    #"sampling_shap_10000",
    "sampling_shap_1000",
    "lime_tabular_regression_1000",
    "maple"
    #"sampling_shap_100"
]

rf_regress_methods = [ # methods that only support random forest models
    "tree_maple"
]

tree_classify_methods = [
    # NEED tree_shap_ind
    # NEED split_count?
    "tree_shap_tree_path_dependent",
    "tree_shap_independent_200",
    "saabas",
    "random",
    "tree_gain",
    "kernel_shap_1000_meanref",
    "mean_abs_tree_shap",
    #"kernel_shap_100_meanref",
    #"sampling_shap_10000",
    "sampling_shap_1000",
    "lime_tabular_classification_1000",
    "maple"
    #"sampling_shap_100"
]

deep_regress_methods = [
    "deep_shap",
    "expected_gradients",
    "random",
    "kernel_shap_1000_meanref",
    "sampling_shap_1000",
    #"lime_tabular_regression_1000"
]

deep_classify_methods = [
    "deep_shap",
    "expected_gradients",
    "random",
    "kernel_shap_1000_meanref",
    "sampling_shap_1000",
    #"lime_tabular_regression_1000"
]

# Full cross-product of experiments: each entry is [dataset, model, method, metric].
_experiments = []
_experiments += [["corrgroups60", "lasso", m, s] for s in regression_metrics for m in linear_regress_methods]
_experiments += [["corrgroups60", "ridge", m, s] for s in regression_metrics for m in linear_regress_methods]
_experiments += [["corrgroups60", "decision_tree", m, s] for s in regression_metrics for m in tree_regress_methods]
_experiments += [["corrgroups60", "random_forest", m, s] for s in regression_metrics for m in (tree_regress_methods + rf_regress_methods)]
_experiments += [["corrgroups60", "gbm", m, s] for s in regression_metrics for m in tree_regress_methods]
_experiments += [["corrgroups60", "ffnn", m, s] for s in regression_metrics for m in deep_regress_methods]

_experiments += [["independentlinear60", "lasso", m, s] for s in regression_metrics for m in linear_regress_methods]
_experiments += [["independentlinear60", "ridge", m, s] for s in regression_metrics for m in linear_regress_methods]
_experiments += [["independentlinear60", "decision_tree", m, s] for s in regression_metrics for m in tree_regress_methods]
_experiments += [["independentlinear60", "random_forest", m, s] for s in regression_metrics for m in (tree_regress_methods + rf_regress_methods)]
_experiments += [["independentlinear60", "gbm", m, s] for s in regression_metrics for m in tree_regress_methods]
_experiments += [["independentlinear60", "ffnn", m, s] for s in regression_metrics for m in deep_regress_methods]

_experiments += [["cric", "lasso", m, s] for s in binary_classification_metrics for m in linear_classify_methods]
_experiments += [["cric", "ridge", m, s] for s in binary_classification_metrics for m in linear_classify_methods]
_experiments += [["cric", "decision_tree", m, s] for s in binary_classification_metrics for m in tree_classify_methods]
_experiments += [["cric", "random_forest", m, s] for s in binary_classification_metrics for m in tree_classify_methods]
_experiments += [["cric", "gbm", m, s] for s in binary_classification_metrics for m in tree_classify_methods]
_experiments += [["cric", "ffnn", m, s] for s in binary_classification_metrics for m in deep_classify_methods]

_experiments += [["human", "decision_tree", m, s] for s in human_metrics for m in tree_regress_methods]
191
+
192
+
193
def experiments(dataset=None, model=None, method=None, metric=None):
    """ Iterate over the registered benchmark experiments, optionally filtered.

    Each yielded experiment is a [dataset, model, method, metric] list; any
    filter argument left as None matches all experiments on that field.
    """
    filters = (dataset, model, method, metric)
    for experiment in _experiments:
        if all(f is None or f == field for f, field in zip(filters, experiment)):
            yield experiment
204
+
205
def run_experiment(experiment, use_cache=True, cache_dir="/tmp"):
    """ Run a single benchmark experiment and cache the resulting score.

    Parameters
    ----------
    experiment : sequence of four strings
        (dataset_name, model_name, method_name, metric_name).

    use_cache : bool
        When True, return a previously pickled score if one exists.

    cache_dir : str
        Directory where pickled scores are stored.

    NOTE(review): results are loaded with pickle; this assumes the cache dir
    contains only files this code wrote — confirm it is never shared with
    untrusted writers.
    """
    dataset_name, model_name, method_name, metric_name = experiment

    # see if we have a cached version
    cache_id = __gen_cache_id(experiment)
    cache_file = os.path.join(cache_dir, cache_id + ".pickle")
    if use_cache and os.path.isfile(cache_file):
        with open(cache_file, "rb") as f:
            #print(cache_id.replace("__", " ") + " ...loaded from cache.")
            return pickle.load(f)

    # compute the scores; dataset/model/metric are looked up by name on their modules
    print(cache_id.replace("__", " ", 4) + " ...")
    sys.stdout.flush()
    start = time.time()
    X,y = getattr(datasets, dataset_name)()
    score = getattr(metrics, metric_name)(
        X, y,
        getattr(models, dataset_name+"__"+model_name),
        method_name
    )
    print("...took %f seconds.\n" % (time.time() - start))

    # cache the scores
    with open(cache_file, "wb") as f:
        pickle.dump(score, f)

    return score
233
+
234
+
235
def run_experiments_helper(args):
    """ Adapter for Pool.map: unpack one (experiment, cache_dir) pair. """
    experiment, cache_directory = args
    return run_experiment(experiment, cache_dir=cache_directory)
238
+
239
def run_experiments(dataset=None, model=None, method=None, metric=None, cache_dir="/tmp", nworkers=1):
    """ Run all matching experiments, optionally fanned out over worker processes.

    Returns a list of (experiment, score) pairs in the order the experiments
    were selected.
    """
    experiments_arr = list(experiments(dataset=dataset, model=model, method=method, metric=metric))
    tasks = zip(experiments_arr, itertools.repeat(cache_dir))
    if nworkers == 1:
        # run serially in-process
        out = [run_experiments_helper(task) for task in tasks]
    else:
        with Pool(nworkers) as pool:
            out = pool.map(run_experiments_helper, tasks)
    return list(zip(experiments_arr, out))
247
+
248
+
249
# Shared state for the remote ssh execution machinery below. All of these are
# read/written by the dispatch threads while holding `worker_lock`.
nexperiments = 0       # total number of experiments queued for this run
total_sent = 0         # how many experiments have been dispatched so far
total_done = 0         # how many experiments have finished
total_failed = 0       # how many remote commands failed
host_records = {}      # hostname -> timestamps of recent ssh connections
worker_lock = Lock()   # guards the counters and host_records above
ssh_conn_per_min_limit = 0 # set as an argument to run_remote_experiments
256
def __thread_worker(q, host):
    """ Dispatch-thread loop: pull experiments off `q` and run them on `host`.

    `host` has the form "hostname:path_to_python_binary". Runs forever; the
    threads are started as daemons so they die with the main process.
    """
    global total_sent, total_done
    hostname, python_binary = host.split(":")
    while True:

        # make sure we are not sending too many ssh connections to the host
        # (if we send too many connections ssh throttling will lock us out)
        while True:
            all_clear = False

            worker_lock.acquire()
            try:
                if hostname not in host_records:
                    host_records[hostname] = []

                # clear if we are under the per-minute budget, or the oldest
                # connection inside the budget window is more than a minute old
                if len(host_records[hostname]) < ssh_conn_per_min_limit:
                    all_clear = True
                elif time.time() - host_records[hostname][-ssh_conn_per_min_limit] > 61:
                    all_clear = True
            finally:
                worker_lock.release()

            # if we are clear to send a new ssh connection then break
            if all_clear:
                break

            # if we are not clear then we sleep and try again
            time.sleep(5)

        experiment = q.get()

        # if we are not loading from the cache then we note that we have called the host
        cache_dir = "/tmp"
        cache_file = os.path.join(cache_dir, __gen_cache_id(experiment) + ".pickle")
        if not os.path.isfile(cache_file):
            worker_lock.acquire()
            try:
                host_records[hostname].append(time.time())
            finally:
                worker_lock.release()

        # record how many we have sent off for execution
        worker_lock.acquire()
        try:
            total_sent += 1
            __print_status()
        finally:
            worker_lock.release()

        __run_remote_experiment(experiment, hostname, cache_dir=cache_dir, python_binary=python_binary)

        # record how many are finished
        worker_lock.acquire()
        try:
            total_done += 1
            __print_status()
        finally:
            worker_lock.release()

        q.task_done()
316
+
317
def __print_status():
    """ Overwrite the console status line with the current progress counters. """
    running = total_sent - total_done
    status = "Benchmark task %d of %d done (%d failed, %d running)" % (total_done, nexperiments, total_failed, running)
    print(status, end="\r")
    sys.stdout.flush()
320
+
321
+
322
def run_remote_experiments(experiments, thread_hosts, rate_limit=10):
    """ Use ssh to run the experiments on remote machines in parallel.

    Parameters
    ----------
    experiments : iterable
        Output of shap.benchmark.experiments(...).

    thread_hosts : list of strings
        Each host has the format "host_name:path_to_python_binary" and can appear multiple times
        in the list (one for each parallel execution you want on that machine).

    rate_limit : int
        How many ssh connections we make per minute to each host (to avoid throttling issues).
    """

    global ssh_conn_per_min_limit
    ssh_conn_per_min_limit = rate_limit

    # first we kill any remaining workers from previous runs
    # note we don't check_call because pkill kills our ssh call as well
    thread_hosts = copy.copy(thread_hosts)
    random.shuffle(thread_hosts)
    for host in set(thread_hosts):
        hostname,_ = host.split(":")
        try:
            subprocess.run(["ssh", hostname, "pkill -f shap.benchmark.run_experiment"], timeout=15)
        except subprocess.TimeoutExpired:
            print("Failed to connect to", hostname, "after 15 seconds! Exiting.")
            return

    experiments = copy.copy(list(experiments))
    random.shuffle(experiments) # this way all the hard experiments don't get put on one machine
    global nexperiments, total_sent, total_done, total_failed, host_records
    nexperiments = len(experiments)
    total_sent = 0
    total_done = 0
    total_failed = 0
    host_records = {}

    q = Queue()

    for host in thread_hosts:
        worker = Thread(target=__thread_worker, args=(q, host))
        # Thread.setDaemon() is deprecated since Python 3.10; assign the
        # daemon attribute directly instead
        worker.daemon = True
        worker.start()

    for experiment in experiments:
        q.put(experiment)

    q.join()
373
+
374
def __run_remote_experiment(experiment, remote, cache_dir="/tmp", python_binary="python"):
    """ Execute one experiment on `remote` over ssh and copy back the pickled score.

    Returns the unpickled score, or None if the remote command failed.
    NOTE(review): results are loaded with pickle from files produced by our own
    remote command — confirm the cache directory is not shared with untrusted
    writers.
    """
    global total_failed
    dataset_name, model_name, method_name, metric_name = experiment

    # see if we have a cached version
    cache_id = __gen_cache_id(experiment)
    cache_file = os.path.join(cache_dir, cache_id + ".pickle")
    if os.path.isfile(cache_file):
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    # this is just so we don't dump everything at once on a machine
    time.sleep(random.uniform(0,5))

    # run the benchmark on the remote machine, disabling any GPUs and
    # redirecting output to a per-experiment log file in cache_dir
    #start = time.time()
    cmd = "CUDA_VISIBLE_DEVICES=\"\" "+python_binary+" -c \"import shap; shap.benchmark.run_experiment(['{}', '{}', '{}', '{}'], cache_dir='{}')\" &> {}/{}.output".format(
        dataset_name, model_name, method_name, metric_name, cache_dir, cache_dir, cache_id
    )
    try:
        subprocess.check_output(["ssh", remote, cmd])
    except subprocess.CalledProcessError as e:
        print("The following command failed on %s:" % remote, file=sys.stderr)
        print(cmd, file=sys.stderr)
        total_failed += 1
        print(e)
        return

    # copy the results back
    subprocess.check_output(["scp", remote+":"+cache_file, cache_file])

    if os.path.isfile(cache_file):
        with open(cache_file, "rb") as f:
            #print(cache_id.replace("__", " ") + " ...loaded from remote after %f seconds" % (time.time() - start))
            return pickle.load(f)
    else:
        raise FileNotFoundError("Remote benchmark call finished but no local file was found!")
411
+
412
def __gen_cache_id(experiment):
    """ Build the cache key: "v<version>__<dataset>__<model>__<method>__<metric>". """
    dataset_name, model_name, method_name, metric_name = experiment
    return f"v{__version__}__{dataset_name}__{model_name}__{method_name}__{metric_name}"
lib/shap/benchmark/framework.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools as it
2
+
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ from . import perturbation
8
+
9
+
10
def update(model, attributions, X, y, masker, sort_order, perturbation_method, scores):
    """ Run one SequentialPerturbation configuration and record its curve in `scores`. """
    metric = f"{perturbation_method} {sort_order}"
    runner = perturbation.SequentialPerturbation(model, masker, sort_order, perturbation_method)
    xs, ys, auc = runner.model_score(attributions, X, y=y)
    scores['metrics'].append(metric)
    scores['values'][metric] = [xs, ys, auc]
16
+
17
def get_benchmark(model, attributions, X, y, masker, metrics):
    """ Evaluate every (sort_order, perturbation) combination and collect the scores.

    Returns a dict with a 'metrics' list (insertion order) and a 'values'
    mapping from metric name to [xs, ys, auc].
    """
    # unwrap pandas containers into raw numpy arrays
    if isinstance(X, (pd.Series, pd.DataFrame)):
        X = X.values
    if isinstance(masker, (pd.Series, pd.DataFrame)):
        masker = masker.values

    # record scores per metric, iterating the same order as itertools.product
    scores = {'metrics': [], 'values': {}}
    for sort_order in metrics['sort_order']:
        for perturbation_method in metrics['perturbation']:
            update(model, attributions, X, y, masker, sort_order, perturbation_method, scores)

    return scores
30
+
31
def get_metrics(benchmarks, selection):
    """ Combine the metric names recorded for each explainer into one list.

    `selection` is a binary set operation (e.g. union or intersection) applied
    pairwise across explainers; the first non-empty metric set seeds the result.
    """
    combined = set()
    for scores in benchmarks.values():
        current = set(scores['metrics'])
        combined = current if not combined else selection(combined, current)
    return list(combined)
42
+
43
def trend_plot(benchmarks):
    """ Plot each explainer's performance curve, one figure per metric.

    Parameters
    ----------
    benchmarks : dict
        Maps explainer name -> scores dict as produced by get_benchmark.
    """
    explainer_metrics = get_metrics(benchmarks, lambda x, y: x.union(y))

    # plot all curves if metric exists
    for metric in explainer_metrics:
        plt.clf()

        for explainer in benchmarks:
            scores = benchmarks[explainer]
            if metric in scores['values']:
                x, y, auc = scores['values'][metric]
                plt.plot(x, y, label=f'{round(auc, 3)} - {explainer}')

        # default first so xlabel is always bound (the original raised
        # UnboundLocalError for metrics mentioning neither keep nor remove);
        # the original keep-then-remove precedence is preserved
        xlabel = 'Percent Masked'
        if 'keep' in metric:
            xlabel = 'Percent Unmasked'
        if 'remove' in metric:
            xlabel = 'Percent Masked'

        plt.ylabel('Model Output')
        plt.xlabel(xlabel)
        plt.title(metric)
        plt.legend()
        plt.show()
66
+
67
def compare_plot(benchmarks):
    """ Plot the per-metric normalized performance of every explainer on one chart.

    Each explainer becomes one polyline; the first point is a dummy used only to
    spread the explainer labels evenly on the y axis, and each subsequent point
    is that explainer's min-max normalized AUC for one metric.

    NOTE(review): several divisions assume num_explainers >= 2 and that the
    AUCs for each metric are not all equal; otherwise this raises
    ZeroDivisionError — confirm callers guarantee that.
    """
    explainer_metrics = get_metrics(benchmarks, lambda x, y: x.intersection(y))
    explainers = list(benchmarks.keys())
    num_explainers = len(explainers)
    num_metrics = len(explainer_metrics)

    # dummy start to evenly distribute explainers on the left
    # can later be replaced by boolean metrics
    aucs = dict()
    for i in range(num_explainers):
        explainer = explainers[i]
        aucs[explainer] = [i/(num_explainers-1)]

    # normalize per metric: min-max scale each metric's AUCs across explainers
    for metric in explainer_metrics:
        max_auc, min_auc = -float('inf'), float('inf')

        for explainer in explainers:
            scores = benchmarks[explainer]
            _, _, auc = scores['values'][metric]
            min_auc = min(auc, min_auc)
            max_auc = max(auc, max_auc)

        for explainer in explainers:
            scores = benchmarks[explainer]
            _, _, auc = scores['values'][metric]
            aucs[explainer].append((auc-min_auc)/(max_auc-min_auc))

    # plot common curves
    ax = plt.gca()
    for explainer in explainers:
        plt.plot(np.linspace(0, 1, len(explainer_metrics)+1), aucs[explainer], '--o')

    ax.tick_params(which='major', axis='both', labelsize=8)

    # y ticks label the explainers at their dummy starting heights
    ax.set_yticks([i/(num_explainers-1) for i in range(0, num_explainers)])
    ax.set_yticklabels(explainers, rotation=0)

    # x ticks label the metrics (first slot is the dummy start column)
    ax.set_xticks(np.linspace(0, 1, num_metrics+1))
    ax.set_xticklabels([' '] + explainer_metrics, rotation=45, ha='right')

    plt.grid(which='major', axis='x', linestyle='--')
    plt.tight_layout()
    plt.ylabel('Relative Performance of Each Explanation Method')
    plt.xlabel('Evaluation Metrics')
    plt.title('Explanation Method Performance Across Metrics')
    plt.show()
lib/shap/benchmark/measures.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import sklearn.utils
6
+ from tqdm.auto import tqdm
7
+
8
# module-level cache so repeated calls with the same data/attributions can
# reuse per-sample retraining results (see the cache_match logic below)
_remove_cache = {}
def remove_retrain(nmask, X_train, y_train, X_test, y_test, attr_test, model_generator, metric, trained_model, random_state):
    """ The model is retrained for each test sample with the important features set to a constant.

    If you want to know how important a set of features is you can ask how the model would be
    different if those features had never existed. To determine this we can mask those features
    across the entire training and test datasets, then retrain the model. If we compare the
    output of this retrained model to the original model we can see the effect produced by knowing
    the features we masked. Since for individualized explanation methods each test sample has a
    different set of most important features we need to retrain the model for every test sample
    to get the change in model performance when a specified fraction of the most important features
    are withheld.
    """

    warnings.warn("The retrain based measures can incorrectly evaluate models in some cases!")

    # see if we match the last cached call; identity comparison on the args plus
    # equality on the attributions decides whether cached predictions are reusable
    global _remove_cache
    args = (X_train, y_train, X_test, y_test, model_generator, metric)
    cache_match = False
    if "args" in _remove_cache:
        if all(a is b for a,b in zip(_remove_cache["args"], args)) and np.all(_remove_cache["attr_test"] == attr_test):
            cache_match = True

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # this is the model we will retrain many times
    model_masked = model_generator()

    # mask nmask top features and re-train the model for each test explanation
    X_train_tmp = np.zeros(X_train.shape)
    X_test_tmp = np.zeros(X_test.shape)
    yp_masked_test = np.zeros(y_test.shape)
    # tiny fixed noise deterministically breaks ties between equal attribution values
    # NOTE(review): random_state is not forwarded to const_rand here, unlike
    # remove_mask/remove_impute — confirm intended
    tie_breaking_noise = const_rand(X_train.shape[1]) * 1e-6
    last_nmask = _remove_cache.get("nmask", None)
    last_yp_masked_test = _remove_cache.get("yp_masked_test", None)
    for i in tqdm(range(len(y_test)), "Retraining for the 'remove' metric"):
        if cache_match and last_nmask[i] == nmask[i]:
            # same mask count as the cached run — reuse the cached prediction
            yp_masked_test[i] = last_yp_masked_test[i]
        elif nmask[i] == 0:
            # nothing masked, the original trained model applies unchanged
            yp_masked_test[i] = trained_model.predict(X_test[i:i+1])[0]
        else:
            # mask out the most important features for this test instance
            X_train_tmp[:] = X_train
            X_test_tmp[:] = X_test
            ordering = np.argsort(-attr_test[i,:] + tie_breaking_noise)
            # NOTE(review): .mean() is a single scalar over the whole selected
            # submatrix, while remove_mask uses per-column means — confirm intended
            X_train_tmp[:,ordering[:nmask[i]]] = X_train[:,ordering[:nmask[i]]].mean()
            X_test_tmp[i,ordering[:nmask[i]]] = X_train[:,ordering[:nmask[i]]].mean()

            # retrain the model and make a prediction
            model_masked.fit(X_train_tmp, y_train)
            yp_masked_test[i] = model_masked.predict(X_test_tmp[i:i+1])[0]

    # save our results so the next call to us can be faster when there is redundancy
    _remove_cache["nmask"] = nmask
    _remove_cache["yp_masked_test"] = yp_masked_test
    _remove_cache["attr_test"] = attr_test
    _remove_cache["args"] = args

    return metric(y_test, yp_masked_test)
71
+
72
def remove_mask(nmask, X_train, y_train, X_test, y_test, attr_test, model_generator, metric, trained_model, random_state):
    """ Each test sample is masked by setting the important features to a constant.

    For row i, the nmask[i] features with the highest attribution are replaced
    with their training-set means before re-scoring the already-trained model.
    """

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # tiny fixed noise deterministically breaks ties between equal attribution values
    noise = const_rand(X_train.shape[1], random_state) * 1e-6
    feature_means = X_train.mean(0)

    # mask the top-attributed features of each row with the training mean
    masked = X_test.copy()
    for i in range(len(y_test)):
        count = nmask[i]
        if count > 0:
            order = np.argsort(-attr_test[i,:] + noise)
            top = order[:count]
            masked[i, top] = feature_means[top]

    return metric(y_test, trained_model.predict(masked))
93
+
94
def remove_impute(nmask, X_train, y_train, X_test, y_test, attr_test, model_generator, metric, trained_model, random_state):
    """ The model is reevaluated for each test sample with the important features set to an imputed value.

    Note that the imputation is done using a multivariate normality assumption on the dataset. This depends on
    being able to estimate the full data covariance matrix (and inverse) accurately. So X_train.shape[0] should
    be significantly bigger than X_train.shape[1].
    """

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # covariance of the training features, with a small ridge added so the
    # observed-block submatrix inverted below stays well conditioned
    C = np.cov(X_train.T)
    C += np.eye(C.shape[0]) * 1e-6
    X_test_tmp = X_test.copy()
    yp_masked_test = np.zeros(y_test.shape)  # NOTE(review): dead store, overwritten by predict() below
    # tiny fixed noise deterministically breaks ties between equal attribution values
    tie_breaking_noise = const_rand(X_train.shape[1], random_state) * 1e-6
    mean_vals = X_train.mean(0)
    for i in range(len(y_test)):
        if nmask[i] > 0:
            # the nmask[i] highest-attribution features are imputed; the rest stay observed
            ordering = np.argsort(-attr_test[i,:] + tie_breaking_noise)
            observe_inds = ordering[nmask[i]:]
            impute_inds = ordering[:nmask[i]]

            # impute missing data assuming it follows a multivariate normal
            # distribution: conditional mean mu_i + C_io C_oo^-1 (x_o - mu_o)
            Coo_inv = np.linalg.inv(C[observe_inds,:][:,observe_inds])
            Cio = C[impute_inds,:][:,observe_inds]
            impute = mean_vals[impute_inds] + Cio @ Coo_inv @ (X_test[i, observe_inds] - mean_vals[observe_inds])

            X_test_tmp[i, impute_inds] = impute

    yp_masked_test = trained_model.predict(X_test_tmp)

    return metric(y_test, yp_masked_test)
130
+
131
def remove_resample(nmask, X_train, y_train, X_test, y_test, attr_test, model_generator, metric, trained_model, random_state):
    """ The model is reevaluated for each test sample with the important features set to resample background values.
    """

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # how many background samples to draw per test row
    nsamples = 100

    # each test row is repeated nsamples times so the masked features can be
    # filled with nsamples different background draws from the training data
    N,M = X_test.shape
    X_test_tmp = np.tile(X_test, [1, nsamples]).reshape(nsamples * N, M)
    # NOTE(review): random_state is not forwarded to const_rand here while
    # other measures in this file pass it — confirm intended
    tie_breaking_noise = const_rand(M) * 1e-6
    inds = sklearn.utils.resample(np.arange(N), n_samples=nsamples, random_state=random_state)
    for i in range(N):
        if nmask[i] > 0:
            ordering = np.argsort(-attr_test[i,:] + tie_breaking_noise)
            X_test_tmp[i*nsamples:(i+1)*nsamples, ordering[:nmask[i]]] = X_train[inds, :][:, ordering[:nmask[i]]]

    yp_masked_test = trained_model.predict(X_test_tmp)
    yp_masked_test = np.reshape(yp_masked_test, (N, nsamples)).mean(1) # take the mean output over all samples

    return metric(y_test, yp_masked_test)
157
+
158
def batch_remove_retrain(nmask_train, nmask_test, X_train, y_train, X_test, y_test, attr_train, attr_test, model_generator, metric):
    """ An approximation of holdout that only retrains the model once.

    This is also called ROAR (RemOve And Retrain) in work by Google. It is much more computationally
    efficient than the holdout method because it masks the most important features in every sample
    and then retrains the model once, instead of retraining the model for every test sample like
    the holdout metric.
    """

    warnings.warn("The retrain based measures can incorrectly evaluate models in some cases!")

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # mask nmask top features for each explanation (train and test masked alike)
    X_train_tmp = X_train.copy()
    X_train_mean = X_train.mean(0)
    # tiny fixed noise deterministically breaks ties between equal attribution values
    tie_breaking_noise = const_rand(X_train.shape[1]) * 1e-6
    for i in range(len(y_train)):
        if nmask_train[i] > 0:
            ordering = np.argsort(-attr_train[i, :] + tie_breaking_noise)
            X_train_tmp[i, ordering[:nmask_train[i]]] = X_train_mean[ordering[:nmask_train[i]]]
    X_test_tmp = X_test.copy()
    for i in range(len(y_test)):
        if nmask_test[i] > 0:
            ordering = np.argsort(-attr_test[i, :] + tie_breaking_noise)
            X_test_tmp[i, ordering[:nmask_test[i]]] = X_train_mean[ordering[:nmask_test[i]]]

    # train the model with all the given features masked
    model_masked = model_generator()
    model_masked.fit(X_train_tmp, y_train)
    yp_test_masked = model_masked.predict(X_test_tmp)

    return metric(y_test, yp_test_masked)
194
+
195
# module-level cache mirroring _remove_cache, used by keep_retrain below
_keep_cache = {}
def keep_retrain(nkeep, X_train, y_train, X_test, y_test, attr_test, model_generator, metric, trained_model, random_state):
    """ The model is retrained for each test sample with the non-important features set to a constant.

    If you want to know how important a set of features is you can ask how the model would be
    different if only those features had existed. To determine this we can mask the other features
    across the entire training and test datasets, then retrain the model. If we compare the
    output of this retrained model to the original model we can see the effect produced by only
    knowing the important features. Since for individualized explanation methods each test sample
    has a different set of most important features we need to retrain the model for every test sample
    to get the change in model performance when a specified fraction of the most important features
    are retained.
    """

    warnings.warn("The retrain based measures can incorrectly evaluate models in some cases!")

    # see if we match the last cached call; identity on the args plus equality
    # on the attributions decides whether cached predictions are reusable
    global _keep_cache
    args = (X_train, y_train, X_test, y_test, model_generator, metric)
    cache_match = False
    if "args" in _keep_cache:
        if all(a is b for a,b in zip(_keep_cache["args"], args)) and np.all(_keep_cache["attr_test"] == attr_test):
            cache_match = True

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # this is the model we will retrain many times
    model_masked = model_generator()

    # keep nkeep top features and re-train the model for each test explanation
    X_train_tmp = np.zeros(X_train.shape)
    X_test_tmp = np.zeros(X_test.shape)
    yp_masked_test = np.zeros(y_test.shape)
    # tiny fixed noise deterministically breaks ties between equal attribution values
    # NOTE(review): random_state is not forwarded to const_rand here — confirm intended
    tie_breaking_noise = const_rand(X_train.shape[1]) * 1e-6
    last_nkeep = _keep_cache.get("nkeep", None)
    last_yp_masked_test = _keep_cache.get("yp_masked_test", None)
    for i in tqdm(range(len(y_test)), "Retraining for the 'keep' metric"):
        if cache_match and last_nkeep[i] == nkeep[i]:
            # same keep count as the cached run — reuse the cached prediction
            yp_masked_test[i] = last_yp_masked_test[i]
        elif nkeep[i] == attr_test.shape[1]:
            # everything kept, the original trained model applies unchanged
            yp_masked_test[i] = trained_model.predict(X_test[i:i+1])[0]
        else:

            # mask out all but the most important features for this test instance
            X_train_tmp[:] = X_train
            X_test_tmp[:] = X_test
            ordering = np.argsort(-attr_test[i,:] + tie_breaking_noise)
            # NOTE(review): .mean() is a scalar over the whole selected submatrix,
            # unlike the per-column means used by keep_mask — confirm intended
            X_train_tmp[:,ordering[nkeep[i]:]] = X_train[:,ordering[nkeep[i]:]].mean()
            X_test_tmp[i,ordering[nkeep[i]:]] = X_train[:,ordering[nkeep[i]:]].mean()

            # retrain the model and make a prediction
            model_masked.fit(X_train_tmp, y_train)
            yp_masked_test[i] = model_masked.predict(X_test_tmp[i:i+1])[0]

    # save our results so the next call to us can be faster when there is redundancy
    _keep_cache["nkeep"] = nkeep
    _keep_cache["yp_masked_test"] = yp_masked_test
    _keep_cache["attr_test"] = attr_test
    _keep_cache["args"] = args

    return metric(y_test, yp_masked_test)
259
+
260
def keep_mask(nkeep, X_train, y_train, X_test, y_test, attr_test, model_generator, metric, trained_model, random_state):
    """ The model is reevaluated for each test sample with the non-important features set to their mean.

    For row i, the nkeep[i] features with the highest attribution keep their
    observed values and every other feature is replaced with its training-set
    mean; the already-trained model is then re-scored on the masked data.
    """

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # keep nkeep top features for each test explanation
    X_test_tmp = X_test.copy()
    # tiny fixed noise deterministically breaks ties between equal attribution values
    tie_breaking_noise = const_rand(X_train.shape[1], random_state) * 1e-6
    mean_vals = X_train.mean(0)
    for i in range(len(y_test)):
        if nkeep[i] < X_test.shape[1]:
            ordering = np.argsort(-attr_test[i,:] + tie_breaking_noise)
            X_test_tmp[i,ordering[nkeep[i]:]] = mean_vals[ordering[nkeep[i]:]]

    # (the original code pre-allocated yp_masked_test with np.zeros and then
    # immediately overwrote it here; the dead assignment has been removed)
    yp_masked_test = trained_model.predict(X_test_tmp)

    return metric(y_test, yp_masked_test)
282
+
283
def keep_impute(nkeep, X_train, y_train, X_test, y_test, attr_test, model_generator, metric, trained_model, random_state):
    """ The model is reevaluated for each test sample with the non-important features set to an imputed value.

    Note that the imputation is done using a multivariate normality assumption on the dataset. This depends on
    being able to estimate the full data covariance matrix (and inverse) accurately. So X_train.shape[0] should
    be significantly bigger than X_train.shape[1].
    """

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # covariance of the training features, with a small ridge added so the
    # observed-block submatrix inverted below stays well conditioned
    C = np.cov(X_train.T)
    C += np.eye(C.shape[0]) * 1e-6
    X_test_tmp = X_test.copy()
    yp_masked_test = np.zeros(y_test.shape)  # NOTE(review): dead store, overwritten by predict() below
    # tiny fixed noise deterministically breaks ties between equal attribution values
    tie_breaking_noise = const_rand(X_train.shape[1], random_state) * 1e-6
    mean_vals = X_train.mean(0)
    for i in range(len(y_test)):
        if nkeep[i] < X_test.shape[1]:
            # the nkeep[i] highest-attribution features stay observed; the rest are imputed
            ordering = np.argsort(-attr_test[i,:] + tie_breaking_noise)
            observe_inds = ordering[:nkeep[i]]
            impute_inds = ordering[nkeep[i]:]

            # impute missing data assuming it follows a multivariate normal
            # distribution: conditional mean mu_i + C_io C_oo^-1 (x_o - mu_o)
            Coo_inv = np.linalg.inv(C[observe_inds,:][:,observe_inds])
            Cio = C[impute_inds,:][:,observe_inds]
            impute = mean_vals[impute_inds] + Cio @ Coo_inv @ (X_test[i, observe_inds] - mean_vals[observe_inds])

            X_test_tmp[i, impute_inds] = impute

    yp_masked_test = trained_model.predict(X_test_tmp)

    return metric(y_test, yp_masked_test)
319
+
320
def keep_resample(nkeep, X_train, y_train, X_test, y_test, attr_test, model_generator, metric, trained_model, random_state):
    """ The model is reevaluated for each test sample with the non-important features set to resample background values.
    """
    # NOTE(review): the original author left the comment "why broken? overwriting?"
    # here — the concern is unresolved; verify the resampled assignment below

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # how many background samples to draw per test row
    nsamples = 100

    # each test row is repeated nsamples times so the non-kept features can be
    # filled with nsamples different background draws from the training data
    N,M = X_test.shape
    X_test_tmp = np.tile(X_test, [1, nsamples]).reshape(nsamples * N, M)
    # NOTE(review): random_state is not forwarded to const_rand here — confirm intended
    tie_breaking_noise = const_rand(M) * 1e-6
    inds = sklearn.utils.resample(np.arange(N), n_samples=nsamples, random_state=random_state)
    for i in range(N):
        if nkeep[i] < M:
            ordering = np.argsort(-attr_test[i,:] + tie_breaking_noise)
            X_test_tmp[i*nsamples:(i+1)*nsamples, ordering[nkeep[i]:]] = X_train[inds, :][:, ordering[nkeep[i]:]]

    yp_masked_test = trained_model.predict(X_test_tmp)
    yp_masked_test = np.reshape(yp_masked_test, (N, nsamples)).mean(1) # take the mean output over all samples

    return metric(y_test, yp_masked_test)
346
+
347
def batch_keep_retrain(nkeep_train, nkeep_test, X_train, y_train, X_test, y_test, attr_train, attr_test, model_generator, metric):
    """ An approximation of keep that only retrains the model once.

    This is also called KAR (Keep And Retrain) in work by Google. It is much more computationally
    efficient than the keep method because it masks the unimportant features in every sample
    and then retrains the model once, instead of retraining the model for every test sample like
    the keep metric.
    """

    warnings.warn("The retrain based measures can incorrectly evaluate models in some cases!")

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # keep only the nkeep top features for each explanation (train and test alike)
    X_train_tmp = X_train.copy()
    X_train_mean = X_train.mean(0)
    # tiny fixed noise deterministically breaks ties between equal attribution values
    tie_breaking_noise = const_rand(X_train.shape[1]) * 1e-6
    for i in range(len(y_train)):
        if nkeep_train[i] < X_train.shape[1]:
            ordering = np.argsort(-attr_train[i, :] + tie_breaking_noise)
            X_train_tmp[i, ordering[nkeep_train[i]:]] = X_train_mean[ordering[nkeep_train[i]:]]
    X_test_tmp = X_test.copy()
    for i in range(len(y_test)):
        if nkeep_test[i] < X_test.shape[1]:
            ordering = np.argsort(-attr_test[i, :] + tie_breaking_noise)
            X_test_tmp[i, ordering[nkeep_test[i]:]] = X_train_mean[ordering[nkeep_test[i]:]]

    # train the model with all the features not given masked
    model_masked = model_generator()
    model_masked.fit(X_train_tmp, y_train)
    yp_test_masked = model_masked.predict(X_test_tmp)

    return metric(y_test, yp_test_masked)
383
+
384
def local_accuracy(X_train, y_train, X_test, y_test, attr_test, model_generator, metric, trained_model):
    """ How well the features plus a constant base rate sum up to the model output.

    Compares the trained model's predictions against the per-sample sum of the
    attribution values using the supplied metric.
    """

    X_train, X_test = to_array(X_train, X_test)

    # train and test must share the same feature space
    assert X_train.shape[1] == X_test.shape[1]

    # compare raw model output with the summed attributions
    predictions = trained_model.predict(X_test)
    attribution_totals = strip_list(attr_test).sum(1)
    return metric(predictions, attribution_totals)
397
+
398
def to_array(*args):
    """ Return *args* as a list, converting any pandas DataFrames to raw numpy arrays. """
    converted = []
    for arg in args:
        if isinstance(arg, pd.DataFrame):
            converted.append(arg.values)
        else:
            converted.append(arg)
    return converted
400
+
401
def const_rand(size, seed=23980):
    """ Generate a random array with a fixed seed.

    Parameters
    ----------
    size : int
        Length of the returned 1-D array of uniform [0, 1) draws.
    seed : int
        Seed for the draw; the same (size, seed) pair always returns the same values.

    Returns
    -------
    numpy.ndarray
    """
    # Use an isolated legacy RandomState rather than seeding the global RNG.
    # The previous code did ``old_seed = np.random.seed()`` — but np.random.seed()
    # returns None, so ``np.random.seed(old_seed)`` reseeded the global state from
    # OS entropy instead of restoring it. RandomState(seed).rand(size) produces
    # the exact same values as np.random.seed(seed); np.random.rand(size) while
    # leaving the global RNG state untouched.
    return np.random.RandomState(seed).rand(size)
409
+
410
def const_shuffle(arr, seed=23980):
    """ Shuffle an array in-place with a fixed seed.

    Parameters
    ----------
    arr : array-like
        Mutable sequence shuffled in place.
    seed : int
        Seed for the shuffle; the same seed always yields the same permutation.
    """
    # Use an isolated legacy RandomState rather than seeding the global RNG.
    # The previous code did ``old_seed = np.random.seed()`` — but np.random.seed()
    # returns None, so the global state was never actually restored. The
    # RandomState-based shuffle produces the exact same permutation as
    # np.random.seed(seed); np.random.shuffle(arr) without touching global state.
    np.random.RandomState(seed).shuffle(arr)
417
+
418
def strip_list(attrs):
    """ Unwrap per-class attribution lists.

    This assumes that if you have a list of outputs you just want the second
    one (the second class is the '1' class); anything else passes through.
    """
    return attrs[1] if isinstance(attrs, list) else attrs
lib/shap/benchmark/methods.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import sklearn
3
+
4
+ from .. import (
5
+ DeepExplainer,
6
+ GradientExplainer,
7
+ KernelExplainer,
8
+ LinearExplainer,
9
+ SamplingExplainer,
10
+ TreeExplainer,
11
+ kmeans,
12
+ )
13
+ from ..explainers import other
14
+ from .models import KerasWrap
15
+
16
+
17
def linear_shap_corr(model, data):
    """ Linear SHAP (corr 1000)
    """
    # returns the explainer's bound shap_values method as the attribution function
    return LinearExplainer(model, data, feature_dependence="correlation", nsamples=1000).shap_values
21
+
22
def linear_shap_ind(model, data):
    """ Linear SHAP (ind)
    """
    # returns the explainer's bound shap_values method as the attribution function
    return LinearExplainer(model, data, feature_dependence="independent").shap_values
26
+
27
def coef(model, data):
    """ Coefficients
    """
    # uses the model's raw coefficients as attributions; data is unused
    return other.CoefficentExplainer(model).attributions
31
+
32
def random(model, data):
    """ Random
    color = #777777
    linestyle = solid
    """
    # baseline explainer with random attributions; the docstring key = value
    # lines are plot metadata (presumably parsed by the benchmark framework)
    return other.RandomExplainer().attributions
38
+
39
def kernel_shap_1000_meanref(model, data):
    """ Kernel SHAP 1000 mean ref.
    color = red_blue_circle(0.5)
    linestyle = solid
    """
    # a single kmeans centroid of data serves as the background reference;
    # l1_reg=0 disables the regularized feature selection
    return lambda X: KernelExplainer(model.predict, kmeans(data, 1)).shap_values(X, nsamples=1000, l1_reg=0)
45
+
46
def sampling_shap_1000(model, data):
    """ IME 1000
    color = red_blue_circle(0.5)
    linestyle = dashed
    """
    # sampling-based SHAP estimate with a 1000-sample budget
    return lambda X: SamplingExplainer(model.predict, data).shap_values(X, nsamples=1000)
52
+
53
def tree_shap_tree_path_dependent(model, data):
    """ TreeExplainer
    color = red_blue_circle(0)
    linestyle = solid
    """
    # path-dependent TreeSHAP needs no background data; data is unused
    return TreeExplainer(model, feature_dependence="tree_path_dependent").shap_values
59
+
60
def tree_shap_independent_200(model, data):
    """ TreeExplainer (independent)
    color = red_blue_circle(0)
    linestyle = dashed
    """
    # a fixed-seed subsample of at most 200 rows serves as the background set
    data_subsample = sklearn.utils.resample(data, replace=False, n_samples=min(200, data.shape[0]), random_state=0)
    return TreeExplainer(model, data_subsample, feature_dependence="independent").shap_values
67
+
68
def mean_abs_tree_shap(model, data):
    """ mean(|TreeExplainer|)
    color = red_blue_circle(0.25)
    linestyle = solid
    """
    def f(X):
        # global importance: the mean |SHAP| over X, tiled so every row gets
        # the same attribution vector (multi-output models return a list)
        v = TreeExplainer(model).shap_values(X)
        if isinstance(v, list):
            return [np.tile(np.abs(sv).mean(0), (X.shape[0], 1)) for sv in v]
        else:
            return np.tile(np.abs(v).mean(0), (X.shape[0], 1))
    return f
80
+
81
def saabas(model, data):
    """ Saabas
    color = red_blue_circle(0)
    linestyle = dotted
    """
    # approximate=True selects the Saabas-style tree attribution
    return lambda X: TreeExplainer(model).shap_values(X, approximate=True)
87
+
88
def tree_gain(model, data):
    """ Gain/Gini Importance
    color = red_blue_circle(0.25)
    linestyle = dotted
    """
    # classic split-gain importance from the tree model; data is unused
    return other.TreeGainExplainer(model).attributions
94
+
95
def lime_tabular_regression_1000(model, data):
    """ LIME Tabular 1000
    color = red_blue_circle(0.75)
    """
    # LIME in regression mode with a 1000-sample perturbation budget
    return lambda X: other.LimeTabularExplainer(model.predict, data, mode="regression").attributions(X, nsamples=1000)
100
+
101
def lime_tabular_classification_1000(model, data):
    """ LIME Tabular 1000
    color = red_blue_circle(0.75)
    """
    # classification mode on predict_proba; [1] keeps the positive-class attributions
    return lambda X: other.LimeTabularExplainer(model.predict_proba, data, mode="classification").attributions(X, nsamples=1000)[1]
106
+
107
def maple(model, data):
    """ MAPLE
    color = red_blue_circle(0.6)
    """
    # MAPLE local-linear attributions on the model's predict function
    return lambda X: other.MapleExplainer(model.predict, data).attributions(X, multiply_by_input=False)
112
+
113
def tree_maple(model, data):
    """ Tree MAPLE
    color = red_blue_circle(0.6)
    linestyle = dashed
    """
    # tree-specific MAPLE variant working directly on the tree model
    return lambda X: other.TreeMapleExplainer(model, data).attributions(X, multiply_by_input=False)
119
+
120
def deep_shap(model, data):
    """ Deep SHAP (DeepLIFT)
    """
    # unwrap the benchmark's Keras wrapper to expose the underlying model
    if isinstance(model, KerasWrap):
        model = model.model
    # a single kmeans centroid of the data serves as the background reference
    explainer = DeepExplainer(model, kmeans(data, 1).data)
    def f(X):
        phi = explainer.shap_values(X)
        # single-output models come back as a one-element list; unwrap it
        if isinstance(phi, list) and len(phi) == 1:
            return phi[0]
        else:
            return phi

    return f
134
+
135
def expected_gradients(model, data):
    """ Expected Gradients
    """
    # unwrap the benchmark's Keras wrapper to expose the underlying model
    if isinstance(model, KerasWrap):
        model = model.model
    explainer = GradientExplainer(model, data)
    def f(X):
        phi = explainer.shap_values(X)
        # single-output models come back as a one-element list; unwrap it
        if isinstance(phi, list) and len(phi) == 1:
            return phi[0]
        else:
            return phi

    return f
lib/shap/benchmark/metrics.py ADDED
@@ -0,0 +1,824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import os
3
+ import time
4
+
5
+ import numpy as np
6
+ import sklearn
7
+
8
+ from .. import __version__
9
+ from . import measures, methods
10
+
11
+ try:
12
+ import dill as pickle
13
+ except Exception:
14
+ pass
15
+
16
+ try:
17
+ from sklearn.model_selection import train_test_split
18
+ except Exception:
19
+ from sklearn.cross_validation import train_test_split
20
+
21
+
22
def runtime(X, y, model_generator, method_name):
    """ Runtime (sec / 1k samples)
    transform = "negate_log"
    sort_order = 2
    """
    # save the global RNG state so it can be restored afterwards; the previous
    # implementation used ``old_seed = np.random.seed()``, but np.random.seed()
    # returns None, so ``np.random.seed(old_seed)`` reseeded from OS entropy
    # instead of restoring the state
    old_state = np.random.get_state()
    np.random.seed(3293)

    try:
        # average the method scores over several train/test splits
        method_reps = []
        for i in range(3):
            X_train, X_test, y_train, _ = train_test_split(__toarray(X), y, test_size=100, random_state=i)

            # define the model we are going to explain
            model = model_generator()
            model.fit(X_train, y_train)

            # time how long the explainer takes to build...
            start = time.time()
            explainer = getattr(methods, method_name)(model, X_train)
            build_time = time.time() - start

            # ...and how long it takes to explain the test set
            start = time.time()
            explainer(X_test)
            explain_time = time.time() - start

            # we always normalize the explain time as though we were explaining 1000 samples
            # even if to reduce the runtime of the benchmark we do less (like just 100)
            method_reps.append(build_time + explain_time * 1000.0 / X_test.shape[0])
    finally:
        # restore the caller's RNG state even if a method raises
        np.random.set_state(old_state)

    return None, np.mean(method_reps)
55
+
56
def local_accuracy(X, y, model_generator, method_name):
    """ Local Accuracy
    transform = "identity"
    sort_order = 0
    """
    # the docstring key = value lines above are metadata (presumably parsed
    # by the benchmark framework) — keep them verbatim

    def score_map(true, pred):
        """ Computes local accuracy as the normalized standard deviation of numerical scores.
        """
        return np.std(pred - true) / (np.std(true) + 1e-6)

    def score_function(X_train, X_test, y_train, y_test, attr_function, trained_model, random_state):
        # delegate to the shared measure implementation with our score_map
        return measures.local_accuracy(
            X_train, y_train, X_test, y_test, attr_function(X_test),
            model_generator, score_map, trained_model
        )
    return None, __score_method(X, y, None, model_generator, score_function, method_name)
73
+
74
def consistency_guarantees(X, y, model_generator, method_name):
    """ Consistency Guarantees
    transform = "identity"
    sort_order = 1
    """

    # Scoring scheme:
    # 1.0 - perfect consistency
    # 0.8 - guarantees depend on sampling
    # 0.6 - guarantees depend on approximation
    # 0.0 - no guarantees
    exact_methods = [
        "linear_shap_corr", "linear_shap_ind", "tree_shap_tree_path_dependent",
        "tree_shap_independent_200", "mean_abs_tree_shap",
    ]
    sampling_methods = [
        "kernel_shap_1000_meanref", "sampling_shap_1000",
        "lime_tabular_regression_1000", "lime_tabular_classification_1000",
        "maple", "tree_maple",
    ]
    approximation_methods = ["deep_shap", "expected_gradients"]
    unguaranteed_methods = ["coef", "random", "saabas", "tree_gain"]

    guarantees = {}
    for names, score in ((exact_methods, 1.0), (sampling_methods, 0.8),
                         (approximation_methods, 0.6), (unguaranteed_methods, 0.0)):
        for name in names:
            guarantees[name] = score

    return None, guarantees[method_name]
105
+
106
def __mean_pred(true, pred):
    """ A trivial metric that is just the mean of the model output.

    The ground-truth argument is intentionally ignored; only the average
    prediction matters for the keep/remove positive and negative metrics.
    """
    return np.asarray(pred).mean()
110
+
111
def keep_positive_mask(X, y, model_generator, method_name, num_fcounts=11):
    """ Keep Positive (mask)
    xlabel = "Max fraction of features kept"
    ylabel = "Mean model output"
    transform = "identity"
    sort_order = 4
    """
    # the docstring key = value lines are plot metadata (presumably parsed by
    # the framework); the 1/-1/0 argument across the positive/negative/absolute
    # variants presumably selects the attribution sign — defined in __run_measure
    return __run_measure(measures.keep_mask, X, y, model_generator, method_name, 1, num_fcounts, __mean_pred)
119
+
120
def keep_negative_mask(X, y, model_generator, method_name, num_fcounts=11):
    """ Keep Negative (mask)
    xlabel = "Max fraction of features kept"
    ylabel = "Negative mean model output"
    transform = "negate"
    sort_order = 5
    """
    # mask-based keep measure scored by mean model output; -1 presumably
    # selects the negative attribution direction (see __run_measure)
    return __run_measure(measures.keep_mask, X, y, model_generator, method_name, -1, num_fcounts, __mean_pred)
128
+
129
def keep_absolute_mask__r2(X, y, model_generator, method_name, num_fcounts=11):
    """ Keep Absolute (mask)
    xlabel = "Max fraction of features kept"
    ylabel = "R^2"
    transform = "identity"
    sort_order = 6
    """
    # absolute-attribution variant (0) scored with R^2 for regression models
    return __run_measure(measures.keep_mask, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.r2_score)
137
+
138
def keep_absolute_mask__roc_auc(X, y, model_generator, method_name, num_fcounts=11):
    """ Keep Absolute (mask)
    xlabel = "Max fraction of features kept"
    ylabel = "ROC AUC"
    transform = "identity"
    sort_order = 6
    """
    # absolute-attribution variant (0) scored with ROC AUC for classifiers
    return __run_measure(measures.keep_mask, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.roc_auc_score)
146
+
147
def remove_positive_mask(X, y, model_generator, method_name, num_fcounts=11):
    """ Remove Positive (mask)
    xlabel = "Max fraction of features removed"
    ylabel = "Negative mean model output"
    transform = "negate"
    sort_order = 7
    """
    # mask-based remove measure; 1 presumably selects the positive attribution
    # direction (see __run_measure)
    return __run_measure(measures.remove_mask, X, y, model_generator, method_name, 1, num_fcounts, __mean_pred)
155
+
156
def remove_negative_mask(X, y, model_generator, method_name, num_fcounts=11):
    """ Remove Negative (mask)
    xlabel = "Max fraction of features removed"
    ylabel = "Mean model output"
    transform = "identity"
    sort_order = 8
    """
    # mask-based remove measure; -1 presumably selects the negative attribution
    # direction (see __run_measure)
    return __run_measure(measures.remove_mask, X, y, model_generator, method_name, -1, num_fcounts, __mean_pred)
164
+
165
+ def remove_absolute_mask__r2(X, y, model_generator, method_name, num_fcounts=11):
166
+ """ Remove Absolute (mask)
167
+ xlabel = "Max fraction of features removed"
168
+ ylabel = "1 - R^2"
169
+ transform = "one_minus"
170
+ sort_order = 9
171
+ """
172
+ return __run_measure(measures.remove_mask, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.r2_score)
173
+
174
+ def remove_absolute_mask__roc_auc(X, y, model_generator, method_name, num_fcounts=11):
175
+ """ Remove Absolute (mask)
176
+ xlabel = "Max fraction of features removed"
177
+ ylabel = "1 - ROC AUC"
178
+ transform = "one_minus"
179
+ sort_order = 9
180
+ """
181
+ return __run_measure(measures.remove_mask, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.roc_auc_score)
182
+
183
+ def keep_positive_resample(X, y, model_generator, method_name, num_fcounts=11):
184
+ """ Keep Positive (resample)
185
+ xlabel = "Max fraction of features kept"
186
+ ylabel = "Mean model output"
187
+ transform = "identity"
188
+ sort_order = 10
189
+ """
190
+ return __run_measure(measures.keep_resample, X, y, model_generator, method_name, 1, num_fcounts, __mean_pred)
191
+
192
+ def keep_negative_resample(X, y, model_generator, method_name, num_fcounts=11):
193
+ """ Keep Negative (resample)
194
+ xlabel = "Max fraction of features kept"
195
+ ylabel = "Negative mean model output"
196
+ transform = "negate"
197
+ sort_order = 11
198
+ """
199
+ return __run_measure(measures.keep_resample, X, y, model_generator, method_name, -1, num_fcounts, __mean_pred)
200
+
201
+ def keep_absolute_resample__r2(X, y, model_generator, method_name, num_fcounts=11):
202
+ """ Keep Absolute (resample)
203
+ xlabel = "Max fraction of features kept"
204
+ ylabel = "R^2"
205
+ transform = "identity"
206
+ sort_order = 12
207
+ """
208
+ return __run_measure(measures.keep_resample, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.r2_score)
209
+
210
+ def keep_absolute_resample__roc_auc(X, y, model_generator, method_name, num_fcounts=11):
211
+ """ Keep Absolute (resample)
212
+ xlabel = "Max fraction of features kept"
213
+ ylabel = "ROC AUC"
214
+ transform = "identity"
215
+ sort_order = 12
216
+ """
217
+ return __run_measure(measures.keep_resample, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.roc_auc_score)
218
+
219
+ def remove_positive_resample(X, y, model_generator, method_name, num_fcounts=11):
220
+ """ Remove Positive (resample)
221
+ xlabel = "Max fraction of features removed"
222
+ ylabel = "Negative mean model output"
223
+ transform = "negate"
224
+ sort_order = 13
225
+ """
226
+ return __run_measure(measures.remove_resample, X, y, model_generator, method_name, 1, num_fcounts, __mean_pred)
227
+
228
+ def remove_negative_resample(X, y, model_generator, method_name, num_fcounts=11):
229
+ """ Remove Negative (resample)
230
+ xlabel = "Max fraction of features removed"
231
+ ylabel = "Mean model output"
232
+ transform = "identity"
233
+ sort_order = 14
234
+ """
235
+ return __run_measure(measures.remove_resample, X, y, model_generator, method_name, -1, num_fcounts, __mean_pred)
236
+
237
+ def remove_absolute_resample__r2(X, y, model_generator, method_name, num_fcounts=11):
238
+ """ Remove Absolute (resample)
239
+ xlabel = "Max fraction of features removed"
240
+ ylabel = "1 - R^2"
241
+ transform = "one_minus"
242
+ sort_order = 15
243
+ """
244
+ return __run_measure(measures.remove_resample, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.r2_score)
245
+
246
+ def remove_absolute_resample__roc_auc(X, y, model_generator, method_name, num_fcounts=11):
247
+ """ Remove Absolute (resample)
248
+ xlabel = "Max fraction of features removed"
249
+ ylabel = "1 - ROC AUC"
250
+ transform = "one_minus"
251
+ sort_order = 15
252
+ """
253
+ return __run_measure(measures.remove_resample, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.roc_auc_score)
254
+
255
+ def keep_positive_impute(X, y, model_generator, method_name, num_fcounts=11):
256
+ """ Keep Positive (impute)
257
+ xlabel = "Max fraction of features kept"
258
+ ylabel = "Mean model output"
259
+ transform = "identity"
260
+ sort_order = 16
261
+ """
262
+ return __run_measure(measures.keep_impute, X, y, model_generator, method_name, 1, num_fcounts, __mean_pred)
263
+
264
+ def keep_negative_impute(X, y, model_generator, method_name, num_fcounts=11):
265
+ """ Keep Negative (impute)
266
+ xlabel = "Max fraction of features kept"
267
+ ylabel = "Negative mean model output"
268
+ transform = "negate"
269
+ sort_order = 17
270
+ """
271
+ return __run_measure(measures.keep_impute, X, y, model_generator, method_name, -1, num_fcounts, __mean_pred)
272
+
273
+ def keep_absolute_impute__r2(X, y, model_generator, method_name, num_fcounts=11):
274
+ """ Keep Absolute (impute)
275
+ xlabel = "Max fraction of features kept"
276
+ ylabel = "R^2"
277
+ transform = "identity"
278
+ sort_order = 18
279
+ """
280
+ return __run_measure(measures.keep_impute, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.r2_score)
281
+
282
def keep_absolute_impute__roc_auc(X, y, model_generator, method_name, num_fcounts=11):
    """ Keep Absolute (impute)
    xlabel = "Max fraction of features kept"
    ylabel = "ROC AUC"
    transform = "identity"
    sort_order = 19
    """
    # NOTE: the docstring fields above are machine-parsed by the benchmark
    # framework, so their exact text must be preserved.
    # Bug fix: this previously ran measures.keep_mask, i.e. the masking
    # variant already covered by keep_absolute_mask__roc_auc. The impute
    # variant must score measures.keep_impute, matching
    # keep_absolute_impute__r2 directly above.
    return __run_measure(measures.keep_impute, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.roc_auc_score)
290
+
291
+ def remove_positive_impute(X, y, model_generator, method_name, num_fcounts=11):
292
+ """ Remove Positive (impute)
293
+ xlabel = "Max fraction of features removed"
294
+ ylabel = "Negative mean model output"
295
+ transform = "negate"
296
+ sort_order = 7
297
+ """
298
+ return __run_measure(measures.remove_impute, X, y, model_generator, method_name, 1, num_fcounts, __mean_pred)
299
+
300
+ def remove_negative_impute(X, y, model_generator, method_name, num_fcounts=11):
301
+ """ Remove Negative (impute)
302
+ xlabel = "Max fraction of features removed"
303
+ ylabel = "Mean model output"
304
+ transform = "identity"
305
+ sort_order = 8
306
+ """
307
+ return __run_measure(measures.remove_impute, X, y, model_generator, method_name, -1, num_fcounts, __mean_pred)
308
+
309
+ def remove_absolute_impute__r2(X, y, model_generator, method_name, num_fcounts=11):
310
+ """ Remove Absolute (impute)
311
+ xlabel = "Max fraction of features removed"
312
+ ylabel = "1 - R^2"
313
+ transform = "one_minus"
314
+ sort_order = 9
315
+ """
316
+ return __run_measure(measures.remove_impute, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.r2_score)
317
+
318
def remove_absolute_impute__roc_auc(X, y, model_generator, method_name, num_fcounts=11):
    """ Remove Absolute (impute)
    xlabel = "Max fraction of features removed"
    ylabel = "1 - ROC AUC"
    transform = "one_minus"
    sort_order = 9
    """
    # NOTE: the docstring fields above are machine-parsed by the benchmark
    # framework, so their exact text must be preserved.
    # Bug fix: this previously ran measures.remove_mask, i.e. the masking
    # variant already covered by remove_absolute_mask__roc_auc. The impute
    # variant must score measures.remove_impute, matching
    # remove_absolute_impute__r2 directly above.
    return __run_measure(measures.remove_impute, X, y, model_generator, method_name, 0, num_fcounts, sklearn.metrics.roc_auc_score)
326
+
327
+ def keep_positive_retrain(X, y, model_generator, method_name, num_fcounts=11):
328
+ """ Keep Positive (retrain)
329
+ xlabel = "Max fraction of features kept"
330
+ ylabel = "Mean model output"
331
+ transform = "identity"
332
+ sort_order = 6
333
+ """
334
+ return __run_measure(measures.keep_retrain, X, y, model_generator, method_name, 1, num_fcounts, __mean_pred)
335
+
336
+ def keep_negative_retrain(X, y, model_generator, method_name, num_fcounts=11):
337
+ """ Keep Negative (retrain)
338
+ xlabel = "Max fraction of features kept"
339
+ ylabel = "Negative mean model output"
340
+ transform = "negate"
341
+ sort_order = 7
342
+ """
343
+ return __run_measure(measures.keep_retrain, X, y, model_generator, method_name, -1, num_fcounts, __mean_pred)
344
+
345
+ def remove_positive_retrain(X, y, model_generator, method_name, num_fcounts=11):
346
+ """ Remove Positive (retrain)
347
+ xlabel = "Max fraction of features removed"
348
+ ylabel = "Negative mean model output"
349
+ transform = "negate"
350
+ sort_order = 11
351
+ """
352
+ return __run_measure(measures.remove_retrain, X, y, model_generator, method_name, 1, num_fcounts, __mean_pred)
353
+
354
+ def remove_negative_retrain(X, y, model_generator, method_name, num_fcounts=11):
355
+ """ Remove Negative (retrain)
356
+ xlabel = "Max fraction of features removed"
357
+ ylabel = "Mean model output"
358
+ transform = "identity"
359
+ sort_order = 12
360
+ """
361
+ return __run_measure(measures.remove_retrain, X, y, model_generator, method_name, -1, num_fcounts, __mean_pred)
362
+
363
def __run_measure(measure, X, y, model_generator, method_name, attribution_sign, num_fcounts, summary_function):
    """ Evaluate one keep/remove measure over a log-spaced grid of feature counts.

    Parameters
    ----------
    measure : callable
        One of the measures.* functions (e.g. keep_mask, remove_resample).
    attribution_sign : int
        1 keeps/removes positively-attributed features, -1 negatively-attributed
        ones, and 0 ranks features by absolute attribution.
    num_fcounts : int
        Number of grid points between 0 and the total feature count.
    summary_function : callable
        Reduces model outputs to a scalar score (e.g. __mean_pred, r2_score).

    Returns
    -------
    (fcounts, scores) where scores is averaged over train/test splits by
    __score_method.
    """

    def score_function(fcount, X_train, X_test, y_train, y_test, attr_function, trained_model, random_state):
        # orient the attribution matrix so that "larger is better" for the
        # chosen sign convention (0 means rank by magnitude)
        if attribution_sign == 0:
            A = np.abs(__strip_list(attr_function(X_test)))
        else:
            A = attribution_sign * __strip_list(attr_function(X_test))
        # per-sample budget of features to act on, clipped to the number of
        # features with non-negative (oriented) attribution in that sample
        nmask = np.ones(len(y_test)) * fcount
        nmask = np.minimum(nmask, np.array(A >= 0).sum(1)).astype(int)
        return measure(
            nmask, X_train, y_train, X_test, y_test, A,
            model_generator, summary_function, trained_model, random_state
        )
    fcounts = __intlogspace(0, X.shape[1], num_fcounts)
    return fcounts, __score_method(X, y, fcounts, model_generator, score_function, method_name)
378
+
379
+ def batch_remove_absolute_retrain__r2(X, y, model_generator, method_name, num_fcounts=11):
380
+ """ Batch Remove Absolute (retrain)
381
+ xlabel = "Fraction of features removed"
382
+ ylabel = "1 - R^2"
383
+ transform = "one_minus"
384
+ sort_order = 13
385
+ """
386
+ return __run_batch_abs_metric(measures.batch_remove_retrain, X, y, model_generator, method_name, sklearn.metrics.r2_score, num_fcounts)
387
+
388
+ def batch_keep_absolute_retrain__r2(X, y, model_generator, method_name, num_fcounts=11):
389
+ """ Batch Keep Absolute (retrain)
390
+ xlabel = "Fraction of features kept"
391
+ ylabel = "R^2"
392
+ transform = "identity"
393
+ sort_order = 13
394
+ """
395
+ return __run_batch_abs_metric(measures.batch_keep_retrain, X, y, model_generator, method_name, sklearn.metrics.r2_score, num_fcounts)
396
+
397
+ def batch_remove_absolute_retrain__roc_auc(X, y, model_generator, method_name, num_fcounts=11):
398
+ """ Batch Remove Absolute (retrain)
399
+ xlabel = "Fraction of features removed"
400
+ ylabel = "1 - ROC AUC"
401
+ transform = "one_minus"
402
+ sort_order = 13
403
+ """
404
+ return __run_batch_abs_metric(measures.batch_remove_retrain, X, y, model_generator, method_name, sklearn.metrics.roc_auc_score, num_fcounts)
405
+
406
+ def batch_keep_absolute_retrain__roc_auc(X, y, model_generator, method_name, num_fcounts=11):
407
+ """ Batch Keep Absolute (retrain)
408
+ xlabel = "Fraction of features kept"
409
+ ylabel = "ROC AUC"
410
+ transform = "identity"
411
+ sort_order = 13
412
+ """
413
+ return __run_batch_abs_metric(measures.batch_keep_retrain, X, y, model_generator, method_name, sklearn.metrics.roc_auc_score, num_fcounts)
414
+
415
def __run_batch_abs_metric(metric, X, y, model_generator, method_name, loss, num_fcounts):
    """ Evaluate a batch retrain metric over a log-spaced grid of feature counts.

    Parameters
    ----------
    metric : callable
        One of the measures.batch_* functions (batch_keep_retrain /
        batch_remove_retrain).
    loss : callable
        Scoring function such as sklearn.metrics.r2_score.

    Returns
    -------
    (fcounts, scores) where scores is averaged over train/test splits by
    __score_method.
    """
    # Bug fix: __score_method always calls score_function with a trailing
    # random_state argument (see its fcounts loop); the inner closure here
    # previously did not accept it, so every batch metric raised a TypeError.
    # The argument is accepted (and unused) to match the calling convention.
    def score_function(fcount, X_train, X_test, y_train, y_test, attr_function, trained_model, random_state):
        # rank features by absolute attribution on both splits
        A_train = np.abs(__strip_list(attr_function(X_train)))
        nkeep_train = (np.ones(len(y_train)) * fcount).astype(int)
        #nkeep_train = np.minimum(nkeep_train, np.array(A_train > 0).sum(1)).astype(int)
        A_test = np.abs(__strip_list(attr_function(X_test)))
        nkeep_test = (np.ones(len(y_test)) * fcount).astype(int)
        #nkeep_test = np.minimum(nkeep_test, np.array(A_test >= 0).sum(1)).astype(int)
        return metric(
            nkeep_train, nkeep_test, X_train, y_train, X_test, y_test, A_train, A_test,
            model_generator, loss
        )
    fcounts = __intlogspace(0, X.shape[1], num_fcounts)
    return fcounts, __score_method(X, y, fcounts, model_generator, score_function, method_name)
429
+
430
_attribution_cache = {}
def __score_method(X, y, fcounts, model_generator, score_function, method_name, nreps=10, test_size=100, cache_dir="/tmp"):
    """ Test an explanation method.

    Averages score_function over nreps train/test splits, caching the fitted
    model on disk and the attribution values in-process so repeated benchmark
    runs are cheap.

    Returns the per-fcount scores averaged over the splits (or a scalar score
    when fcounts is None).
    """

    # `pickle` is bound at module import time (dill is used if available);
    # fail loudly if it never got imported
    try:
        pickle
    except NameError:
        raise ImportError("The 'dill' package could not be loaded and is needed for the benchmark!")

    # Bug fix: np.random.seed() is not a getter -- it returns None, so the
    # previous old_seed/np.random.seed(old_seed) pair silently re-seeded from
    # OS entropy instead of restoring the caller's RNG. Save and restore the
    # full generator state instead.
    old_state = np.random.get_state()
    np.random.seed(3293)

    # average the method scores over several train/test splits
    method_reps = []

    # content hash of the dataset, used in both the model and attribution cache keys
    data_hash = hashlib.sha256(__toarray(X).flatten()).hexdigest() + hashlib.sha256(__toarray(y)).hexdigest()
    for i in range(nreps):
        X_train, X_test, y_train, y_test = train_test_split(__toarray(X), y, test_size=test_size, random_state=i)

        # define the model we are going to explain, caching so we only build it once
        # (bug fix: ".pickle" used to be appended both here and in the join below,
        # producing "...pickle.pickle" filenames)
        model_id = "model_cache__v" + "__".join([__version__, data_hash, model_generator.__name__])
        cache_file = os.path.join(cache_dir, model_id + ".pickle")
        if os.path.isfile(cache_file):
            with open(cache_file, "rb") as f:
                model = pickle.load(f)
        else:
            model = model_generator()
            model.fit(X_train, y_train)
            with open(cache_file, "wb") as f:
                pickle.dump(model, f)

        # key for the in-process attribution cache, unique per split and method
        attr_key = "_".join([model_generator.__name__, method_name, str(test_size), str(nreps), str(i), data_hash])
        def score(attr_function):
            # memoize the (single) attribution call for this split
            def cached_attr_function(X_inner):
                if attr_key not in _attribution_cache:
                    _attribution_cache[attr_key] = attr_function(X_inner)
                return _attribution_cache[attr_key]

            if fcounts is None:
                return score_function(X_train, X_test, y_train, y_test, cached_attr_function, model, i)
            else:
                scores = []
                for f in fcounts:
                    scores.append(score_function(f, X_train, X_test, y_train, y_test, cached_attr_function, model, i))
                return np.array(scores)

        # evaluate the method (only building the attribution function if we need to)
        if attr_key not in _attribution_cache:
            method_reps.append(score(getattr(methods, method_name)(model, X_train)))
        else:
            method_reps.append(score(None))

    np.random.set_state(old_state)
    return np.array(method_reps).mean(0)
486
+
487
+
488
# used to memoize explainer functions so we don't waste time re-explaining the same object
# (a two-slot cache keyed on object identity; slot 0 is most recent)
__cache0 = None
__cache_X0 = None
__cache_f0 = None
__cache1 = None
__cache_X1 = None
__cache_f1 = None
def __check_cache(f, X):
    """ Return f(X), memoizing the two most recent (f, X) pairs.

    Hits require the exact same objects (`is` comparison), so this only helps
    when callers reuse references. NOTE(review): appears unused in this
    module -- only a commented-out call site remains in __score_method, which
    now keeps its own keyed cache.
    """
    global __cache0, __cache_X0, __cache_f0
    global __cache1, __cache_X1, __cache_f1
    if X is __cache_X0 and f is __cache_f0:
        return __cache0
    elif X is __cache_X1 and f is __cache_f1:
        return __cache1
    else:
        # miss: demote slot 0 to slot 1, then fill slot 0 with the new result
        __cache_f1 = __cache_f0
        __cache_X1 = __cache_X0
        __cache1 = __cache0
        __cache_f0 = f
        __cache_X0 = X
        __cache0 = f(X)
        return __cache0
510
+
511
def __intlogspace(start, end, count):
    """ Return the unique integers of a logarithmic grid spanning [start, end]. """
    # np.logspace(0, 1, count) runs from 1 to 10; rescale that to a 0..1
    # fraction, stretch it onto [start, end], then round and deduplicate.
    fractions = (np.logspace(0, 1, count, endpoint=True) - 1) / 9
    grid = np.round(start + (end - start) * fractions).astype(int)
    return np.unique(grid)
513
+
514
def __toarray(X):
    """ Converts DataFrames to numpy arrays.

    Anything exposing a ``values`` attribute (e.g. a pandas DataFrame or
    Series) is unwrapped; all other inputs are returned untouched.
    """
    return X.values if hasattr(X, "values") else X
520
+
521
def __strip_list(attrs):
    """ This assumes that if you have a list of outputs you just want the second one (the second class).
    """
    # only genuine lists are unwrapped; tuples/arrays pass through unchanged
    return attrs[1] if isinstance(attrs, list) else attrs
528
+
529
def _fit_human(model_generator, val00, val01, val11):
    """ Fit a model to a synthetic two-binary-feature target function.

    The training data is almost entirely the all-zero background labeled
    val00; rows with only feature 0 set or only feature 1 set are labeled
    val01, and the single row with both features set is labeled val11.
    Feature 2 is never set and acts as a decoy.

    Returns the fitted model produced by model_generator().
    """
    # force the model to fit a function with almost entirely zero background
    N = 1000000
    M = 3
    X = np.zeros((N, M))
    # (removed a dead no-op `X.shape` expression statement that was here)
    y = np.ones(N) * val00
    X[0:1000, 0] = 1
    y[0:1000] = val01
    for i in range(0, 1000000, 1000):
        X[i, 1] = 1
        y[i] = val01
    y[0] = val11  # row 0 ends up with both features set
    model = model_generator()
    model.fit(X, y)
    return model
545
+
546
def _human_and(X, model_generator, method_name, fever, cough):
    """ Compare a method's attributions to human consensus for the AND scoring function.

    Fits a model via _fit_human (values 0/2/10), explains the single scenario
    row selected by the fever/cough flags, and returns
    ("human", (consensus_attributions, method_attributions)).

    NOTE(review): the (fever=True, cough=False) combination has no branch and
    would raise a NameError on human_consensus -- no visible caller uses it.
    """
    assert np.abs(X).max() == 0, "Human agreement metrics are only for use with the human_agreement dataset!"

    # these are from the sickness_score mturk user study experiment
    X_test = np.zeros((100,3))
    if not fever and not cough:
        human_consensus = np.array([0., 0., 0.])
        X_test[0,:] = np.array([[0., 0., 1.]])
    elif not fever and cough:
        human_consensus = np.array([0., 2., 0.])
        X_test[0,:] = np.array([[0., 1., 1.]])
    elif fever and cough:
        human_consensus = np.array([5., 5., 0.])
        X_test[0,:] = np.array([[1., 1., 1.]])

    # force the model to fit an AND function with almost entirely zero background
    model = _fit_human(model_generator, 0, 2, 10)

    # explain only the first (scenario) row of X_test
    attr_function = getattr(methods, method_name)(model, X)
    methods_attrs = attr_function(X_test)
    return "human", (human_consensus, methods_attrs[0,:])
567
+
568
+ def human_and_00(X, y, model_generator, method_name):
569
+ """ AND (false/false)
570
+
571
+ This tests how well a feature attribution method agrees with human intuition
572
+ for an AND operation combined with linear effects. This metric deals
573
+ specifically with the question of credit allocation for the following function
574
+ when all three inputs are true:
575
+ if fever: +2 points
576
+ if cough: +2 points
577
+ if fever and cough: +6 points
578
+
579
+ transform = "identity"
580
+ sort_order = 0
581
+ """
582
+ return _human_and(X, model_generator, method_name, False, False)
583
+
584
+ def human_and_01(X, y, model_generator, method_name):
585
+ """ AND (false/true)
586
+
587
+ This tests how well a feature attribution method agrees with human intuition
588
+ for an AND operation combined with linear effects. This metric deals
589
+ specifically with the question of credit allocation for the following function
590
+ when all three inputs are true:
591
+ if fever: +2 points
592
+ if cough: +2 points
593
+ if fever and cough: +6 points
594
+
595
+ transform = "identity"
596
+ sort_order = 1
597
+ """
598
+ return _human_and(X, model_generator, method_name, False, True)
599
+
600
+ def human_and_11(X, y, model_generator, method_name):
601
+ """ AND (true/true)
602
+
603
+ This tests how well a feature attribution method agrees with human intuition
604
+ for an AND operation combined with linear effects. This metric deals
605
+ specifically with the question of credit allocation for the following function
606
+ when all three inputs are true:
607
+ if fever: +2 points
608
+ if cough: +2 points
609
+ if fever and cough: +6 points
610
+
611
+ transform = "identity"
612
+ sort_order = 2
613
+ """
614
+ return _human_and(X, model_generator, method_name, True, True)
615
+
616
+
617
def _human_or(X, model_generator, method_name, fever, cough):
    """ Compare a method's attributions to human consensus for the OR scoring function.

    Fits a model via _fit_human (values 0/8/10), explains the single scenario
    row selected by the fever/cough flags, and returns
    ("human", (consensus_attributions, method_attributions)).

    NOTE(review): the (fever=True, cough=False) combination has no branch and
    would raise a NameError on human_consensus -- no visible caller uses it.
    """
    assert np.abs(X).max() == 0, "Human agreement metrics are only for use with the human_agreement dataset!"

    # these are from the sickness_score mturk user study experiment
    X_test = np.zeros((100,3))
    if not fever and not cough:
        human_consensus = np.array([0., 0., 0.])
        X_test[0,:] = np.array([[0., 0., 1.]])
    elif not fever and cough:
        human_consensus = np.array([0., 8., 0.])
        X_test[0,:] = np.array([[0., 1., 1.]])
    elif fever and cough:
        human_consensus = np.array([5., 5., 0.])
        X_test[0,:] = np.array([[1., 1., 1.]])

    # force the model to fit an OR function with almost entirely zero background
    model = _fit_human(model_generator, 0, 8, 10)

    # explain only the first (scenario) row of X_test
    attr_function = getattr(methods, method_name)(model, X)
    methods_attrs = attr_function(X_test)
    return "human", (human_consensus, methods_attrs[0,:])
638
+
639
+ def human_or_00(X, y, model_generator, method_name):
640
+ """ OR (false/false)
641
+
642
+ This tests how well a feature attribution method agrees with human intuition
643
+ for an OR operation combined with linear effects. This metric deals
644
+ specifically with the question of credit allocation for the following function
645
+ when all three inputs are true:
646
+ if fever: +2 points
647
+ if cough: +2 points
648
+ if fever or cough: +6 points
649
+
650
+ transform = "identity"
651
+ sort_order = 0
652
+ """
653
+ return _human_or(X, model_generator, method_name, False, False)
654
+
655
+ def human_or_01(X, y, model_generator, method_name):
656
+ """ OR (false/true)
657
+
658
+ This tests how well a feature attribution method agrees with human intuition
659
+ for an OR operation combined with linear effects. This metric deals
660
+ specifically with the question of credit allocation for the following function
661
+ when all three inputs are true:
662
+ if fever: +2 points
663
+ if cough: +2 points
664
+ if fever or cough: +6 points
665
+
666
+ transform = "identity"
667
+ sort_order = 1
668
+ """
669
+ return _human_or(X, model_generator, method_name, False, True)
670
+
671
+ def human_or_11(X, y, model_generator, method_name):
672
+ """ OR (true/true)
673
+
674
+ This tests how well a feature attribution method agrees with human intuition
675
+ for an OR operation combined with linear effects. This metric deals
676
+ specifically with the question of credit allocation for the following function
677
+ when all three inputs are true:
678
+ if fever: +2 points
679
+ if cough: +2 points
680
+ if fever or cough: +6 points
681
+
682
+ transform = "identity"
683
+ sort_order = 2
684
+ """
685
+ return _human_or(X, model_generator, method_name, True, True)
686
+
687
+
688
def _human_xor(X, model_generator, method_name, fever, cough):
    """ Compare a method's attributions to human consensus for the XOR scoring function.

    Fits a model via _fit_human (values 0/8/4), explains the single scenario
    row selected by the fever/cough flags, and returns
    ("human", (consensus_attributions, method_attributions)).

    NOTE(review): the (fever=True, cough=False) combination has no branch and
    would raise a NameError on human_consensus -- no visible caller uses it.
    """
    assert np.abs(X).max() == 0, "Human agreement metrics are only for use with the human_agreement dataset!"

    # these are from the sickness_score mturk user study experiment
    X_test = np.zeros((100,3))
    if not fever and not cough:
        human_consensus = np.array([0., 0., 0.])
        X_test[0,:] = np.array([[0., 0., 1.]])
    elif not fever and cough:
        human_consensus = np.array([0., 8., 0.])
        X_test[0,:] = np.array([[0., 1., 1.]])
    elif fever and cough:
        human_consensus = np.array([2., 2., 0.])
        X_test[0,:] = np.array([[1., 1., 1.]])

    # force the model to fit an XOR function with almost entirely zero background
    model = _fit_human(model_generator, 0, 8, 4)

    # explain only the first (scenario) row of X_test
    attr_function = getattr(methods, method_name)(model, X)
    methods_attrs = attr_function(X_test)
    return "human", (human_consensus, methods_attrs[0,:])
709
+
710
+ def human_xor_00(X, y, model_generator, method_name):
711
+ """ XOR (false/false)
712
+
713
+ This tests how well a feature attribution method agrees with human intuition
714
+ for an eXclusive OR operation combined with linear effects. This metric deals
715
+ specifically with the question of credit allocation for the following function
716
+ when all three inputs are true:
717
+ if fever: +2 points
718
+ if cough: +2 points
719
+ if fever or cough but not both: +6 points
720
+
721
+ transform = "identity"
722
+ sort_order = 3
723
+ """
724
+ return _human_xor(X, model_generator, method_name, False, False)
725
+
726
+ def human_xor_01(X, y, model_generator, method_name):
727
+ """ XOR (false/true)
728
+
729
+ This tests how well a feature attribution method agrees with human intuition
730
+ for an eXclusive OR operation combined with linear effects. This metric deals
731
+ specifically with the question of credit allocation for the following function
732
+ when all three inputs are true:
733
+ if fever: +2 points
734
+ if cough: +2 points
735
+ if fever or cough but not both: +6 points
736
+
737
+ transform = "identity"
738
+ sort_order = 4
739
+ """
740
+ return _human_xor(X, model_generator, method_name, False, True)
741
+
742
+ def human_xor_11(X, y, model_generator, method_name):
743
+ """ XOR (true/true)
744
+
745
+ This tests how well a feature attribution method agrees with human intuition
746
+ for an eXclusive OR operation combined with linear effects. This metric deals
747
+ specifically with the question of credit allocation for the following function
748
+ when all three inputs are true:
749
+ if fever: +2 points
750
+ if cough: +2 points
751
+ if fever or cough but not both: +6 points
752
+
753
+ transform = "identity"
754
+ sort_order = 5
755
+ """
756
+ return _human_xor(X, model_generator, method_name, True, True)
757
+
758
+
759
def _human_sum(X, model_generator, method_name, fever, cough):
    """ Compare a method's attributions to human consensus for the additive (SUM) scoring function.

    Fits a model via _fit_human (values 0/2/4), explains the single scenario
    row selected by the fever/cough flags, and returns
    ("human", (consensus_attributions, method_attributions)).

    NOTE(review): the (fever=True, cough=False) combination has no branch and
    would raise a NameError on human_consensus -- no visible caller uses it.
    """
    assert np.abs(X).max() == 0, "Human agreement metrics are only for use with the human_agreement dataset!"

    # these are from the sickness_score mturk user study experiment
    X_test = np.zeros((100,3))
    if not fever and not cough:
        human_consensus = np.array([0., 0., 0.])
        X_test[0,:] = np.array([[0., 0., 1.]])
    elif not fever and cough:
        human_consensus = np.array([0., 2., 0.])
        X_test[0,:] = np.array([[0., 1., 1.]])
    elif fever and cough:
        human_consensus = np.array([2., 2., 0.])
        X_test[0,:] = np.array([[1., 1., 1.]])

    # force the model to fit a purely additive function with almost entirely zero background
    model = _fit_human(model_generator, 0, 2, 4)

    # explain only the first (scenario) row of X_test
    attr_function = getattr(methods, method_name)(model, X)
    methods_attrs = attr_function(X_test)
    return "human", (human_consensus, methods_attrs[0,:])
780
+
781
+ def human_sum_00(X, y, model_generator, method_name):
782
+ """ SUM (false/false)
783
+
784
+ This tests how well a feature attribution method agrees with human intuition
785
+ for a SUM operation. This metric deals
786
+ specifically with the question of credit allocation for the following function
787
+ when all three inputs are true:
788
+ if fever: +2 points
789
+ if cough: +2 points
790
+
791
+ transform = "identity"
792
+ sort_order = 0
793
+ """
794
+ return _human_sum(X, model_generator, method_name, False, False)
795
+
796
+ def human_sum_01(X, y, model_generator, method_name):
797
+ """ SUM (false/true)
798
+
799
+ This tests how well a feature attribution method agrees with human intuition
800
+ for a SUM operation. This metric deals
801
+ specifically with the question of credit allocation for the following function
802
+ when all three inputs are true:
803
+ if fever: +2 points
804
+ if cough: +2 points
805
+
806
+ transform = "identity"
807
+ sort_order = 1
808
+ """
809
+ return _human_sum(X, model_generator, method_name, False, True)
810
+
811
+ def human_sum_11(X, y, model_generator, method_name):
812
+ """ SUM (true/true)
813
+
814
+ This tests how well a feature attribution method agrees with human intuition
815
+ for a SUM operation. This metric deals
816
+ specifically with the question of credit allocation for the following function
817
+ when all three inputs are true:
818
+ if fever: +2 points
819
+ if cough: +2 points
820
+
821
+ transform = "identity"
822
+ sort_order = 2
823
+ """
824
+ return _human_sum(X, model_generator, method_name, True, True)
lib/shap/benchmark/models.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.tree
from sklearn.preprocessing import StandardScaler
5
+
6
+
7
class KerasWrap:
    """ A wrapper that allows us to set parameters in the constructor and do a reset before fitting.
    """
    def __init__(self, model, epochs, flatten_output=False):
        self.model = model                    # a compiled Keras model
        self.epochs = epochs                  # epochs used for each fit() call
        self.flatten_output = flatten_output  # flatten predictions to 1-D when True
        self.init_weights = None              # snapshot of the untrained weights
        self.scaler = StandardScaler()

    def fit(self, X, y, verbose=0):
        """ Reset the network to its initial weights and train it on (X, y). """
        # snapshot the untrained weights on first use, restore them afterwards
        # so repeated fits start from the same initialization
        if self.init_weights is None:
            self.init_weights = self.model.get_weights()
        else:
            self.model.set_weights(self.init_weights)
        # Bug fix: the scaler was fitted here but the network was trained on
        # the *unscaled* X, while predict() fed it *scaled* X. Training and
        # prediction must see the same standardized representation.
        X = self.scaler.fit_transform(X)
        return self.model.fit(X, y, epochs=self.epochs, verbose=verbose)

    def predict(self, X):
        """ Predict on X after applying the scaling learned during fit(). """
        X = self.scaler.transform(X)
        if self.flatten_output:
            return self.model.predict(X).flatten()
        else:
            return self.model.predict(X)
31
+
32
+
33
+ # This models are all tuned for the corrgroups60 dataset
34
+
35
+ def corrgroups60__lasso():
36
+ """ Lasso Regression
37
+ """
38
+ return sklearn.linear_model.Lasso(alpha=0.1)
39
+
40
+ def corrgroups60__ridge():
41
+ """ Ridge Regression
42
+ """
43
+ return sklearn.linear_model.Ridge(alpha=1.0)
44
+
45
+ def corrgroups60__decision_tree():
46
+ """ Decision Tree
47
+ """
48
+
49
+ # max_depth was chosen to minimise test error
50
+ return sklearn.tree.DecisionTreeRegressor(random_state=0, max_depth=6)
51
+
52
+ def corrgroups60__random_forest():
53
+ """ Random Forest
54
+ """
55
+ return sklearn.ensemble.RandomForestRegressor(100, random_state=0)
56
+
57
+ def corrgroups60__gbm():
58
+ """ Gradient Boosted Trees
59
+ """
60
+ import xgboost
61
+
62
+ # max_depth and learning_rate were fixed then n_estimators was chosen using a train/test split
63
+ return xgboost.XGBRegressor(max_depth=6, n_estimators=50, learning_rate=0.1, n_jobs=8, random_state=0)
64
+
65
+ def corrgroups60__ffnn():
66
+ """ 4-Layer Neural Network
67
+ """
68
+ from tensorflow.keras.layers import Dense
69
+ from tensorflow.keras.models import Sequential
70
+
71
+ model = Sequential()
72
+ model.add(Dense(32, activation='relu', input_dim=60))
73
+ model.add(Dense(20, activation='relu'))
74
+ model.add(Dense(20, activation='relu'))
75
+ model.add(Dense(1))
76
+
77
+ model.compile(optimizer='adam',
78
+ loss='mean_squared_error',
79
+ metrics=['mean_squared_error'])
80
+
81
+ return KerasWrap(model, 30, flatten_output=True)
82
+
83
+
84
+ def independentlinear60__lasso():
85
+ """ Lasso Regression
86
+ """
87
+ return sklearn.linear_model.Lasso(alpha=0.1)
88
+
89
+ def independentlinear60__ridge():
90
+ """ Ridge Regression
91
+ """
92
+ return sklearn.linear_model.Ridge(alpha=1.0)
93
+
94
+ def independentlinear60__decision_tree():
95
+ """ Decision Tree
96
+ """
97
+
98
+ # max_depth was chosen to minimise test error
99
+ return sklearn.tree.DecisionTreeRegressor(random_state=0, max_depth=4)
100
+
101
+ def independentlinear60__random_forest():
102
+ """ Random Forest
103
+ """
104
+ return sklearn.ensemble.RandomForestRegressor(100, random_state=0)
105
+
106
+ def independentlinear60__gbm():
107
+ """ Gradient Boosted Trees
108
+ """
109
+ import xgboost
110
+
111
+ # max_depth and learning_rate were fixed then n_estimators was chosen using a train/test split
112
+ return xgboost.XGBRegressor(max_depth=6, n_estimators=100, learning_rate=0.1, n_jobs=8, random_state=0)
113
+
114
+ def independentlinear60__ffnn():
115
+ """ 4-Layer Neural Network
116
+ """
117
+ from tensorflow.keras.layers import Dense
118
+ from tensorflow.keras.models import Sequential
119
+
120
+ model = Sequential()
121
+ model.add(Dense(32, activation='relu', input_dim=60))
122
+ model.add(Dense(20, activation='relu'))
123
+ model.add(Dense(20, activation='relu'))
124
+ model.add(Dense(1))
125
+
126
+ model.compile(optimizer='adam',
127
+ loss='mean_squared_error',
128
+ metrics=['mean_squared_error'])
129
+
130
+ return KerasWrap(model, 30, flatten_output=True)
131
+
132
+
133
def cric__lasso():
    """ Lasso Regression
    """
    # NOTE: the docstring title above is displayed by the benchmark plots,
    # so its exact text must be preserved.
    # Bug fix: an L1 penalty requires the liblinear solver -- scikit-learn's
    # default lbfgs solver raises a ValueError for penalty="l1".
    model = sklearn.linear_model.LogisticRegression(penalty="l1", C=0.002, solver="liblinear")

    # we want to explain the raw probability outputs of the model
    model.predict = lambda X: model.predict_proba(X)[:,1]

    return model
142
+
143
+ def cric__ridge():
144
+ """ Ridge Regression
145
+ """
146
+ model = sklearn.linear_model.LogisticRegression(penalty="l2")
147
+
148
+ # we want to explain the raw probability outputs of the trees
149
+ model.predict = lambda X: model.predict_proba(X)[:,1]
150
+
151
+ return model
152
+
153
+ def cric__decision_tree():
154
+ """ Decision Tree
155
+ """
156
+ model = sklearn.tree.DecisionTreeClassifier(random_state=0, max_depth=4)
157
+
158
+ # we want to explain the raw probability outputs of the trees
159
+ model.predict = lambda X: model.predict_proba(X)[:,1]
160
+
161
+ return model
162
+
163
+ def cric__random_forest():
164
+ """ Random Forest
165
+ """
166
+ model = sklearn.ensemble.RandomForestClassifier(100, random_state=0)
167
+
168
+ # we want to explain the raw probability outputs of the trees
169
+ model.predict = lambda X: model.predict_proba(X)[:,1]
170
+
171
+ return model
172
+
173
+ def cric__gbm():
174
+ """ Gradient Boosted Trees
175
+ """
176
+ import xgboost
177
+
178
+ # max_depth and subsample match the params used for the full cric data in the paper
179
+ # learning_rate was set a bit higher to allow for faster runtimes
180
+ # n_estimators was chosen based on a train/test split of the data
181
+ model = xgboost.XGBClassifier(max_depth=5, n_estimators=400, learning_rate=0.01, subsample=0.2, n_jobs=8, random_state=0)
182
+
183
+ # we want to explain the margin, not the transformed probability outputs
184
+ model.__orig_predict = model.predict
185
+ model.predict = lambda X: model.__orig_predict(X, output_margin=True)
186
+
187
+ return model
188
+
189
+ def cric__ffnn():
190
+ """ 4-Layer Neural Network
191
+ """
192
+ from tensorflow.keras.layers import Dense, Dropout
193
+ from tensorflow.keras.models import Sequential
194
+
195
+ model = Sequential()
196
+ model.add(Dense(10, activation='relu', input_dim=336))
197
+ model.add(Dropout(0.5))
198
+ model.add(Dense(10, activation='relu'))
199
+ model.add(Dropout(0.5))
200
+ model.add(Dense(1, activation='sigmoid'))
201
+
202
+ model.compile(optimizer='adam',
203
+ loss='binary_crossentropy',
204
+ metrics=['accuracy'])
205
+
206
+ return KerasWrap(model, 30, flatten_output=True)
207
+
208
+
209
def human__decision_tree():
    """ Decision Tree
    """
    # NOTE: the docstring title above is displayed by the benchmark plots,
    # so its exact text must be preserved.

    # build data: an almost entirely zero background plus three informative
    # rows encoding the credit-allocation pattern (each single feature -> 8,
    # both together -> 4); removed a dead no-op `X.shape` statement
    N = 1000000
    M = 3
    X = np.zeros((N, M))
    y = np.zeros(N)
    X[0, 0] = 1
    y[0] = 8
    X[1, 1] = 1
    y[1] = 8
    X[2, 0:2] = 1
    y[2] = 4

    # fit model
    xor_model = sklearn.tree.DecisionTreeRegressor(max_depth=2)
    xor_model.fit(X, y)

    return xor_model
lib/shap/benchmark/plots.py ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import os
4
+
5
+ import numpy as np
6
+ import sklearn
7
+ from matplotlib.colors import LinearSegmentedColormap
8
+
9
+ from .. import __version__
10
+ from ..plots import colors
11
+ from . import methods, metrics, models
12
+ from .experiments import run_experiments
13
+
14
+ try:
15
+ import matplotlib
16
+ import matplotlib.pyplot as pl
17
+ from IPython.display import HTML
18
+ except ImportError:
19
+ pass
20
+
21
+
22
# Per-metric display metadata (titles, axis labels, sort order). Every entry
# is commented out in the original source, so this dict is intentionally
# empty: display attributes are instead parsed from the metric and method
# docstrings at runtime (see get_metric_attr / get_method_color in this file).
metadata = {}
220
+
221
# Fixed hex colors for well-known attribution methods in benchmark plots.
benchmark_color_map = dict(
    tree_shap="#1E88E5",
    deep_shap="#1E88E5",
    linear_shap_corr="#1E88E5",
    linear_shap_ind="#ff0d57",
    coef="#13B755",
    random="#999999",
    const_random="#666666",
    kernel_shap_1000_meanref="#7C52FF",
)
231
+
232
+ # negated_metrics = [
233
+ # "runtime",
234
+ # "remove_positive_retrain",
235
+ # "remove_positive_mask",
236
+ # "remove_positive_resample",
237
+ # "keep_negative_retrain",
238
+ # "keep_negative_mask",
239
+ # "keep_negative_resample"
240
+ # ]
241
+
242
+ # one_minus_metrics = [
243
+ # "remove_absolute_mask__r2",
244
+ # "remove_absolute_mask__roc_auc",
245
+ # "remove_absolute_resample__r2",
246
+ # "remove_absolute_resample__roc_auc"
247
+ # ]
248
+
249
def get_method_color(method):
    """ Look up the plot color declared in an attribution method's docstring.

    Scans the docstring of `methods.<method>` for a line of the form
    ``color = <value>``, where the value is either a literal color string or
    a ``red_blue_circle(<float>)`` expression. Returns black if no color
    line is found.
    """
    for raw_line in getattr(methods, method).__doc__.split("\n"):
        stripped = raw_line.strip()
        if not stripped.startswith("color = "):
            continue
        value = stripped.split("=")[1].strip()
        if value.startswith("red_blue_circle("):
            # e.g. "red_blue_circle(0.5)" -> colors.red_blue_circle(0.5)
            return colors.red_blue_circle(float(value[16:-1]))
        return value
    return "#000000"
259
+
260
def get_method_linestyle(method):
    """ Look up the plot linestyle declared in a method's docstring.

    Scans the docstring of `methods.<method>` for ``linestyle = <value>``;
    defaults to "solid" when no such line exists.
    """
    for raw_line in getattr(methods, method).__doc__.split("\n"):
        stripped = raw_line.strip()
        if stripped.startswith("linestyle = "):
            return stripped.split("=")[1].strip()
    return "solid"
266
+
267
def get_metric_attr(metric, attr):
    """ Parse an attribute value out of a metric's docstring.

    Looks for a line of the form ``<attr> = "<string>"`` (returned as the
    quoted string) or ``<attr> = <number>`` (returned as a float). Returns
    an empty string when the attribute is not declared.
    """
    for raw_line in getattr(metrics, metric).__doc__.split("\n"):
        stripped = raw_line.strip()

        # quoted string form: attr = "value"
        quoted_prefix = attr + " = \""
        if stripped.startswith(quoted_prefix) and stripped.endswith("\""):
            return stripped[len(quoted_prefix):-1]

        # bare numeric form: attr = 3.5
        bare_prefix = attr + " = "
        if stripped.startswith(bare_prefix):
            return float(stripped[len(bare_prefix):])
    return ""
282
+
283
def plot_curve(dataset, model, metric, cmap=benchmark_color_map):
    """ Plot one performance curve per attribution method for a single metric.

    Runs (or loads cached) experiments for the dataset/model/metric
    combination, applies the metric's declared score transform, and draws
    each method's curve, ranked and labeled by its normalized area under
    the curve.

    Parameters
    ----------
    dataset : str
        Dataset name; `dataset + "__" + model` must name a builder in `models`.
    model : str
        Model identifier within the dataset.
    metric : str
        Name of a metric defined in the benchmark `metrics` module.
    cmap : dict
        Unused; kept for a backwards-compatible signature.

    Returns
    -------
    matplotlib.figure.Figure
    """
    experiments = run_experiments(dataset=dataset, model=model, metric=metric)
    pl.figure()

    # Fix: the transform depends only on `metric`, so look it up once
    # instead of re-parsing the metric docstring on every loop iteration.
    transform = get_metric_attr(metric, "transform")

    method_arr = []
    for (name, (fcounts, scores)) in experiments:
        _, _, method, _ = name
        if transform == "negate":
            scores = -scores
        elif transform == "one_minus":
            scores = 1 - scores
        # normalized area under the curve, used to rank methods in the legend
        auc = sklearn.metrics.auc(fcounts, scores) / fcounts[-1]
        method_arr.append((auc, method, scores))
    for (auc, method, scores) in sorted(method_arr):
        method_title = getattr(methods, method).__doc__.split("\n")[0].strip()
        label = f"{auc:6.3f} - " + method_title
        # NOTE(review): `fcounts` here is the value left over from the last
        # experiment -- assumes all experiments share one fcounts grid.
        pl.plot(
            fcounts / fcounts[-1], scores, label=label,
            color=get_method_color(method), linewidth=2,
            linestyle=get_method_linestyle(method)
        )
    metric_title = getattr(metrics, metric).__doc__.split("\n")[0].strip()
    pl.xlabel(get_metric_attr(metric, "xlabel"))
    pl.ylabel(get_metric_attr(metric, "ylabel"))
    model_title = getattr(models, dataset+"__"+model).__doc__.split("\n")[0].strip()
    pl.title(metric_title + " - " + model_title)
    pl.gca().xaxis.set_ticks_position('bottom')
    pl.gca().yaxis.set_ticks_position('left')
    pl.gca().spines['right'].set_visible(False)
    pl.gca().spines['top'].set_visible(False)
    ahandles, alabels = pl.gca().get_legend_handles_labels()
    # reversed so the best-scoring (last-plotted) method appears first
    pl.legend(reversed(ahandles), reversed(alabels))
    return pl.gcf()
316
+
317
def plot_human(dataset, model, metric, cmap=benchmark_color_map):
    """ Plot a grouped bar chart comparing each method's attributions to human consensus.

    The first bar in each group is the human consensus attribution
    (``scores[0]`` of the first experiment); each method then contributes one
    bar per feature, labeled with its total absolute difference from the
    consensus. ``cmap`` is currently unused but kept in the signature.
    Returns the matplotlib figure.
    """
    experiments = run_experiments(dataset=dataset, model=model, metric=metric)
    pl.figure()
    method_arr = []
    for (name,(fcounts,scores)) in experiments:
        _,_,method,_ = name
        # total L1 distance between the method's attributions (scores[1])
        # and the human consensus (scores[0]); used to sort the bars
        diff_sum = np.sum(np.abs(scores[1] - scores[0]))
        method_arr.append((diff_sum, method, scores[0], scores[1]))

    # three features are hard-coded to match the human-agreement scenarios
    # (minor tick labels below: Fever, Cough, Headache)
    inds = np.arange(3) # the x locations for the groups
    inc_width = (1.0 / len(method_arr)) * 0.8
    width = inc_width * 0.9
    pl.bar(inds, method_arr[0][2], width, label="Human Consensus", color="black", edgecolor="white")
    i = 1
    # dashed/dotted line styles become hatch patterns on the bars
    line_style_to_hatch = {
        "dashed": "///",
        "dotted": "..."
    }
    for (diff_sum, method, _, methods_attrs) in sorted(method_arr):
        method_title = getattr(methods, method).__doc__.split("\n")[0].strip()
        label = f"{diff_sum:.2f} - " + method_title
        pl.bar(
            inds + inc_width * i, methods_attrs.flatten(), width, label=label, edgecolor="white",
            color=get_method_color(method), hatch=line_style_to_hatch.get(get_method_linestyle(method), None)
        )
        i += 1
    metric_title = getattr(metrics, metric).__doc__.split("\n")[0].strip()
    pl.xlabel("Features in the model")
    pl.ylabel("Feature attribution value")
    model_title = getattr(models, dataset+"__"+model).__doc__.split("\n")[0].strip()
    pl.title(metric_title + " - " + model_title)
    pl.gca().xaxis.set_ticks_position('bottom')
    pl.gca().yaxis.set_ticks_position('left')
    pl.gca().spines['right'].set_visible(False)
    pl.gca().spines['top'].set_visible(False)
    ahandles, alabels = pl.gca().get_legend_handles_labels()
    #pl.legend(ahandles, alabels)
    # hide the major tick labels; feature names are drawn as minor ticks
    pl.xticks(np.array([0, 1, 2, 3]) - (inc_width + width)/2, ["", "", "", ""])

    pl.gca().xaxis.set_minor_locator(matplotlib.ticker.FixedLocator([0.4, 1.4, 2.4]))
    pl.gca().xaxis.set_minor_formatter(matplotlib.ticker.FixedFormatter(["Fever", "Cough", "Headache"]))
    pl.gca().tick_params(which='minor', length=0)

    pl.axhline(0, color="#aaaaaa", linewidth=0.5)

    # shrink the axes upward to make room for the legend below
    box = pl.gca().get_position()
    pl.gca().set_position([
        box.x0, box.y0 + box.height * 0.3,
        box.width, box.height * 0.7
    ])

    # Put a legend below current axis
    pl.gca().legend(ahandles, alabels, loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)

    return pl.gcf()
372
+
373
+ def _human_score_map(human_consensus, methods_attrs):
374
+ """ Converts human agreement differences to numerical scores for coloring.
375
+ """
376
+
377
+ v = 1 - min(np.sum(np.abs(methods_attrs - human_consensus)) / (np.abs(human_consensus).sum() + 1), 1.0)
378
+ return v
379
+
380
def make_grid(scores, dataset, model, normalize=True, transform=True):
    """ Assemble per-method, per-metric scores into a 2D grid for display.

    Filters `scores` to the given dataset/model, optionally applies each
    metric's declared transform, collapses curve results to a normalized AUC,
    optionally min-max normalizes each metric column, and sorts methods by
    mean score (best first).

    Returns (row_keys, col_keys, data): method names, metric names, and the
    score matrix with data[i, j] = score of method i on metric j.
    """
    color_vals = {}
    metric_sort_order = {}
    for (_,_,method,metric),(fcounts,score) in filter(lambda x: x[0][0] == dataset and x[0][1] == model, scores):
        metric_sort_order[metric] = get_metric_attr(metric, "sort_order")
        if metric not in color_vals:
            color_vals[metric] = {}

        if transform:
            # higher-is-better normalization declared per metric
            transform_type = get_metric_attr(metric, "transform")
            if transform_type == "negate":
                score = -score
            elif transform_type == "one_minus":
                score = 1 - score
            elif transform_type == "negate_log":
                score = -np.log10(score)

        if fcounts is None:
            # scalar metric: use the score directly
            color_vals[metric][method] = score
        elif fcounts == "human":
            # human-agreement metric: score is (human_consensus, method_attrs)
            color_vals[metric][method] = _human_score_map(*score)
        else:
            # curve metric: collapse to area under the curve, normalized
            auc = sklearn.metrics.auc(fcounts, score) / fcounts[-1]
            color_vals[metric][method] = auc
    # print(metric_sort_order)
    # col_keys = sorted(list(color_vals.keys()), key=lambda v: metric_sort_order[v])
    # print(col_keys)
    col_keys = list(color_vals.keys())
    row_keys = list({v for k in col_keys for v in color_vals[k].keys()})

    # sentinel fill value so missing (method, metric) pairs can be detected
    data = -28567 * np.ones((len(row_keys), len(col_keys)))

    for i in range(len(row_keys)):
        for j in range(len(col_keys)):
            data[i,j] = color_vals[col_keys[j]][row_keys[i]]

    assert np.sum(data == -28567) == 0, "There are missing data values!"

    if normalize:
        # min-max normalize each metric column (epsilon avoids 0/0)
        data = (data - data.min(0)) / (data.max(0) - data.min(0) + 1e-8)

    # sort by performance (mean score across metrics), best first
    inds = np.argsort(-data.mean(1))
    row_keys = [row_keys[i] for i in inds]
    data = data[inds,:]

    return row_keys, col_keys, data
427
+
428
+
429
+
430
# Fully opaque two-point colormap: 0.0 maps to a red (198, 34, 5) and
# 1.0 maps to a green (5, 198, 24).
red_blue_solid = LinearSegmentedColormap('red_blue_solid', {
    'red': ((0.0, 198./255, 198./255),
            (1.0, 5./255, 5./255)),

    'green': ((0.0, 34./255, 34./255),
              (1.0, 198./255, 198./255)),

    'blue': ((0.0, 5./255, 5./255),
             (1.0, 24./255, 24./255)),

    'alpha': ((0.0, 1, 1),
              (1.0, 1, 1))
})
443
def plot_grids(dataset, model_names, out_dir=None):
    """ Render the benchmark results as an interactive HTML grid of scores.

    Runs all experiments for every model in `model_names` on `dataset`,
    builds one score grid per model via make_grid, and emits an HTML page in
    which each cell opens a base64-embedded detail plot. If `out_dir` is
    given, the page (index.html) and PDF copies of the plots are written
    there (the directory must not already exist, since os.mkdir raises
    otherwise); otherwise an IPython HTML object is returned for notebooks.
    """

    if out_dir is not None:
        os.mkdir(out_dir)

    # gather experiment scores for every requested model
    scores = []
    for model in model_names:
        scores.extend(run_experiments(dataset=dataset, model=model))

    # `prefix` collects CSS and the hidden overlay divs for the detail
    # plots; `out` collects the visible page body
    prefix = "<style type='text/css'> .shap_benchmark__select:focus { outline-width: 0 }</style>"
    out = "" # background: rgb(30, 136, 229)

    # out += "<div style='font-weight: regular; font-size: 24px; text-align: center; background: #f8f8f8; color: #000; padding: 20px;'>SHAP Benchmark</div>\n"
    # out += "<div style='height: 1px; background: #ddd;'></div>\n"
    #out += "<div style='height: 7px; background-image: linear-gradient(to right, rgb(30, 136, 229), rgb(255, 13, 87));'></div>"

    # fixed header band holding the rotated metric column titles
    out += "<div style='position: fixed; left: 0px; top: 0px; right: 0px; height: 230px; background: #fff;'>\n" # box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2), 0 6px 20px 0 rgba(0, 0, 0, 0.19);
    out += "<div style='position: absolute; bottom: 0px; left: 0px; right: 0px;' align='center'><table style='border-width: 1px; margin-right: 100px'>\n"
    for ind,model in enumerate(model_names):
        row_keys, col_keys, data = make_grid(scores, dataset, model)
        # print(data)
        # print(colors.red_blue_solid(0.))
        # print(colors.red_blue_solid(1.))
        # return
        # render one detail plot per metric and embed it as a hidden overlay
        for metric in col_keys:
            save_plot = False
            if metric.startswith("human_"):
                plot_human(dataset, model, metric)
                save_plot = True
            elif metric not in ["local_accuracy", "runtime", "consistency_guarantees"]:
                plot_curve(dataset, model, metric)
                save_plot = True

            if save_plot:
                buf = io.BytesIO()
                pl.gcf().set_size_inches(1200.0/175,1000.0/175)
                pl.savefig(buf, format='png', dpi=175)
                if out_dir is not None:
                    pl.savefig(f"{out_dir}/plot_{dataset}_{model}_{metric}.pdf", format='pdf')
                pl.close()
                buf.seek(0)
                data_uri = base64.b64encode(buf.read()).decode('utf-8').replace('\n', '')
                plot_id = "plot__"+dataset+"__"+model+"__"+metric
                # clicking the overlay hides it again
                prefix += f"<div onclick='document.getElementById(\"{plot_id}\").style.display = \"none\"' style='display: none; position: fixed; z-index: 10000; left: 0px; right: 0px; top: 0px; bottom: 0px; background: rgba(255,255,255,0.9);' id='{plot_id}'>"
                prefix += "<img width='600' height='500' style='margin-left: auto; margin-right: auto; margin-top: 230px; box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2), 0 6px 20px 0 rgba(0, 0, 0, 0.19);' src='data:image/png;base64,%s'>" % data_uri
                prefix += "</div>"

        model_title = getattr(models, dataset+"__"+model).__doc__.split("\n")[0].strip()

        # the rotated metric titles are emitted only once, for the first model
        if ind == 0:
            out += "<tr><td style='background: #fff; width: 250px'></td></td>"
            for j in range(data.shape[1]):
                metric_title = getattr(metrics, col_keys[j]).__doc__.split("\n")[0].strip()
                out += "<td style='width: 40px; min-width: 40px; background: #fff; text-align: right;'><div style='margin-left: 10px; margin-bottom: -5px; white-space: nowrap; transform: rotate(-45deg); transform-origin: left top 0; width: 1.5em; margin-top: 8em'>" + metric_title + "</div></td>"
            out += "</tr>\n"
            out += "</table></div></div>\n"
        # one table per model: method rows x metric columns of score cells
        out += "<table style='border-width: 1px; margin-right: 100px; margin-top: 230px;'>\n"
        out += "<tr><td style='background: #fff'></td><td colspan='%d' style='background: #fff; font-weight: bold; text-align: center; margin-top: 10px;'>%s</td></tr>\n" % (data.shape[1], model_title)
        for i in range(data.shape[0]):
            out += "<tr>"
            # if i == 0:
            #     out += "<td rowspan='%d' style='background: #fff; text-align: center; white-space: nowrap; vertical-align: middle; '><div style='font-weight: bold; transform: rotate(-90deg); transform-origin: left top 0; width: 1.5em; margin-top: 8em'>%s</div></td>" % (data.shape[0], model_name)
            method_title = getattr(methods, row_keys[i]).__doc__.split("\n")[0].strip()
            out += "<td style='background: #ffffff; text-align: right; width: 250px' title='shap.LinearExplainer(model)'>" + method_title + "</td>\n"
            for j in range(data.shape[1]):
                plot_id = "plot__"+dataset+"__"+model+"__"+col_keys[j]
                # clicking a cell shows the corresponding detail-plot overlay
                out += "<td onclick='document.getElementById(\"%s\").style.display = \"block\"' style='padding: 0px; padding-left: 0px; padding-right: 0px; border-left: 0px solid #999; width: 42px; min-width: 42px; height: 34px; background-color: #fff'>" % plot_id
                #out += "<div style='opacity: "+str(2*(max(1-data[i,j], data[i,j])-0.5))+"; background-color: rgb" + str(tuple(v*255 for v in colors.red_blue_solid(0. if data[i,j] < 0.5 else 1.)[:-1])) + "; height: "+str((30*max(1-data[i,j], data[i,j])))+"px; margin-left: auto; margin-right: auto; width:"+str((30*max(1-data[i,j], data[i,j])))+"px'></div>"
                # square whose size and color encode the normalized score
                out += "<div style='opacity: "+str(1)+"; background-color: rgb" + str(tuple(int(v*255) for v in colors.red_blue_no_bounds(5*(data[i,j]-0.8))[:-1])) + "; height: "+str(30*data[i,j])+"px; margin-left: auto; margin-right: auto; width:"+str(30*data[i,j])+"px'></div>"
                #out += "<div style='float: left; background-color: #eee; height: 10px; width: "+str((40*(1-data[i,j])))+"px'></div>"
                out += "</td>\n"
            out += "</tr>\n" #

        out += "<tr><td colspan='%d' style='background: #fff'></td></tr>" % (data.shape[1] + 1)
        out += "</table>"

    # fixed top bar: page title plus a dataset selection drop-down
    out += "<div style='position: fixed; left: 0px; top: 0px; right: 0px; text-align: left; padding: 20px; text-align: right'>\n"
    out += "<div style='float: left; font-weight: regular; font-size: 24px; color: #000;'>SHAP Benchmark <span style='font-size: 14px; color: #777777;'>v"+__version__+"</span></div>\n"
    # select {
    #     margin: 50px;
    #     width: 150px;
    #     padding: 5px 35px 5px 5px;
    #     font-size: 16px;
    #     border: 1px solid #ccc;
    #     height: 34px;
    #     -webkit-appearance: none;
    #     -moz-appearance: none;
    #     appearance: none;
    #     background: url(http://www.stackoverflow.com/favicon.ico) 96% / 15% no-repeat #eee;
    # }
    #out += "<div style='display: inline-block; margin-right: 20px; font-weight: normal; text-decoration: none; font-size: 18px; color: #000;'>Dataset:</div>\n"

    out += "<select id='shap_benchmark__select' onchange=\"document.location = '../' + this.value + '/index.html'\"dir='rtl' class='shap_benchmark__select' style='font-weight: normal; font-size: 20px; color: #000; padding: 10px; background: #fff; border: 1px solid #fff; -webkit-appearance: none; appearance: none;'>\n"
    out += "<option value='human' "+("selected" if dataset == "human" else "")+">Agreement with Human Intuition</option>\n"
    out += "<option value='corrgroups60' "+("selected" if dataset == "corrgroups60" else "")+">Correlated Groups 60 Dataset</option>\n"
    out += "<option value='independentlinear60' "+("selected" if dataset == "independentlinear60" else "")+">Independent Linear 60 Dataset</option>\n"
    #out += "<option>CRIC</option>\n"
    out += "</select>\n"
    #out += "<script> document.onload = function() { document.getElementById('shap_benchmark__select').value = '"+dataset+"'; }</script>"
    #out += "<div style='display: inline-block; margin-left: 20px; font-weight: normal; text-decoration: none; font-size: 18px; color: #000;'>CRIC</div>\n"
    out += "</div>\n"

    # output the legend
    out += "<table style='border-width: 0px; width: 100px; position: fixed; right: 50px; top: 200px; background: rgba(255, 255, 255, 0.9)'>\n"
    out += "<tr><td style='background: #fff; font-weight: normal; text-align: center'>Higher score</td></tr>\n"
    legend_size = 21
    for i in range(legend_size-9):
        out += "<tr>"
        out += "<td style='padding: 0px; padding-left: 0px; padding-right: 0px; border-left: 0px solid #999; height: 34px'>"
        val = (legend_size-i-1) / (legend_size-1)
        out += "<div style='opacity: 1; background-color: rgb" + str(tuple(int(v*255) for v in colors.red_blue_no_bounds(5*(val-0.8)))[:-1]) + "; height: "+str(30*val)+"px; margin-left: auto; margin-right: auto; width:"+str(30*val)+"px'></div>"
        out += "</td>"
        out += "</tr>\n" #
    out += "<tr><td style='background: #fff; font-weight: normal; text-align: center'>Lower score</td></tr>\n"
    out += "</table>\n"

    if out_dir is not None:
        with open(out_dir + "/index.html", "w") as f:
            f.write("<html><body style='margin: 0px; font-size: 16px; font-family: \"Myriad Pro\", Arial, sans-serif;'><center>")
            f.write(prefix)
            f.write(out)
            f.write("</center></body></html>")
    else:
        return HTML(prefix + out)
lib/shap/cext/_cext.cc ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
2
+
3
+ #include <Python.h>
4
+ #include <numpy/arrayobject.h>
5
+ #include "tree_shap.h"
6
+ #include <iostream>
7
+
8
/* Forward declarations for the module's Python-callable entry points. */
static PyObject *_cext_dense_tree_shap(PyObject *self, PyObject *args);
static PyObject *_cext_dense_tree_predict(PyObject *self, PyObject *args);
static PyObject *_cext_dense_tree_update_weights(PyObject *self, PyObject *args);
static PyObject *_cext_dense_tree_saabas(PyObject *self, PyObject *args);
static PyObject *_cext_compute_expectations(PyObject *self, PyObject *args);

/* Method table mapping Python-visible names to the C implementations.
   All entries take positional arguments only (METH_VARARGS); the list is
   terminated by the NULL sentinel entry required by CPython. */
static PyMethodDef module_methods[] = {
    {"dense_tree_shap", _cext_dense_tree_shap, METH_VARARGS, "C implementation of Tree SHAP for dense."},
    {"dense_tree_predict", _cext_dense_tree_predict, METH_VARARGS, "C implementation of tree predictions."},
    {"dense_tree_update_weights", _cext_dense_tree_update_weights, METH_VARARGS, "C implementation of tree node weight compuatations."},
    {"dense_tree_saabas", _cext_dense_tree_saabas, METH_VARARGS, "C implementation of Saabas (rough fast approximation to Tree SHAP)."},
    {"compute_expectations", _cext_compute_expectations, METH_VARARGS, "Compute expectations of internal nodes."},
    {NULL, NULL, 0, NULL}
};
22
+
23
#if PY_MAJOR_VERSION >= 3
/* Module definition used only on Python 3 (Python 2 uses Py_InitModule). */
static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "_cext",  /* module name */
    "This module provides an interface for a fast Tree SHAP implementation.",
    -1,       /* no per-interpreter module state; state kept in globals */
    module_methods,
    NULL,
    NULL,
    NULL,
    NULL
};
#endif
36
+
37
/* Module entry point: named PyInit__cext on Python 3, init_cext on Python 2. */
#if PY_MAJOR_VERSION >= 3
PyMODINIT_FUNC PyInit__cext(void)
#else
PyMODINIT_FUNC init_cext(void)
#endif
{
#if PY_MAJOR_VERSION >= 3
    PyObject *module = PyModule_Create(&moduledef);
    if (!module) return NULL;
#else
    PyObject *module = Py_InitModule("_cext", module_methods);
    if (!module) return;
#endif

    /* Load `numpy` functionality (must run before any PyArray_* call). */
    import_array();

#if PY_MAJOR_VERSION >= 3
    return module;
#endif
}
58
+
59
/* compute_expectations(children_left, children_right, node_sample_weight, values)
 *
 * Bridges Python arrays to the C++ compute_expectations() in tree_shap.h.
 * The `values` array is requested writable (NPY_ARRAY_INOUT_ARRAY) and is
 * updated in place; the function returns the tree's max depth as a Python int.
 */
static PyObject *_cext_compute_expectations(PyObject *self, PyObject *args)
{
    PyObject *children_left_obj;
    PyObject *children_right_obj;
    PyObject *node_sample_weight_obj;
    PyObject *values_obj;

    /* Parse the input tuple */
    if (!PyArg_ParseTuple(
        args, "OOOO", &children_left_obj, &children_right_obj, &node_sample_weight_obj, &values_obj
    )) return NULL;

    /* Interpret the input objects as numpy arrays. */
    PyArrayObject *children_left_array = (PyArrayObject*)PyArray_FROM_OTF(children_left_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *children_right_array = (PyArrayObject*)PyArray_FROM_OTF(children_right_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *node_sample_weight_array = (PyArrayObject*)PyArray_FROM_OTF(node_sample_weight_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *values_array = (PyArrayObject*)PyArray_FROM_OTF(values_obj, NPY_DOUBLE, NPY_ARRAY_INOUT_ARRAY);

    /* If that didn't work, throw an exception (XDECREF tolerates NULLs). */
    if (children_left_array == NULL || children_right_array == NULL ||
        values_array == NULL || node_sample_weight_array == NULL) {
        Py_XDECREF(children_left_array);
        Py_XDECREF(children_right_array);
        //PyArray_ResolveWritebackIfCopy(values_array);
        Py_XDECREF(values_array);
        Py_XDECREF(node_sample_weight_array);
        return NULL;
    }

    TreeEnsemble tree;

    // number of outputs
    tree.num_outputs = PyArray_DIM(values_array, 1);

    /* Get pointers to the data as C-types.
       NOTE(review): casting to tfloat* assumes tfloat (from tree_shap.h)
       is double, matching the NPY_DOUBLE conversion above -- confirm. */
    tree.children_left = (int*)PyArray_DATA(children_left_array);
    tree.children_right = (int*)PyArray_DATA(children_right_array);
    tree.values = (tfloat*)PyArray_DATA(values_array);
    tree.node_sample_weights = (tfloat*)PyArray_DATA(node_sample_weight_array);

    const int max_depth = compute_expectations(tree);

    // clean up the created python objects
    Py_XDECREF(children_left_array);
    Py_XDECREF(children_right_array);
    //PyArray_ResolveWritebackIfCopy(values_array);
    Py_XDECREF(values_array);
    Py_XDECREF(node_sample_weight_array);

    PyObject *ret = Py_BuildValue("i", max_depth);
    return ret;
}
111
+
112
+
113
// Python entry point for the CPU Tree SHAP implementation.  Converts the
// tree-ensemble arrays and the explanation dataset from Python objects into
// raw C buffers, runs dense_tree_shap, and writes the SHAP values into the
// caller-supplied out_contribs array in place.
static PyObject *_cext_dense_tree_shap(PyObject *self, PyObject *args)
{
    // Raw Python objects received from the caller (converted to arrays below).
    PyObject *children_left_obj;
    PyObject *children_right_obj;
    PyObject *children_default_obj;
    PyObject *features_obj;
    PyObject *thresholds_obj;
    PyObject *values_obj;
    PyObject *node_sample_weights_obj;
    int max_depth;
    PyObject *X_obj;
    PyObject *X_missing_obj;
    PyObject *y_obj;
    PyObject *R_obj;
    PyObject *R_missing_obj;
    int tree_limit;
    PyObject *out_contribs_obj;
    int feature_dependence;
    int model_output;
    PyObject *base_offset_obj;
    bool interactions;

    /* Parse the input tuple */
    // NOTE(review): the "b" format writes an unsigned char; storing it through
    // &interactions relies on sizeof(bool) == 1 on the target ABI — confirm.
    if (!PyArg_ParseTuple(
        args, "OOOOOOOiOOOOOiOOiib", &children_left_obj, &children_right_obj, &children_default_obj,
        &features_obj, &thresholds_obj, &values_obj, &node_sample_weights_obj,
        &max_depth, &X_obj, &X_missing_obj, &y_obj, &R_obj, &R_missing_obj, &tree_limit, &base_offset_obj,
        &out_contribs_obj, &feature_dependence, &model_output, &interactions
    )) return NULL;

    /* Interpret the input objects as numpy arrays (each conversion returns a
       new reference that must be released before returning). */
    PyArrayObject *children_left_array = (PyArrayObject*)PyArray_FROM_OTF(children_left_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *children_right_array = (PyArrayObject*)PyArray_FROM_OTF(children_right_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *children_default_array = (PyArrayObject*)PyArray_FROM_OTF(children_default_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *features_array = (PyArrayObject*)PyArray_FROM_OTF(features_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *thresholds_array = (PyArrayObject*)PyArray_FROM_OTF(thresholds_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *values_array = (PyArrayObject*)PyArray_FROM_OTF(values_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *node_sample_weights_array = (PyArrayObject*)PyArray_FROM_OTF(node_sample_weights_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *X_array = (PyArrayObject*)PyArray_FROM_OTF(X_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *X_missing_array = (PyArrayObject*)PyArray_FROM_OTF(X_missing_obj, NPY_BOOL, NPY_ARRAY_IN_ARRAY);
    // y (per-sample labels) and R/R_missing (background data) are optional.
    PyArrayObject *y_array = NULL;
    if (y_obj != Py_None) y_array = (PyArrayObject*)PyArray_FROM_OTF(y_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *R_array = NULL;
    if (R_obj != Py_None) R_array = (PyArrayObject*)PyArray_FROM_OTF(R_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *R_missing_array = NULL;
    if (R_missing_obj != Py_None) R_missing_array = (PyArrayObject*)PyArray_FROM_OTF(R_missing_obj, NPY_BOOL, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *out_contribs_array = (PyArrayObject*)PyArray_FROM_OTF(out_contribs_obj, NPY_DOUBLE, NPY_ARRAY_INOUT_ARRAY);
    PyArrayObject *base_offset_array = (PyArrayObject*)PyArray_FROM_OTF(base_offset_obj, NPY_DOUBLE, NPY_ARRAY_INOUT_ARRAY);

    /* If that didn't work, throw an exception. Note that R and y are optional. */
    if (children_left_array == NULL || children_right_array == NULL ||
        children_default_array == NULL || features_array == NULL || thresholds_array == NULL ||
        values_array == NULL || node_sample_weights_array == NULL || X_array == NULL ||
        X_missing_array == NULL || out_contribs_array == NULL) {
        Py_XDECREF(children_left_array);
        Py_XDECREF(children_right_array);
        Py_XDECREF(children_default_array);
        Py_XDECREF(features_array);
        Py_XDECREF(thresholds_array);
        Py_XDECREF(values_array);
        Py_XDECREF(node_sample_weights_array);
        Py_XDECREF(X_array);
        Py_XDECREF(X_missing_array);
        if (y_array != NULL) Py_XDECREF(y_array);
        if (R_array != NULL) Py_XDECREF(R_array);
        if (R_missing_array != NULL) Py_XDECREF(R_missing_array);
        // NOTE(review): PyArray_ResolveWritebackIfCopy is commented out here;
        // correctness relies on out_contribs already being a well-behaved
        // array so no writeback copy is made — confirm against callers.
        //PyArray_ResolveWritebackIfCopy(out_contribs_array);
        Py_XDECREF(out_contribs_array);
        Py_XDECREF(base_offset_array);
        return NULL;
    }

    // Problem dimensions: samples, features, nodes per tree, model outputs.
    const unsigned num_X = PyArray_DIM(X_array, 0);
    const unsigned M = PyArray_DIM(X_array, 1);
    const unsigned max_nodes = PyArray_DIM(values_array, 1);
    const unsigned num_outputs = PyArray_DIM(values_array, 2);
    unsigned num_R = 0;
    if (R_array != NULL) num_R = PyArray_DIM(R_array, 0);

    // Get pointers to the data as C-types
    int *children_left = (int*)PyArray_DATA(children_left_array);
    int *children_right = (int*)PyArray_DATA(children_right_array);
    int *children_default = (int*)PyArray_DATA(children_default_array);
    int *features = (int*)PyArray_DATA(features_array);
    tfloat *thresholds = (tfloat*)PyArray_DATA(thresholds_array);
    tfloat *values = (tfloat*)PyArray_DATA(values_array);
    tfloat *node_sample_weights = (tfloat*)PyArray_DATA(node_sample_weights_array);
    tfloat *X = (tfloat*)PyArray_DATA(X_array);
    bool *X_missing = (bool*)PyArray_DATA(X_missing_array);
    tfloat *y = NULL;
    if (y_array != NULL) y = (tfloat*)PyArray_DATA(y_array);
    tfloat *R = NULL;
    if (R_array != NULL) R = (tfloat*)PyArray_DATA(R_array);
    bool *R_missing = NULL;
    if (R_missing_array != NULL) R_missing = (bool*)PyArray_DATA(R_missing_array);
    tfloat *out_contribs = (tfloat*)PyArray_DATA(out_contribs_array);
    tfloat *base_offset = (tfloat*)PyArray_DATA(base_offset_array);

    // these are just a wrapper objects for all the pointers and numbers associated with
    // the ensemble tree model and the dataset we are explaining
    TreeEnsemble trees = TreeEnsemble(
        children_left, children_right, children_default, features, thresholds, values,
        node_sample_weights, max_depth, tree_limit, base_offset,
        max_nodes, num_outputs
    );
    ExplanationDataset data = ExplanationDataset(X, X_missing, y, R, R_missing, num_X, M, num_R);

    dense_tree_shap(trees, data, out_contribs, feature_dependence, model_output, interactions);

    // retrieve return value before python cleanup of objects
    // (values points into values_array, which is released just below)
    tfloat ret_value = (double)values[0];

    // clean up the created python objects
    Py_XDECREF(children_left_array);
    Py_XDECREF(children_right_array);
    Py_XDECREF(children_default_array);
    Py_XDECREF(features_array);
    Py_XDECREF(thresholds_array);
    Py_XDECREF(values_array);
    Py_XDECREF(node_sample_weights_array);
    Py_XDECREF(X_array);
    Py_XDECREF(X_missing_array);
    if (y_array != NULL) Py_XDECREF(y_array);
    if (R_array != NULL) Py_XDECREF(R_array);
    if (R_missing_array != NULL) Py_XDECREF(R_missing_array);
    //PyArray_ResolveWritebackIfCopy(out_contribs_array);
    Py_XDECREF(out_contribs_array);
    Py_XDECREF(base_offset_array);

    /* Build the output tuple */
    PyObject *ret = Py_BuildValue("d", ret_value);
    return ret;
}
246
+
247
+
248
+ static PyObject *_cext_dense_tree_predict(PyObject *self, PyObject *args)
249
+ {
250
+ PyObject *children_left_obj;
251
+ PyObject *children_right_obj;
252
+ PyObject *children_default_obj;
253
+ PyObject *features_obj;
254
+ PyObject *thresholds_obj;
255
+ PyObject *values_obj;
256
+ int max_depth;
257
+ int tree_limit;
258
+ PyObject *base_offset_obj;
259
+ int model_output;
260
+ PyObject *X_obj;
261
+ PyObject *X_missing_obj;
262
+ PyObject *y_obj;
263
+ PyObject *out_pred_obj;
264
+
265
+ /* Parse the input tuple */
266
+ if (!PyArg_ParseTuple(
267
+ args, "OOOOOOiiOiOOOO", &children_left_obj, &children_right_obj, &children_default_obj,
268
+ &features_obj, &thresholds_obj, &values_obj, &max_depth, &tree_limit, &base_offset_obj, &model_output,
269
+ &X_obj, &X_missing_obj, &y_obj, &out_pred_obj
270
+ )) return NULL;
271
+
272
+ /* Interpret the input objects as numpy arrays. */
273
+ PyArrayObject *children_left_array = (PyArrayObject*)PyArray_FROM_OTF(children_left_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
274
+ PyArrayObject *children_right_array = (PyArrayObject*)PyArray_FROM_OTF(children_right_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
275
+ PyArrayObject *children_default_array = (PyArrayObject*)PyArray_FROM_OTF(children_default_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
276
+ PyArrayObject *features_array = (PyArrayObject*)PyArray_FROM_OTF(features_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
277
+ PyArrayObject *thresholds_array = (PyArrayObject*)PyArray_FROM_OTF(thresholds_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
278
+ PyArrayObject *values_array = (PyArrayObject*)PyArray_FROM_OTF(values_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
279
+ PyArrayObject *base_offset_array = (PyArrayObject*)PyArray_FROM_OTF(base_offset_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
280
+ PyArrayObject *X_array = (PyArrayObject*)PyArray_FROM_OTF(X_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
281
+ PyArrayObject *X_missing_array = (PyArrayObject*)PyArray_FROM_OTF(X_missing_obj, NPY_BOOL, NPY_ARRAY_IN_ARRAY);
282
+ PyArrayObject *y_array = NULL;
283
+ if (y_obj != Py_None) y_array = (PyArrayObject*)PyArray_FROM_OTF(y_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
284
+ PyArrayObject *out_pred_array = (PyArrayObject*)PyArray_FROM_OTF(out_pred_obj, NPY_DOUBLE, NPY_ARRAY_INOUT_ARRAY);
285
+
286
+ /* If that didn't work, throw an exception. Note that R and y are optional. */
287
+ if (children_left_array == NULL || children_right_array == NULL ||
288
+ children_default_array == NULL || features_array == NULL || thresholds_array == NULL ||
289
+ values_array == NULL || X_array == NULL ||
290
+ X_missing_array == NULL || out_pred_array == NULL) {
291
+ Py_XDECREF(children_left_array);
292
+ Py_XDECREF(children_right_array);
293
+ Py_XDECREF(children_default_array);
294
+ Py_XDECREF(features_array);
295
+ Py_XDECREF(thresholds_array);
296
+ Py_XDECREF(values_array);
297
+ Py_XDECREF(base_offset_array);
298
+ Py_XDECREF(X_array);
299
+ Py_XDECREF(X_missing_array);
300
+ if (y_array != NULL) Py_XDECREF(y_array);
301
+ //PyArray_ResolveWritebackIfCopy(out_pred_array);
302
+ Py_XDECREF(out_pred_array);
303
+ return NULL;
304
+ }
305
+
306
+ const unsigned num_X = PyArray_DIM(X_array, 0);
307
+ const unsigned M = PyArray_DIM(X_array, 1);
308
+ const unsigned max_nodes = PyArray_DIM(values_array, 1);
309
+ const unsigned num_outputs = PyArray_DIM(values_array, 2);
310
+
311
+ const unsigned num_offsets = PyArray_DIM(base_offset_array, 0);
312
+ if (num_offsets != num_outputs) {
313
+ std::cerr << "The passed base_offset array does that have the same number of outputs as the values array: " << num_offsets << " vs. " << num_outputs << std::endl;
314
+ return NULL;
315
+ }
316
+
317
+ // Get pointers to the data as C-types
318
+ int *children_left = (int*)PyArray_DATA(children_left_array);
319
+ int *children_right = (int*)PyArray_DATA(children_right_array);
320
+ int *children_default = (int*)PyArray_DATA(children_default_array);
321
+ int *features = (int*)PyArray_DATA(features_array);
322
+ tfloat *thresholds = (tfloat*)PyArray_DATA(thresholds_array);
323
+ tfloat *values = (tfloat*)PyArray_DATA(values_array);
324
+ tfloat *base_offset = (tfloat*)PyArray_DATA(base_offset_array);
325
+ tfloat *X = (tfloat*)PyArray_DATA(X_array);
326
+ bool *X_missing = (bool*)PyArray_DATA(X_missing_array);
327
+ tfloat *y = NULL;
328
+ if (y_array != NULL) y = (tfloat*)PyArray_DATA(y_array);
329
+ tfloat *out_pred = (tfloat*)PyArray_DATA(out_pred_array);
330
+
331
+ // these are just wrapper objects for all the pointers and numbers associated with
332
+ // the ensemble tree model and the dataset we are explaining
333
+ TreeEnsemble trees = TreeEnsemble(
334
+ children_left, children_right, children_default, features, thresholds, values,
335
+ NULL, max_depth, tree_limit, base_offset,
336
+ max_nodes, num_outputs
337
+ );
338
+ ExplanationDataset data = ExplanationDataset(X, X_missing, y, NULL, NULL, num_X, M, 0);
339
+
340
+ dense_tree_predict(out_pred, trees, data, model_output);
341
+
342
+ // clean up the created python objects
343
+ Py_XDECREF(children_left_array);
344
+ Py_XDECREF(children_right_array);
345
+ Py_XDECREF(children_default_array);
346
+ Py_XDECREF(features_array);
347
+ Py_XDECREF(thresholds_array);
348
+ Py_XDECREF(values_array);
349
+ Py_XDECREF(base_offset_array);
350
+ Py_XDECREF(X_array);
351
+ Py_XDECREF(X_missing_array);
352
+ if (y_array != NULL) Py_XDECREF(y_array);
353
+ //PyArray_ResolveWritebackIfCopy(out_pred_array);
354
+ Py_XDECREF(out_pred_array);
355
+
356
+ /* Build the output tuple */
357
+ PyObject *ret = Py_BuildValue("d", (double)values[0]);
358
+ return ret;
359
+ }
360
+
361
+
362
+ static PyObject *_cext_dense_tree_update_weights(PyObject *self, PyObject *args)
363
+ {
364
+ PyObject *children_left_obj;
365
+ PyObject *children_right_obj;
366
+ PyObject *children_default_obj;
367
+ PyObject *features_obj;
368
+ PyObject *thresholds_obj;
369
+ PyObject *values_obj;
370
+ int tree_limit;
371
+ PyObject *node_sample_weight_obj;
372
+ PyObject *X_obj;
373
+ PyObject *X_missing_obj;
374
+
375
+ /* Parse the input tuple */
376
+ if (!PyArg_ParseTuple(
377
+ args, "OOOOOOiOOO", &children_left_obj, &children_right_obj, &children_default_obj,
378
+ &features_obj, &thresholds_obj, &values_obj, &tree_limit, &node_sample_weight_obj, &X_obj, &X_missing_obj
379
+ )) return NULL;
380
+
381
+ /* Interpret the input objects as numpy arrays. */
382
+ PyArrayObject *children_left_array = (PyArrayObject*)PyArray_FROM_OTF(children_left_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
383
+ PyArrayObject *children_right_array = (PyArrayObject*)PyArray_FROM_OTF(children_right_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
384
+ PyArrayObject *children_default_array = (PyArrayObject*)PyArray_FROM_OTF(children_default_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
385
+ PyArrayObject *features_array = (PyArrayObject*)PyArray_FROM_OTF(features_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
386
+ PyArrayObject *thresholds_array = (PyArrayObject*)PyArray_FROM_OTF(thresholds_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
387
+ PyArrayObject *values_array = (PyArrayObject*)PyArray_FROM_OTF(values_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
388
+ PyArrayObject *node_sample_weight_array = (PyArrayObject*)PyArray_FROM_OTF(node_sample_weight_obj, NPY_DOUBLE, NPY_ARRAY_INOUT_ARRAY);
389
+ PyArrayObject *X_array = (PyArrayObject*)PyArray_FROM_OTF(X_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
390
+ PyArrayObject *X_missing_array = (PyArrayObject*)PyArray_FROM_OTF(X_missing_obj, NPY_BOOL, NPY_ARRAY_IN_ARRAY);
391
+
392
+ /* If that didn't work, throw an exception. */
393
+ if (children_left_array == NULL || children_right_array == NULL ||
394
+ children_default_array == NULL || features_array == NULL || thresholds_array == NULL ||
395
+ values_array == NULL || node_sample_weight_array == NULL || X_array == NULL ||
396
+ X_missing_array == NULL) {
397
+ Py_XDECREF(children_left_array);
398
+ Py_XDECREF(children_right_array);
399
+ Py_XDECREF(children_default_array);
400
+ Py_XDECREF(features_array);
401
+ Py_XDECREF(thresholds_array);
402
+ Py_XDECREF(values_array);
403
+ //PyArray_ResolveWritebackIfCopy(node_sample_weight_array);
404
+ Py_XDECREF(node_sample_weight_array);
405
+ Py_XDECREF(X_array);
406
+ Py_XDECREF(X_missing_array);
407
+ std::cerr << "Found a NULL input array in _cext_dense_tree_update_weights!\n";
408
+ return NULL;
409
+ }
410
+
411
+ const unsigned num_X = PyArray_DIM(X_array, 0);
412
+ const unsigned M = PyArray_DIM(X_array, 1);
413
+ const unsigned max_nodes = PyArray_DIM(values_array, 1);
414
+
415
+ // Get pointers to the data as C-types
416
+ int *children_left = (int*)PyArray_DATA(children_left_array);
417
+ int *children_right = (int*)PyArray_DATA(children_right_array);
418
+ int *children_default = (int*)PyArray_DATA(children_default_array);
419
+ int *features = (int*)PyArray_DATA(features_array);
420
+ tfloat *thresholds = (tfloat*)PyArray_DATA(thresholds_array);
421
+ tfloat *values = (tfloat*)PyArray_DATA(values_array);
422
+ tfloat *node_sample_weight = (tfloat*)PyArray_DATA(node_sample_weight_array);
423
+ tfloat *X = (tfloat*)PyArray_DATA(X_array);
424
+ bool *X_missing = (bool*)PyArray_DATA(X_missing_array);
425
+
426
+ // these are just wrapper objects for all the pointers and numbers associated with
427
+ // the ensemble tree model and the dataset we are explaining
428
+ TreeEnsemble trees = TreeEnsemble(
429
+ children_left, children_right, children_default, features, thresholds, values,
430
+ node_sample_weight, 0, tree_limit, 0, max_nodes, 0
431
+ );
432
+ ExplanationDataset data = ExplanationDataset(X, X_missing, NULL, NULL, NULL, num_X, M, 0);
433
+
434
+ dense_tree_update_weights(trees, data);
435
+
436
+ // clean up the created python objects
437
+ Py_XDECREF(children_left_array);
438
+ Py_XDECREF(children_right_array);
439
+ Py_XDECREF(children_default_array);
440
+ Py_XDECREF(features_array);
441
+ Py_XDECREF(thresholds_array);
442
+ Py_XDECREF(values_array);
443
+ // PyArray_ResolveWritebackIfCopy(node_sample_weight_array);
444
+ Py_XDECREF(node_sample_weight_array);
445
+ Py_XDECREF(X_array);
446
+ Py_XDECREF(X_missing_array);
447
+
448
+ /* Build the output tuple */
449
+ PyObject *ret = Py_BuildValue("d", 1);
450
+ return ret;
451
+ }
452
+
453
+
454
+ static PyObject *_cext_dense_tree_saabas(PyObject *self, PyObject *args)
455
+ {
456
+ PyObject *children_left_obj;
457
+ PyObject *children_right_obj;
458
+ PyObject *children_default_obj;
459
+ PyObject *features_obj;
460
+ PyObject *thresholds_obj;
461
+ PyObject *values_obj;
462
+ int max_depth;
463
+ int tree_limit;
464
+ PyObject *base_offset_obj;
465
+ int model_output;
466
+ PyObject *X_obj;
467
+ PyObject *X_missing_obj;
468
+ PyObject *y_obj;
469
+ PyObject *out_pred_obj;
470
+
471
+
472
+ /* Parse the input tuple */
473
+ if (!PyArg_ParseTuple(
474
+ args, "OOOOOOiiOiOOOO", &children_left_obj, &children_right_obj, &children_default_obj,
475
+ &features_obj, &thresholds_obj, &values_obj, &max_depth, &tree_limit, &base_offset_obj, &model_output,
476
+ &X_obj, &X_missing_obj, &y_obj, &out_pred_obj
477
+ )) return NULL;
478
+
479
+ /* Interpret the input objects as numpy arrays. */
480
+ PyArrayObject *children_left_array = (PyArrayObject*)PyArray_FROM_OTF(children_left_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
481
+ PyArrayObject *children_right_array = (PyArrayObject*)PyArray_FROM_OTF(children_right_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
482
+ PyArrayObject *children_default_array = (PyArrayObject*)PyArray_FROM_OTF(children_default_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
483
+ PyArrayObject *features_array = (PyArrayObject*)PyArray_FROM_OTF(features_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
484
+ PyArrayObject *thresholds_array = (PyArrayObject*)PyArray_FROM_OTF(thresholds_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
485
+ PyArrayObject *values_array = (PyArrayObject*)PyArray_FROM_OTF(values_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
486
+ PyArrayObject *base_offset_array = (PyArrayObject*)PyArray_FROM_OTF(base_offset_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
487
+ PyArrayObject *X_array = (PyArrayObject*)PyArray_FROM_OTF(X_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
488
+ PyArrayObject *X_missing_array = (PyArrayObject*)PyArray_FROM_OTF(X_missing_obj, NPY_BOOL, NPY_ARRAY_IN_ARRAY);
489
+ PyArrayObject *y_array = NULL;
490
+ if (y_obj != Py_None) y_array = (PyArrayObject*)PyArray_FROM_OTF(y_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
491
+ PyArrayObject *out_pred_array = (PyArrayObject*)PyArray_FROM_OTF(out_pred_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
492
+
493
+ /* If that didn't work, throw an exception. Note that R and y are optional. */
494
+ if (children_left_array == NULL || children_right_array == NULL ||
495
+ children_default_array == NULL || features_array == NULL || thresholds_array == NULL ||
496
+ values_array == NULL || X_array == NULL ||
497
+ X_missing_array == NULL || out_pred_array == NULL) {
498
+ Py_XDECREF(children_left_array);
499
+ Py_XDECREF(children_right_array);
500
+ Py_XDECREF(children_default_array);
501
+ Py_XDECREF(features_array);
502
+ Py_XDECREF(thresholds_array);
503
+ Py_XDECREF(values_array);
504
+ Py_XDECREF(base_offset_array);
505
+ Py_XDECREF(X_array);
506
+ Py_XDECREF(X_missing_array);
507
+ if (y_array != NULL) Py_XDECREF(y_array);
508
+ //PyArray_ResolveWritebackIfCopy(out_pred_array);
509
+ Py_XDECREF(out_pred_array);
510
+ return NULL;
511
+ }
512
+
513
+ const unsigned num_X = PyArray_DIM(X_array, 0);
514
+ const unsigned M = PyArray_DIM(X_array, 1);
515
+ const unsigned max_nodes = PyArray_DIM(values_array, 1);
516
+ const unsigned num_outputs = PyArray_DIM(values_array, 2);
517
+
518
+ // Get pointers to the data as C-types
519
+ int *children_left = (int*)PyArray_DATA(children_left_array);
520
+ int *children_right = (int*)PyArray_DATA(children_right_array);
521
+ int *children_default = (int*)PyArray_DATA(children_default_array);
522
+ int *features = (int*)PyArray_DATA(features_array);
523
+ tfloat *thresholds = (tfloat*)PyArray_DATA(thresholds_array);
524
+ tfloat *values = (tfloat*)PyArray_DATA(values_array);
525
+ tfloat *base_offset = (tfloat*)PyArray_DATA(base_offset_array);
526
+ tfloat *X = (tfloat*)PyArray_DATA(X_array);
527
+ bool *X_missing = (bool*)PyArray_DATA(X_missing_array);
528
+ tfloat *y = NULL;
529
+ if (y_array != NULL) y = (tfloat*)PyArray_DATA(y_array);
530
+ tfloat *out_pred = (tfloat*)PyArray_DATA(out_pred_array);
531
+
532
+ // these are just wrapper objects for all the pointers and numbers associated with
533
+ // the ensemble tree model and the dataset we are explaining
534
+ TreeEnsemble trees = TreeEnsemble(
535
+ children_left, children_right, children_default, features, thresholds, values,
536
+ NULL, max_depth, tree_limit, base_offset,
537
+ max_nodes, num_outputs
538
+ );
539
+ ExplanationDataset data = ExplanationDataset(X, X_missing, y, NULL, NULL, num_X, M, 0);
540
+
541
+ dense_tree_saabas(out_pred, trees, data);
542
+
543
+ // clean up the created python objects
544
+ Py_XDECREF(children_left_array);
545
+ Py_XDECREF(children_right_array);
546
+ Py_XDECREF(children_default_array);
547
+ Py_XDECREF(features_array);
548
+ Py_XDECREF(thresholds_array);
549
+ Py_XDECREF(values_array);
550
+ Py_XDECREF(base_offset_array);
551
+ Py_XDECREF(X_array);
552
+ Py_XDECREF(X_missing_array);
553
+ if (y_array != NULL) Py_XDECREF(y_array);
554
+ //PyArray_ResolveWritebackIfCopy(out_pred_array);
555
+ Py_XDECREF(out_pred_array);
556
+
557
+ /* Build the output tuple */
558
+ PyObject *ret = Py_BuildValue("d", (double)values[0]);
559
+ return ret;
560
+ }
lib/shap/cext/_cext_gpu.cc ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
2
+
3
+ #include <Python.h>
4
+ #include <numpy/arrayobject.h>
5
+ #include "tree_shap.h"
6
+ #include <iostream>
7
+
8
+ static PyObject *_cext_dense_tree_shap(PyObject *self, PyObject *args);
9
+
10
+ static PyMethodDef module_methods[] = {
11
+ {"dense_tree_shap", _cext_dense_tree_shap, METH_VARARGS, "C implementation of Tree SHAP for dense."},
12
+ {NULL, NULL, 0, NULL}
13
+ };
14
+
15
+ #if PY_MAJOR_VERSION >= 3
16
+ static struct PyModuleDef moduledef = {
17
+ PyModuleDef_HEAD_INIT,
18
+ "_cext_gpu",
19
+ "This module provides an interface for a fast Tree SHAP implementation.",
20
+ -1,
21
+ module_methods,
22
+ NULL,
23
+ NULL,
24
+ NULL,
25
+ NULL
26
+ };
27
+ #endif
28
+
29
+ #if PY_MAJOR_VERSION >= 3
30
+ PyMODINIT_FUNC PyInit__cext_gpu(void)
31
+ #else
32
+ PyMODINIT_FUNC init_cext(void)
33
+ #endif
34
+ {
35
+ #if PY_MAJOR_VERSION >= 3
36
+ PyObject *module = PyModule_Create(&moduledef);
37
+ if (!module) return NULL;
38
+ #else
39
+ PyObject *module = Py_InitModule("_cext", module_methods);
40
+ if (!module) return;
41
+ #endif
42
+
43
+ /* Load `numpy` functionality. */
44
+ import_array();
45
+
46
+ #if PY_MAJOR_VERSION >= 3
47
+ return module;
48
+ #endif
49
+ }
50
+
51
// Implemented in _cext_gpu.cu; runs Tree SHAP on the GPU.
void dense_tree_shap_gpu(const TreeEnsemble& trees, const ExplanationDataset &data, tfloat *out_contribs,
                         const int feature_dependence, unsigned model_transform, bool interactions);

// Python entry point for the GPU Tree SHAP implementation.  Mirrors the CPU
// wrapper in _cext.cc: converts the tree-ensemble arrays and the explanation
// dataset into raw C buffers, calls dense_tree_shap_gpu, and writes the SHAP
// values into the caller-supplied out_contribs array in place.
static PyObject *_cext_dense_tree_shap(PyObject *self, PyObject *args)
{
    // Raw Python objects received from the caller (converted to arrays below).
    PyObject *children_left_obj;
    PyObject *children_right_obj;
    PyObject *children_default_obj;
    PyObject *features_obj;
    PyObject *thresholds_obj;
    PyObject *values_obj;
    PyObject *node_sample_weights_obj;
    int max_depth;
    PyObject *X_obj;
    PyObject *X_missing_obj;
    PyObject *y_obj;
    PyObject *R_obj;
    PyObject *R_missing_obj;
    int tree_limit;
    PyObject *out_contribs_obj;
    int feature_dependence;
    int model_output;
    PyObject *base_offset_obj;
    bool interactions;

    /* Parse the input tuple */
    // NOTE(review): the "b" format writes an unsigned char; storing it through
    // &interactions relies on sizeof(bool) == 1 on the target ABI — confirm.
    if (!PyArg_ParseTuple(
        args, "OOOOOOOiOOOOOiOOiib", &children_left_obj, &children_right_obj, &children_default_obj,
        &features_obj, &thresholds_obj, &values_obj, &node_sample_weights_obj,
        &max_depth, &X_obj, &X_missing_obj, &y_obj, &R_obj, &R_missing_obj, &tree_limit, &base_offset_obj,
        &out_contribs_obj, &feature_dependence, &model_output, &interactions
    )) return NULL;

    /* Interpret the input objects as numpy arrays (each conversion returns a
       new reference that must be released before returning). */
    PyArrayObject *children_left_array = (PyArrayObject*)PyArray_FROM_OTF(children_left_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *children_right_array = (PyArrayObject*)PyArray_FROM_OTF(children_right_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *children_default_array = (PyArrayObject*)PyArray_FROM_OTF(children_default_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *features_array = (PyArrayObject*)PyArray_FROM_OTF(features_obj, NPY_INT, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *thresholds_array = (PyArrayObject*)PyArray_FROM_OTF(thresholds_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *values_array = (PyArrayObject*)PyArray_FROM_OTF(values_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *node_sample_weights_array = (PyArrayObject*)PyArray_FROM_OTF(node_sample_weights_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *X_array = (PyArrayObject*)PyArray_FROM_OTF(X_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *X_missing_array = (PyArrayObject*)PyArray_FROM_OTF(X_missing_obj, NPY_BOOL, NPY_ARRAY_IN_ARRAY);
    // y (per-sample labels) and R/R_missing (background data) are optional.
    PyArrayObject *y_array = NULL;
    if (y_obj != Py_None) y_array = (PyArrayObject*)PyArray_FROM_OTF(y_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *R_array = NULL;
    if (R_obj != Py_None) R_array = (PyArrayObject*)PyArray_FROM_OTF(R_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *R_missing_array = NULL;
    if (R_missing_obj != Py_None) R_missing_array = (PyArrayObject*)PyArray_FROM_OTF(R_missing_obj, NPY_BOOL, NPY_ARRAY_IN_ARRAY);
    PyArrayObject *out_contribs_array = (PyArrayObject*)PyArray_FROM_OTF(out_contribs_obj, NPY_DOUBLE, NPY_ARRAY_INOUT_ARRAY);
    PyArrayObject *base_offset_array = (PyArrayObject*)PyArray_FROM_OTF(base_offset_obj, NPY_DOUBLE, NPY_ARRAY_INOUT_ARRAY);

    /* If that didn't work, throw an exception. Note that R and y are optional. */
    if (children_left_array == NULL || children_right_array == NULL ||
        children_default_array == NULL || features_array == NULL || thresholds_array == NULL ||
        values_array == NULL || node_sample_weights_array == NULL || X_array == NULL ||
        X_missing_array == NULL || out_contribs_array == NULL) {
        Py_XDECREF(children_left_array);
        Py_XDECREF(children_right_array);
        Py_XDECREF(children_default_array);
        Py_XDECREF(features_array);
        Py_XDECREF(thresholds_array);
        Py_XDECREF(values_array);
        Py_XDECREF(node_sample_weights_array);
        Py_XDECREF(X_array);
        Py_XDECREF(X_missing_array);
        if (y_array != NULL) Py_XDECREF(y_array);
        if (R_array != NULL) Py_XDECREF(R_array);
        if (R_missing_array != NULL) Py_XDECREF(R_missing_array);
        //PyArray_ResolveWritebackIfCopy(out_contribs_array);
        Py_XDECREF(out_contribs_array);
        Py_XDECREF(base_offset_array);
        return NULL;
    }

    // Problem dimensions: samples, features, nodes per tree, model outputs.
    const unsigned num_X = PyArray_DIM(X_array, 0);
    const unsigned M = PyArray_DIM(X_array, 1);
    const unsigned max_nodes = PyArray_DIM(values_array, 1);
    const unsigned num_outputs = PyArray_DIM(values_array, 2);
    unsigned num_R = 0;
    if (R_array != NULL) num_R = PyArray_DIM(R_array, 0);

    // Get pointers to the data as C-types
    int *children_left = (int*)PyArray_DATA(children_left_array);
    int *children_right = (int*)PyArray_DATA(children_right_array);
    int *children_default = (int*)PyArray_DATA(children_default_array);
    int *features = (int*)PyArray_DATA(features_array);
    tfloat *thresholds = (tfloat*)PyArray_DATA(thresholds_array);
    tfloat *values = (tfloat*)PyArray_DATA(values_array);
    tfloat *node_sample_weights = (tfloat*)PyArray_DATA(node_sample_weights_array);
    tfloat *X = (tfloat*)PyArray_DATA(X_array);
    bool *X_missing = (bool*)PyArray_DATA(X_missing_array);
    tfloat *y = NULL;
    if (y_array != NULL) y = (tfloat*)PyArray_DATA(y_array);
    tfloat *R = NULL;
    if (R_array != NULL) R = (tfloat*)PyArray_DATA(R_array);
    bool *R_missing = NULL;
    if (R_missing_array != NULL) R_missing = (bool*)PyArray_DATA(R_missing_array);
    tfloat *out_contribs = (tfloat*)PyArray_DATA(out_contribs_array);
    tfloat *base_offset = (tfloat*)PyArray_DATA(base_offset_array);

    // these are just a wrapper objects for all the pointers and numbers associated with
    // the ensemble tree model and the dataset we are explaining
    TreeEnsemble trees = TreeEnsemble(
        children_left, children_right, children_default, features, thresholds, values,
        node_sample_weights, max_depth, tree_limit, base_offset,
        max_nodes, num_outputs
    );
    ExplanationDataset data = ExplanationDataset(X, X_missing, y, R, R_missing, num_X, M, num_R);

    dense_tree_shap_gpu(trees, data, out_contribs, feature_dependence, model_output, interactions);


    // retrieve return value before python cleanup of objects
    // (values points into values_array, which is released just below)
    tfloat ret_value = (double)values[0];

    // clean up the created python objects
    Py_XDECREF(children_left_array);
    Py_XDECREF(children_right_array);
    Py_XDECREF(children_default_array);
    Py_XDECREF(features_array);
    Py_XDECREF(thresholds_array);
    Py_XDECREF(values_array);
    Py_XDECREF(node_sample_weights_array);
    Py_XDECREF(X_array);
    Py_XDECREF(X_missing_array);
    if (y_array != NULL) Py_XDECREF(y_array);
    if (R_array != NULL) Py_XDECREF(R_array);
    if (R_missing_array != NULL) Py_XDECREF(R_missing_array);
    //PyArray_ResolveWritebackIfCopy(out_contribs_array);
    Py_XDECREF(out_contribs_array);
    Py_XDECREF(base_offset_array);

    /* Build the output tuple */
    PyObject *ret = Py_BuildValue("d", ret_value);
    return ret;
}
lib/shap/cext/_cext_gpu.cu ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <Python.h>
2
+
3
+ #include "gpu_treeshap.h"
4
+ #include "tree_shap.h"
5
+
6
+ const float inf = std::numeric_limits<tfloat>::infinity();
7
+
8
// The interval of feature values (plus missing-value handling) that flows
// down one path element of a decision tree; used by the gpu_treeshap kernels.
struct ShapSplitCondition {
  ShapSplitCondition() = default;
  ShapSplitCondition(tfloat feature_lower_bound, tfloat feature_upper_bound,
                     bool is_missing_branch)
      : feature_lower_bound(feature_lower_bound),
        feature_upper_bound(feature_upper_bound),
        is_missing_branch(is_missing_branch) {
    assert(feature_lower_bound <= feature_upper_bound);
  }

  /*! Feature values > lower and <= upper flow down this path
      (matches EvaluateSplit below; bounds are half-open on the left). */
  tfloat feature_lower_bound;
  tfloat feature_upper_bound;
  /*! Do missing values flow down this path? */
  bool is_missing_branch;

  // Does this instance flow down this path?
  __host__ __device__ bool EvaluateSplit(float x) const {
    // NaN encodes a missing feature value
    if (isnan(x)) {
      return is_missing_branch;
    }
    return x > feature_lower_bound && x <= feature_upper_bound;
  }

  // Combine two split conditions on the same feature by intersecting their
  // intervals; missing values pass only if both conditions accept them.
  __host__ __device__ void
  Merge(const ShapSplitCondition &other) { // Combine duplicate features
    feature_lower_bound = max(feature_lower_bound, other.feature_lower_bound);
    feature_upper_bound = min(feature_upper_bound, other.feature_upper_bound);
    is_missing_branch = is_missing_branch && other.is_missing_branch;
  }
};
41
+
42
+
43
// Compile-time element count of a built-in array — a minimal stand-in for
// std::size (C++17), restricted to arrays of known bound.
// Inspired by: https://en.cppreference.com/w/cpp/iterator/size
template <class T, size_t N>
constexpr size_t array_size(const T (&)[N]) noexcept { return N; }
50
+
51
// Depth-first traversal of one tree that turns every root-to-leaf path into a
// gpu_treeshap path (one copy per non-zero output class), appending results
// to *paths.  *tmp_path holds the partial path of the branch being explored
// and *path_idx is a running unique id shared across the whole ensemble.
void RecurseTree(
    unsigned pos, const TreeEnsemble &tree,
    std::vector<gpu_treeshap::PathElement<ShapSplitCondition>> *tmp_path,
    std::vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
    size_t *path_idx, int num_outputs) {
  if (tree.is_leaf(pos)) {
    for (auto j = 0ull; j < num_outputs; j++) {
      auto v = tree.values[pos * num_outputs + j];
      if (v == 0.0) {
        // The tree has no output for this class, don't bother adding the path
        continue;
      }
      // Go back over path, setting v, path_idx
      for (auto &e : *tmp_path) {
        e.v = v;
        e.group = j;
        e.path_idx = *path_idx;
      }

      paths->insert(paths->end(), tmp_path->begin(), tmp_path->end());
      // Increment path index
      (*path_idx)++;
    }
    return;
  }

  // Add the left split to the path; the cover ratio of the left child gives
  // the probability ("zero fraction") of taking this branch.
  unsigned left_child = tree.children_left[pos];
  double left_zero_fraction =
      tree.node_sample_weights[left_child] / tree.node_sample_weights[pos];
  // Encode the range of feature values that flow down this path
  tmp_path->emplace_back(0, tree.features[pos], 0,
                         ShapSplitCondition{-inf, tree.thresholds[pos], false},
                         left_zero_fraction, 0.0f);

  RecurseTree(left_child, tree, tmp_path, paths, path_idx, num_outputs);

  // Replace our entry with the RIGHT split and recurse down that branch
  // (the previous comment mislabeled this as the left split).
  tmp_path->back() = gpu_treeshap::PathElement<ShapSplitCondition>(
      0, tree.features[pos], 0,
      ShapSplitCondition{tree.thresholds[pos], inf, false},
      1.0 - left_zero_fraction, 0.0f);

  RecurseTree(tree.children_right[pos], tree, tmp_path, paths, path_idx,
              num_outputs);

  // Pop our entry before returning to the parent frame.
  tmp_path->pop_back();
}
99
+
100
// Flatten every tree of the ensemble into the flat per-path representation
// consumed by GPUTreeShap. Path indices are unique across the whole ensemble.
std::vector<gpu_treeshap::PathElement<ShapSplitCondition>>
ExtractPaths(const TreeEnsemble &trees) {
  std::vector<gpu_treeshap::PathElement<ShapSplitCondition>> paths;
  size_t path_idx = 0;
  for (auto i = 0; i < trees.tree_limit; i++) {
    TreeEnsemble tree;
    trees.get_tree(tree, i);
    std::vector<gpu_treeshap::PathElement<ShapSplitCondition>> tmp_path;
    tmp_path.reserve(tree.max_depth);
    // Root element: feature_idx -1 marks the bias term; the split condition
    // (-inf, inf) accepts every value.
    tmp_path.emplace_back(0, -1, 0, ShapSplitCondition{-inf, inf, false}, 1.0,
                          0.0f);
    RecurseTree(0, tree, &tmp_path, &paths, &path_idx, tree.num_outputs);
  }
  return paths;
}
115
+
116
// Owns device copies of a host ExplanationDataset (either the foreground X
// or the background R matrix) and hands out a lightweight non-owning view
// for use inside kernels.
class DeviceExplanationDataset {
  thrust::device_vector<tfloat> data;     // row-major feature values
  thrust::device_vector<bool> missing;    // per-element missing flags
  size_t num_features;
  size_t num_rows;

 public:
  // Copies X (default) or, when background_dataset is true, the background
  // matrix R to the device.
  DeviceExplanationDataset(const ExplanationDataset &host_data,
                           bool background_dataset = false) {
    num_features = host_data.M;
    if (background_dataset) {
      num_rows = host_data.num_R;
      data = thrust::device_vector<tfloat>(
          host_data.R, host_data.R + host_data.num_R * host_data.M);
      missing = thrust::device_vector<bool>(host_data.R_missing,
                                            host_data.R_missing +
                                                host_data.num_R * host_data.M);

    } else {
      num_rows = host_data.num_X;
      data = thrust::device_vector<tfloat>(
          host_data.X, host_data.X + host_data.num_X * host_data.M);
      missing = thrust::device_vector<bool>(host_data.X_missing,
                                            host_data.X_missing +
                                                host_data.num_X * host_data.M);
    }
  }

  // Non-owning, trivially copyable view over the device buffers; satisfies
  // the DatasetT interface expected by the gpu_treeshap kernels.
  class DenseDatasetWrapper {
    const tfloat *data;
    const bool *missing;
    int num_rows;
    int num_cols;

   public:
    DenseDatasetWrapper() = default;
    DenseDatasetWrapper(const tfloat *data, const bool *missing, int num_rows,
                        int num_cols)
        : data(data), missing(missing), num_rows(num_rows), num_cols(num_cols) {
    }
    // Missing elements are surfaced as NaN so split conditions can route
    // them down the missing branch.
    __device__ tfloat GetElement(size_t row_idx, size_t col_idx) const {
      auto idx = row_idx * num_cols + col_idx;
      if (missing[idx]) {
        return std::numeric_limits<tfloat>::quiet_NaN();
      }
      return data[idx];
    }
    __host__ __device__ size_t NumRows() const { return num_rows; }
    __host__ __device__ size_t NumCols() const { return num_cols; }
  };

  DenseDatasetWrapper GetDeviceAccessor() {
    return DenseDatasetWrapper(data.data().get(), missing.data().get(),
                               num_rows, num_features);
  }
};
172
+
173
// GPU path-dependent TreeSHAP: computes per-feature SHAP values for every row
// of data.X and writes them (plus the bias column) into out_contribs in
// SHAP's (row, column, group) layout.
// NOTE(review): the `transform` parameter is accepted for signature parity
// with the CPU implementation but is never applied here — confirm upstream.
inline void dense_tree_path_dependent_gpu(
    const TreeEnsemble &trees, const ExplanationDataset &data,
    tfloat *out_contribs, tfloat transform(const tfloat, const tfloat)) {
  auto paths = ExtractPaths(trees);
  DeviceExplanationDataset device_data(data);
  DeviceExplanationDataset::DenseDatasetWrapper X =
      device_data.GetDeviceAccessor();

  // One phi per (row, group, feature) plus a bias column per (row, group).
  thrust::device_vector<float> phis((X.NumCols() + 1) * X.NumRows() *
                                    trees.num_outputs);
  gpu_treeshap::GPUTreeShap(X, paths.begin(), paths.end(), trees.num_outputs,
                            phis.begin(), phis.end());
  // Add the base offset term to bias
  thrust::device_vector<double> base_offset(
      trees.base_offset, trees.base_offset + trees.num_outputs);
  auto counting = thrust::make_counting_iterator(size_t(0));
  auto d_phis = phis.data().get();
  auto d_base_offset = base_offset.data().get();
  size_t num_groups = trees.num_outputs;
  thrust::for_each(counting, counting + X.NumRows() * trees.num_outputs,
                   [=] __device__(size_t idx) {
                     size_t row_idx = idx / num_groups;
                     size_t group = idx % num_groups;
                     // column_idx == NumCols() addresses the bias slot.
                     auto phi_idx = gpu_treeshap::IndexPhi(
                         row_idx, num_groups, group, X.NumCols(), X.NumCols());
                     d_phis[phi_idx] += d_base_offset[group];
                   });

  // Shap uses a slightly different layout for multiclass
  // (groups become the innermost axis), so transpose before copying out.
  thrust::device_vector<float> transposed_phis(phis.size());
  auto d_transposed_phis = transposed_phis.data();
  thrust::for_each(
      counting, counting + phis.size(), [=] __device__(size_t idx) {
        size_t old_shape[] = {X.NumRows(), num_groups, (X.NumCols() + 1)};
        size_t old_idx[array_size(old_shape)];
        gpu_treeshap::FlatIdxToTensorIdx(idx, old_shape, old_idx);
        // Define new tensor format, switch num_groups axis to end
        size_t new_shape[] = {X.NumRows(), (X.NumCols() + 1), num_groups};
        size_t new_idx[] = {old_idx[0], old_idx[2], old_idx[1]};
        size_t transposed_idx =
            gpu_treeshap::TensorIdxToFlatIdx(new_shape, new_idx);
        d_transposed_phis[transposed_idx] = d_phis[idx];
      });
  thrust::copy(transposed_phis.begin(), transposed_phis.end(), out_contribs);
}
218
+
219
// GPU interventional (feature-independent) TreeSHAP: like the path-dependent
// variant above but marginalizes over the background dataset R instead of
// using training-set cover statistics.
// NOTE(review): `transform` is accepted but never applied here — confirm
// upstream.
inline void
dense_tree_independent_gpu(const TreeEnsemble &trees,
                           const ExplanationDataset &data, tfloat *out_contribs,
                           tfloat transform(const tfloat, const tfloat)) {
  auto paths = ExtractPaths(trees);
  DeviceExplanationDataset device_data(data);
  DeviceExplanationDataset::DenseDatasetWrapper X =
      device_data.GetDeviceAccessor();
  // Background (reference) rows used to marginalize out inactive features.
  DeviceExplanationDataset background_device_data(data, true);
  DeviceExplanationDataset::DenseDatasetWrapper R =
      background_device_data.GetDeviceAccessor();

  thrust::device_vector<float> phis((X.NumCols() + 1) * X.NumRows() *
                                    trees.num_outputs);
  gpu_treeshap::GPUTreeShapInterventional(X, R, paths.begin(), paths.end(),
                                          trees.num_outputs, phis.begin(),
                                          phis.end());
  // Add the base offset term to bias
  thrust::device_vector<double> base_offset(
      trees.base_offset, trees.base_offset + trees.num_outputs);
  auto counting = thrust::make_counting_iterator(size_t(0));
  auto d_phis = phis.data().get();
  auto d_base_offset = base_offset.data().get();
  size_t num_groups = trees.num_outputs;
  thrust::for_each(counting, counting + X.NumRows() * trees.num_outputs,
                   [=] __device__(size_t idx) {
                     size_t row_idx = idx / num_groups;
                     size_t group = idx % num_groups;
                     // column_idx == NumCols() addresses the bias slot.
                     auto phi_idx = gpu_treeshap::IndexPhi(
                         row_idx, num_groups, group, X.NumCols(), X.NumCols());
                     d_phis[phi_idx] += d_base_offset[group];
                   });

  // Shap uses a slightly different layout for multiclass
  // (groups become the innermost axis), so transpose before copying out.
  thrust::device_vector<float> transposed_phis(phis.size());
  auto d_transposed_phis = transposed_phis.data();
  thrust::for_each(
      counting, counting + phis.size(), [=] __device__(size_t idx) {
        size_t old_shape[] = {X.NumRows(), num_groups, (X.NumCols() + 1)};
        size_t old_idx[array_size(old_shape)];
        gpu_treeshap::FlatIdxToTensorIdx(idx, old_shape, old_idx);
        // Define new tensor format, switch num_groups axis to end
        size_t new_shape[] = {X.NumRows(), (X.NumCols() + 1), num_groups};
        size_t new_idx[] = {old_idx[0], old_idx[2], old_idx[1]};
        size_t transposed_idx =
            gpu_treeshap::TensorIdxToFlatIdx(new_shape, new_idx);
        d_transposed_phis[transposed_idx] = d_phis[idx];
      });
  thrust::copy(transposed_phis.begin(), transposed_phis.end(), out_contribs);
}
269
+
270
// GPU path-dependent SHAP interaction values: produces a
// (NumCols()+1) x (NumCols()+1) matrix per (row, group) and writes it to
// out_contribs in SHAP's (row, i, j, group) layout.
// NOTE(review): `transform` is accepted but never applied here — confirm
// upstream.
inline void dense_tree_path_dependent_interactions_gpu(
    const TreeEnsemble &trees, const ExplanationDataset &data,
    tfloat *out_contribs, tfloat transform(const tfloat, const tfloat)) {
  auto paths = ExtractPaths(trees);
  DeviceExplanationDataset device_data(data);
  DeviceExplanationDataset::DenseDatasetWrapper X =
      device_data.GetDeviceAccessor();

  thrust::device_vector<float> phis((X.NumCols() + 1) * (X.NumCols() + 1) *
                                    X.NumRows() * trees.num_outputs);
  gpu_treeshap::GPUTreeShapInteractions(X, paths.begin(), paths.end(),
                                        trees.num_outputs, phis.begin(),
                                        phis.end());
  // Add the base offset term to bias
  thrust::device_vector<double> base_offset(
      trees.base_offset, trees.base_offset + trees.num_outputs);
  auto counting = thrust::make_counting_iterator(size_t(0));
  auto d_phis = phis.data().get();
  auto d_base_offset = base_offset.data().get();
  size_t num_groups = trees.num_outputs;
  thrust::for_each(counting, counting + X.NumRows() * num_groups,
                   [=] __device__(size_t idx) {
                     size_t row_idx = idx / num_groups;
                     size_t group = idx % num_groups;
                     // (bias, bias) cell of the interaction matrix.
                     auto phi_idx = gpu_treeshap::IndexPhiInteractions(
                         row_idx, num_groups, group, X.NumCols(), X.NumCols(),
                         X.NumCols());
                     d_phis[phi_idx] += d_base_offset[group];
                   });
  // Shap uses a slightly different layout for multiclass
  // (groups become the innermost axis), so transpose before copying out.
  thrust::device_vector<float> transposed_phis(phis.size());
  auto d_transposed_phis = transposed_phis.data();
  thrust::for_each(
      counting, counting + phis.size(), [=] __device__(size_t idx) {
        size_t old_shape[] = {X.NumRows(), num_groups, (X.NumCols() + 1),
                              (X.NumCols() + 1)};
        size_t old_idx[array_size(old_shape)];
        gpu_treeshap::FlatIdxToTensorIdx(idx, old_shape, old_idx);
        // Define new tensor format, switch num_groups axis to end
        size_t new_shape[] = {X.NumRows(), (X.NumCols() + 1), (X.NumCols() + 1),
                              num_groups};
        size_t new_idx[] = {old_idx[0], old_idx[2], old_idx[3], old_idx[1]};
        size_t transposed_idx =
            gpu_treeshap::TensorIdxToFlatIdx(new_shape, new_idx);
        d_transposed_phis[transposed_idx] = d_phis[idx];
      });
  thrust::copy(transposed_phis.begin(), transposed_phis.end(), out_contribs);
}
318
+
319
// Entry point for GPU TreeSHAP: dispatches to the appropriate algorithm
// based on the feature-dependence mode and whether interaction values were
// requested. Unsupported combinations print to stderr and return without
// touching out_contribs.
void dense_tree_shap_gpu(const TreeEnsemble &trees,
                         const ExplanationDataset &data, tfloat *out_contribs,
                         const int feature_dependence, unsigned model_transform,
                         bool interactions) {
  // see what transform (if any) we have
  transform_f transform = get_transform(model_transform);

  // dispatch to the correct algorithm handler
  switch (feature_dependence) {
  case FEATURE_DEPENDENCE::independent:
    if (interactions) {
      std::cerr << "FEATURE_DEPENDENCE::independent with interactions not yet "
                   "supported\n";
    } else {
      dense_tree_independent_gpu(trees, data, out_contribs, transform);
    }
    return;

  case FEATURE_DEPENDENCE::tree_path_dependent:
    if (interactions) {
      dense_tree_path_dependent_interactions_gpu(trees, data, out_contribs,
                                                 transform);
    } else {
      dense_tree_path_dependent_gpu(trees, data, out_contribs, transform);
    }
    return;

  case FEATURE_DEPENDENCE::global_path_dependent:
    std::cerr << "FEATURE_DEPENDENCE::global_path_dependent not supported\n";
    return;
  default:
    std::cerr << "Unknown feature dependence option\n";
    return;
  }
}
lib/shap/cext/gpu_treeshap.h ADDED
@@ -0,0 +1,1535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright (c) 2020, NVIDIA CORPORATION.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #pragma once
18
+ #include <thrust/device_allocator.h>
19
+ #include <thrust/device_vector.h>
20
+ #include <thrust/iterator/discard_iterator.h>
21
+ #include <thrust/logical.h>
22
+ #include <thrust/reduce.h>
23
+ #include <thrust/host_vector.h>
24
+ #if (CUDART_VERSION >= 11000)
25
+ #include <cub/cub.cuh>
26
+ #else
27
+ // Hack to get cub device reduce on older toolkits
28
+ #include <thrust/system/cuda/detail/cub/device/device_reduce.cuh>
29
+ using namespace thrust::cuda_cub;
30
+ #endif
31
+ #include <algorithm>
32
+ #include <functional>
33
+ #include <set>
34
+ #include <stdexcept>
35
+ #include <utility>
36
+ #include <vector>
37
+
38
+ namespace gpu_treeshap {
39
+
40
// Split condition matching XGBoost semantics: values in [lower, upper) follow
// the path, NaNs follow the recorded missing branch.
struct XgboostSplitCondition {
  XgboostSplitCondition() = default;
  XgboostSplitCondition(float feature_lower_bound, float feature_upper_bound,
                        bool is_missing_branch)
      : feature_lower_bound(feature_lower_bound),
        feature_upper_bound(feature_upper_bound),
        is_missing_branch(is_missing_branch) {
    assert(feature_lower_bound <= feature_upper_bound);
  }

  /*! Feature values >= lower and < upper flow down this path. */
  float feature_lower_bound;
  float feature_upper_bound;
  /*! Do missing values flow down this path? */
  bool is_missing_branch;

  // Does this instance flow down this path?
  __host__ __device__ bool EvaluateSplit(float x) const {
    // is nan
    if (isnan(x)) {
      return is_missing_branch;
    }
    return x >= feature_lower_bound && x < feature_upper_bound;
  }

  // Combine two split conditions on the same feature
  __host__ __device__ void Merge(
      const XgboostSplitCondition& other) {  // Combine duplicate features
    // Intersect intervals; missing flows down only if both accept missing.
    feature_lower_bound = max(feature_lower_bound, other.feature_lower_bound);
    feature_upper_bound = min(feature_upper_bound, other.feature_upper_bound);
    is_missing_branch = is_missing_branch && other.is_missing_branch;
  }
};
73
+
74
+ /*!
75
+ * An element of a unique path through a decision tree. Can implement various
76
+ * types of splits via the templated SplitConditionT. Some decision tree
77
+ * implementations may wish to use double precision or single precision, some
78
+ * may use < or <= as the threshold, missing values can be handled differently,
79
+ * categoricals may be supported.
80
+ *
81
+ * \tparam SplitConditionT A split condition implementing the methods
82
+ * EvaluateSplit and Merge.
83
+ */
84
template <typename SplitConditionT>
struct PathElement {
  using split_type = SplitConditionT;
  __host__ __device__ PathElement(size_t path_idx, int64_t feature_idx,
                                  int group, SplitConditionT split_condition,
                                  double zero_fraction, float v)
      : path_idx(path_idx),
        feature_idx(feature_idx),
        group(group),
        split_condition(split_condition),
        zero_fraction(zero_fraction),
        v(v) {}

  PathElement() = default;
  // The root element of a path is marked with feature index -1 (bias term).
  __host__ __device__ bool IsRoot() const { return feature_idx == -1; }

  // True when row `row_idx` of X satisfies this element's split condition;
  // the root element accepts every row.
  template <typename DatasetT>
  __host__ __device__ bool EvaluateSplit(DatasetT X, size_t row_idx) const {
    if (this->IsRoot()) {
      // NOTE(review): returning 1.0 from a bool function — converts to true,
      // but `return true;` would be clearer.
      return 1.0;
    }
    return split_condition.EvaluateSplit(X.GetElement(row_idx, feature_idx));
  }

  /*! Unique path index. */
  size_t path_idx;
  /*! Feature of this split, -1 indicates bias term. */
  int64_t feature_idx;
  /*! Indicates class for multiclass problems. */
  int group;
  SplitConditionT split_condition;
  /*! Probability of following this path when feature_idx is not in the active
   * set. */
  double zero_fraction;
  float v;  // Leaf weight at the end of the path
};
120
+
121
// Helper function that accepts an index into a flat contiguous array and the
// dimensions of a tensor and returns the indices with respect to the tensor
// (row-major order; out_idx[0] varies slowest).
template <typename T, size_t N>
__device__ void FlatIdxToTensorIdx(T flat_idx, const T (&shape)[N],
                                   T (&out_idx)[N]) {
  // Total number of elements, then peel off one axis stride per iteration.
  T current_size = shape[0];
  for (auto i = 1ull; i < N; i++) {
    current_size *= shape[i];
  }
  for (auto i = 0ull; i < N; i++) {
    current_size /= shape[i];
    out_idx[i] = flat_idx / current_size;
    flat_idx -= current_size * out_idx[i];
  }
}
136
+
137
// Given a shape and coordinates into a tensor, return the index into the
// backing storage one-dimensional array (row-major; inverse of
// FlatIdxToTensorIdx).
template <typename T, size_t N>
__device__ T TensorIdxToFlatIdx(const T (&shape)[N], const T (&tensor_idx)[N]) {
  T current_size = shape[0];
  for (auto i = 1ull; i < N; i++) {
    current_size *= shape[i];
  }
  T idx = 0;
  for (auto i = 0ull; i < N; i++) {
    // current_size becomes the stride of axis i.
    current_size /= shape[i];
    idx += tensor_idx[i] * current_size;
  }
  return idx;
}
152
+
153
// Maps values to the phi array according to row, group and column.
// Layout is (row, group, column) with num_columns + 1 slots per group; the
// extra slot (column_idx == num_columns) holds the bias term.
__host__ __device__ inline size_t IndexPhi(size_t row_idx, size_t num_groups,
                                           size_t group, size_t num_columns,
                                           size_t column_idx) {
  return (row_idx * num_groups + group) * (num_columns + 1) + column_idx;
}
159
+
160
// Index into the interaction-values array: one (num_columns+1)^2 matrix per
// (row, group), addressed by feature pair (i, j).
__host__ __device__ inline size_t IndexPhiInteractions(size_t row_idx,
                                                       size_t num_groups,
                                                       size_t group,
                                                       size_t num_columns,
                                                       size_t i, size_t j) {
  size_t matrix_size = (num_columns + 1) * (num_columns + 1);
  size_t matrix_offset = (row_idx * num_groups + group) * matrix_size;
  return matrix_offset + i * (num_columns + 1) + j;
}
169
+
170
+ namespace detail {
171
+
172
// Shorthand for creating a device vector with an appropriate allocator type:
// rebinds the caller-supplied allocator to element type T.
template <class T, class DeviceAllocatorT>
using RebindVector =
    thrust::device_vector<T,
                          typename DeviceAllocatorT::template rebind<T>::other>;
177
+
178
// Double-precision atomic add. Hardware atomicAdd(double*) requires
// compute capability >= 6.0; older architectures fall back to a CAS loop.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
__device__ __forceinline__ double atomicAddDouble(double* address, double val) {
  return atomicAdd(address, val);
}
#else  // In device code and CUDA < 600
__device__ __forceinline__ double atomicAddDouble(double* address,
                                                  double val) {  // NOLINT
  unsigned long long int* address_as_ull =                       // NOLINT
      (unsigned long long int*)address;                          // NOLINT
  unsigned long long int old = *address_as_ull, assumed;         // NOLINT

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));

    // Note: uses integer comparison to avoid hang in case of NaN (since NaN !=
    // NaN)
  } while (assumed != old);

  return __longlong_as_double(old);
}
#endif
201
+
202
// Reads the %lanemask_lt PTX special register: a bitmask of all warp lanes
// with an id lower than the calling thread's lane id.
__forceinline__ __device__ unsigned int lanemask32_lt() {
  unsigned int lanemask32_lt;
  asm volatile("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt));
  return (lanemask32_lt);
}
207
+
208
// Like a coalesced group, except we can make the assumption that all threads
// in a group are next to each other. This makes shuffle operations much
// cheaper.
class ContiguousGroup {
 public:
  __device__ ContiguousGroup(uint32_t mask) : mask_(mask) {}

  // Number of threads in the group.
  __device__ uint32_t size() const { return __popc(mask_); }
  // Rank of the calling thread within the group (0-based).
  __device__ uint32_t thread_rank() const {
    return __popc(mask_ & lanemask32_lt());
  }
  // Shuffle from group-relative rank `src` (offset by the group's first lane).
  template <typename T>
  __device__ T shfl(T val, uint32_t src) const {
    return __shfl_sync(mask_, val, src + __ffs(mask_) - 1);
  }
  template <typename T>
  __device__ T shfl_up(T val, uint32_t delta) const {
    return __shfl_up_sync(mask_, val, delta);
  }
  // Ballot over the group, shifted so bit 0 corresponds to the group's
  // first member.
  __device__ uint32_t ballot(int predicate) const {
    return __ballot_sync(mask_, predicate) >> (__ffs(mask_) - 1);
  }

  // Inclusive-scan-style reduction; every member receives the final value
  // held by the last rank.
  template <typename T, typename OpT>
  __device__ T reduce(T val, OpT op) {
    for (int i = 1; i < this->size(); i *= 2) {
      T shfl = shfl_up(val, i);
      if (static_cast<int>(thread_rank()) - i >= 0) {
        val = op(val, shfl);
      }
    }
    return shfl(val, size() - 1);
  }
  uint32_t mask_;  // lane mask of the group's members
};
242
+
243
// Separate the active threads by labels
// This functionality is available in cuda 11.0 on cc >=7.0
// We reimplement for backwards compatibility
// Assumes partitions are contiguous
inline __device__ ContiguousGroup active_labeled_partition(uint32_t mask,
                                                           int label) {
#if __CUDA_ARCH__ >= 700
  // Hardware match instruction groups threads by equal label directly.
  uint32_t subgroup_mask = __match_any_sync(mask, label);
#else
  // Software fallback: walk the contiguous label runs one at a time.
  uint32_t subgroup_mask = 0;
  for (int i = 0; i < 32;) {
    int current_label = __shfl_sync(mask, label, i);
    uint32_t ballot = __ballot_sync(mask, label == current_label);
    if (label == current_label) {
      subgroup_mask = ballot;
    }
    uint32_t completed_mask =
        (1 << (32 - __clz(ballot))) - 1;  // Threads that have finished
    // Find the start of the next group, mask off completed threads from active
    // threads Then use ffs - 1 to find the position of the next group
    int next_i = __ffs(mask & ~completed_mask) - 1;
    if (next_i == -1) break;  // -1 indicates all finished
    assert(next_i > i);  // Prevent infinite loops when the constraints not met
    i = next_i;
  }
#endif
  return ContiguousGroup(subgroup_mask);
}
271
+
272
// Group of threads where each thread holds a path element.
// Implements the TreeSHAP EXTEND/UNWIND recursion cooperatively across a
// warp subgroup: each thread keeps one permutation weight (pweight) in a
// register and exchanges values via shuffles.
class GroupPath {
 protected:
  const ContiguousGroup& g_;
  // These are combined so we can communicate them in a single 64 bit shuffle
  // instruction
  float zero_one_fraction_[2];
  float pweight_;      // permutation weight for this thread's path element
  int unique_depth_;   // number of elements extended so far

 public:
  __device__ GroupPath(const ContiguousGroup& g, float zero_fraction,
                       float one_fraction)
      : g_(g),
        zero_one_fraction_{zero_fraction, one_fraction},
        pweight_(g.thread_rank() == 0 ? 1.0f : 0.0f),
        unique_depth_(0) {}

  // Cooperatively extend the path with a group of threads
  // Each thread maintains pweight for its path element in register
  __device__ void Extend() {
    unique_depth_++;

    // Broadcast the zero and one fraction from the newly added path element
    // Combine 2 shuffle operations into 64 bit word
    const size_t rank = g_.thread_rank();
    const float inv_unique_depth =
        __fdividef(1.0f, static_cast<float>(unique_depth_ + 1));
    uint64_t res = g_.shfl(*reinterpret_cast<uint64_t*>(&zero_one_fraction_),
                           unique_depth_);
    const float new_zero_fraction = reinterpret_cast<float*>(&res)[0];
    const float new_one_fraction = reinterpret_cast<float*>(&res)[1];
    float left_pweight = g_.shfl_up(pweight_, 1);

    // pweight of threads with rank < unique_depth_ is 0
    // We use max(x,0) to avoid using a branch
    // pweight_ *=
    //   new_zero_fraction * max(unique_depth_ - rank, 0llu) * inv_unique_depth;
    pweight_ = __fmul_rn(
        __fmul_rn(pweight_, new_zero_fraction),
        __fmul_rn(max(unique_depth_ - rank, size_t(0)), inv_unique_depth));

    // pweight_ += new_one_fraction * left_pweight * rank * inv_unique_depth;
    pweight_ = __fmaf_rn(__fmul_rn(new_one_fraction, left_pweight),
                         __fmul_rn(rank, inv_unique_depth), pweight_);
  }

  // Each thread unwinds the path for its feature and returns the sum
  __device__ float UnwoundPathSum() {
    float next_one_portion = g_.shfl(pweight_, unique_depth_);
    float total = 0.0f;
    const float zero_frac_div_unique_depth = __fdividef(
        zero_one_fraction_[0], static_cast<float>(unique_depth_ + 1));
    for (int i = unique_depth_ - 1; i >= 0; i--) {
      float ith_pweight = g_.shfl(pweight_, i);
      float precomputed =
          __fmul_rn((unique_depth_ - i), zero_frac_div_unique_depth);
      const float tmp =
          __fdividef(__fmul_rn(next_one_portion, unique_depth_ + 1), i + 1);
      total = __fmaf_rn(tmp, zero_one_fraction_[1], total);
      next_one_portion = __fmaf_rn(-tmp, precomputed, ith_pweight);
      float numerator =
          __fmul_rn(__fsub_rn(1.0f, zero_one_fraction_[1]), ith_pweight);
      // Guard against division by zero when the zero fraction is 0.
      if (precomputed > 0.0f) {
        total += __fdividef(numerator, precomputed);
      }
    }

    return total;
  }
};
343
+
344
// Has different permutation weightings to the above
// Used in Taylor Shapley interaction index
class TaylorGroupPath : GroupPath {
 public:
  __device__ TaylorGroupPath(const ContiguousGroup& g, float zero_fraction,
                             float one_fraction)
      : GroupPath(g, zero_fraction, one_fraction) {}

  // Extend the path is normal, all reweighting can happen in UnwoundPathSum
  __device__ void Extend() { GroupPath::Extend(); }

  // Each thread unwinds the path for its feature and returns the sum
  // We use a different permutation weighting for Taylor interactions
  // As if the total number of features was one larger
  __device__ float UnwoundPathSum() {
    float one_fraction = zero_one_fraction_[1];
    float zero_fraction = zero_one_fraction_[0];
    float next_one_portion = g_.shfl(pweight_, unique_depth_) /
                             static_cast<float>(unique_depth_ + 2);

    float total = 0.0f;
    for (int i = unique_depth_ - 1; i >= 0; i--) {
      float ith_pweight =
          g_.shfl(pweight_, i) * (static_cast<float>(unique_depth_ - i + 1) /
                                  static_cast<float>(unique_depth_ + 2));
      // Branch on which fraction is non-zero to avoid dividing by zero.
      if (one_fraction > 0.0f) {
        const float tmp =
            next_one_portion * (unique_depth_ + 2) / ((i + 1) * one_fraction);

        total += tmp;
        next_one_portion =
            ith_pweight - tmp * zero_fraction *
                              ((unique_depth_ - i + 1) /
                               static_cast<float>(unique_depth_ + 2));
      } else if (zero_fraction > 0.0f) {
        total +=
            (ith_pweight / zero_fraction) /
            ((unique_depth_ - i + 1) / static_cast<float>(unique_depth_ + 2));
      }
    }

    // Factor of 2 from the symmetrized Taylor interaction weighting.
    return 2 * total;
  }
};
388
+
389
// Computes the SHAP contribution of path element `e` for row `row_idx`:
// extends the full unique path cooperatively, unwinds it for this thread's
// feature, and scales by (one_fraction - zero_fraction) * leaf value.
template <typename DatasetT, typename SplitConditionT>
__device__ float ComputePhi(const PathElement<SplitConditionT>& e,
                            size_t row_idx, const DatasetT& X,
                            const ContiguousGroup& group, float zero_fraction) {
  // 1 if this row satisfies the split, 0 otherwise.
  float one_fraction =
      e.EvaluateSplit(X, row_idx);
  GroupPath path(group, zero_fraction, one_fraction);
  size_t unique_path_length = group.size();

  // Extend the path
  for (auto unique_depth = 1ull; unique_depth < unique_path_length;
       unique_depth++) {
    path.Extend();
  }

  float sum = path.UnwoundPathSum();
  return sum * (one_fraction - zero_fraction) * e.v;
}
407
+
408
// Integer ceiling division: smallest k with k * b >= a (b must be non-zero).
inline __host__ __device__ size_t DivRoundUp(size_t a, size_t b) {
  return (a + b - 1) / b;
}
411
+
412
// Partitions the kernel's work: each warp handles one (path bin, row bank)
// pair, and each lane of the warp holds one path element of that bin.
// Outputs the lane's path element, the row range to process, and whether the
// lane has any work at all.
template <typename DatasetT, size_t kBlockSize, size_t kRowsPerWarp,
          typename SplitConditionT>
void __device__
ConfigureThread(const DatasetT& X, const size_t bins_per_row,
                const PathElement<SplitConditionT>* path_elements,
                const size_t* bin_segments, size_t* start_row, size_t* end_row,
                PathElement<SplitConditionT>* e, bool* thread_active) {
  // Partition work
  // Each warp processes a set of training instances applied to a path
  size_t tid = kBlockSize * blockIdx.x + threadIdx.x;
  const size_t warp_size = 32;
  size_t warp_rank = tid / warp_size;
  if (warp_rank >= bins_per_row * DivRoundUp(X.NumRows(), kRowsPerWarp)) {
    *thread_active = false;
    return;
  }
  size_t bin_idx = warp_rank % bins_per_row;
  size_t bank = warp_rank / bins_per_row;
  // [path_start, path_end) is this bin's slice of path_elements.
  size_t path_start = bin_segments[bin_idx];
  size_t path_end = bin_segments[bin_idx + 1];
  uint32_t thread_rank = threadIdx.x % warp_size;
  if (thread_rank >= path_end - path_start) {
    // More lanes than path elements in this bin: the surplus lanes idle.
    *thread_active = false;
  } else {
    *e = path_elements[path_start + thread_rank];
    *start_row = bank * kRowsPerWarp;
    *end_row = min((bank + 1) * kRowsPerWarp, X.NumRows());
    *thread_active = true;
  }
}
442
+
443
+ #define GPUTREESHAP_MAX_THREADS_PER_BLOCK 256
444
+ #define FULL_MASK 0xffffffff
445
+
446
+ template <typename DatasetT, size_t kBlockSize, size_t kRowsPerWarp,
447
+ typename SplitConditionT>
448
+ __global__ void __launch_bounds__(GPUTREESHAP_MAX_THREADS_PER_BLOCK)
449
+ ShapKernel(DatasetT X, size_t bins_per_row,
450
+ const PathElement<SplitConditionT>* path_elements,
451
+ const size_t* bin_segments, size_t num_groups, double* phis) {
452
+ // Use shared memory for structs, otherwise nvcc puts in local memory
453
+ __shared__ DatasetT s_X;
454
+ s_X = X;
455
+ __shared__ PathElement<SplitConditionT> s_elements[kBlockSize];
456
+ PathElement<SplitConditionT>& e = s_elements[threadIdx.x];
457
+
458
+ size_t start_row, end_row;
459
+ bool thread_active;
460
+ ConfigureThread<DatasetT, kBlockSize, kRowsPerWarp>(
461
+ s_X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, &e,
462
+ &thread_active);
463
+ uint32_t mask = __ballot_sync(FULL_MASK, thread_active);
464
+ if (!thread_active) return;
465
+
466
+ float zero_fraction = e.zero_fraction;
467
+ auto labelled_group = active_labeled_partition(mask, e.path_idx);
468
+
469
+ for (int64_t row_idx = start_row; row_idx < end_row; row_idx++) {
470
+ float phi = ComputePhi(e, row_idx, X, labelled_group, zero_fraction);
471
+
472
+ if (!e.IsRoot()) {
473
+ atomicAddDouble(&phis[IndexPhi(row_idx, num_groups, e.group, X.NumCols(),
474
+ e.feature_idx)],
475
+ phi);
476
+ }
477
+ }
478
+ }
479
+
480
// Host-side launcher for ShapKernel: sizes the grid so every (bin, row-chunk)
// work item is covered by one warp. `phis` must point to device memory sized
// for every (row, group, feature + bias) entry.
template <typename DatasetT, typename SizeTAllocatorT, typename PathAllocatorT,
          typename SplitConditionT>
void ComputeShap(
    DatasetT X,
    const thrust::device_vector<size_t, SizeTAllocatorT>& bin_segments,
    const thrust::device_vector<PathElement<SplitConditionT>, PathAllocatorT>&
        path_elements,
    size_t num_groups, double* phis) {
  // bin_segments holds num_bins + 1 offsets.
  size_t bins_per_row = bin_segments.size() - 1;
  const int kBlockThreads = GPUTREESHAP_MAX_THREADS_PER_BLOCK;
  const int warps_per_block = kBlockThreads / 32;
  // Each warp handles up to this many rows for one bin of paths.
  const int kRowsPerWarp = 1024;
  size_t warps_needed = bins_per_row * DivRoundUp(X.NumRows(), kRowsPerWarp);

  const uint32_t grid_size = DivRoundUp(warps_needed, warps_per_block);

  ShapKernel<DatasetT, kBlockThreads, kRowsPerWarp>
      <<<grid_size, kBlockThreads>>>(
          X, bins_per_row, path_elements.data().get(),
          bin_segments.data().get(), num_groups, phis);
}
501
+
502
// Compute a conditional phi: the contribution of element e with one feature
// (condition_feature) held fixed either on or off. The conditioned feature is
// excluded from path extension; its on/off fractions scale the result.
// PathT selects the path weighting (GroupPath or TaylorGroupPath).
template <typename PathT, typename DatasetT, typename SplitConditionT>
__device__ float ComputePhiCondition(const PathElement<SplitConditionT>& e,
                                     size_t row_idx, const DatasetT& X,
                                     const ContiguousGroup& group,
                                     int64_t condition_feature) {
  float one_fraction = e.EvaluateSplit(X, row_idx);
  PathT path(group, e.zero_fraction, one_fraction);
  size_t unique_path_length = group.size();
  float condition_on_fraction = 1.0f;
  float condition_off_fraction = 1.0f;

  // Extend the path
  for (auto i = 1ull; i < unique_path_length; i++) {
    // Broadcast lane i's feature to decide if it is the conditioned one.
    bool is_condition_feature =
        group.shfl(e.feature_idx, i) == condition_feature;
    float o_i = group.shfl(one_fraction, i);
    float z_i = group.shfl(e.zero_fraction, i);

    if (is_condition_feature) {
      // Record the conditioned feature's fractions instead of extending.
      condition_on_fraction = o_i;
      condition_off_fraction = z_i;
    } else {
      path.Extend();
    }
  }
  float sum = path.UnwoundPathSum();
  // The conditioned feature itself contributes nothing in this pass.
  if (e.feature_idx == condition_feature) {
    return 0.0f;
  }
  float phi = sum * (one_fraction - e.zero_fraction) * e.v;
  // 0.5 factor: the interaction is split symmetrically between both orderings.
  return phi * (condition_on_fraction - condition_off_fraction) * 0.5f;
}
534
+
535
// If there is a feature in the path we are conditioning on, swap it to the end
// of the path. Only the per-lane element pointers are exchanged (between the
// last lane and the conditioned lane); the shared-memory array itself is not
// modified.
template <typename SplitConditionT>
inline __device__ void SwapConditionedElement(
    PathElement<SplitConditionT>** e, PathElement<SplitConditionT>* s_elements,
    uint32_t condition_rank, const ContiguousGroup& group) {
  auto last_rank = group.size() - 1;
  auto this_rank = group.thread_rank();
  // (threadIdx.x - this_rank) is the block-local index of the group's rank-0
  // lane, so indexing from it addresses another rank's element.
  if (this_rank == last_rank) {
    *e = &s_elements[(threadIdx.x - this_rank) + condition_rank];
  } else if (this_rank == condition_rank) {
    *e = &s_elements[(threadIdx.x - this_rank) + last_rank];
  }
}
549
+
550
+ template <typename DatasetT, size_t kBlockSize, size_t kRowsPerWarp,
551
+ typename SplitConditionT>
552
+ __global__ void __launch_bounds__(GPUTREESHAP_MAX_THREADS_PER_BLOCK)
553
+ ShapInteractionsKernel(DatasetT X, size_t bins_per_row,
554
+ const PathElement<SplitConditionT>* path_elements,
555
+ const size_t* bin_segments, size_t num_groups,
556
+ double* phis_interactions) {
557
+ // Use shared memory for structs, otherwise nvcc puts in local memory
558
+ __shared__ DatasetT s_X;
559
+ s_X = X;
560
+ __shared__ PathElement<SplitConditionT> s_elements[kBlockSize];
561
+ PathElement<SplitConditionT>* e = &s_elements[threadIdx.x];
562
+
563
+ size_t start_row, end_row;
564
+ bool thread_active;
565
+ ConfigureThread<DatasetT, kBlockSize, kRowsPerWarp>(
566
+ s_X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, e,
567
+ &thread_active);
568
+ uint32_t mask = __ballot_sync(FULL_MASK, thread_active);
569
+ if (!thread_active) return;
570
+
571
+ auto labelled_group = active_labeled_partition(mask, e->path_idx);
572
+
573
+ for (int64_t row_idx = start_row; row_idx < end_row; row_idx++) {
574
+ float phi = ComputePhi(*e, row_idx, X, labelled_group, e->zero_fraction);
575
+ if (!e->IsRoot()) {
576
+ auto phi_offset =
577
+ IndexPhiInteractions(row_idx, num_groups, e->group, X.NumCols(),
578
+ e->feature_idx, e->feature_idx);
579
+ atomicAddDouble(phis_interactions + phi_offset, phi);
580
+ }
581
+
582
+ for (auto condition_rank = 1ull; condition_rank < labelled_group.size();
583
+ condition_rank++) {
584
+ e = &s_elements[threadIdx.x];
585
+ int64_t condition_feature =
586
+ labelled_group.shfl(e->feature_idx, condition_rank);
587
+ SwapConditionedElement(&e, s_elements, condition_rank, labelled_group);
588
+ float x = ComputePhiCondition<GroupPath>(*e, row_idx, X, labelled_group,
589
+ condition_feature);
590
+ if (!e->IsRoot()) {
591
+ auto phi_offset =
592
+ IndexPhiInteractions(row_idx, num_groups, e->group, X.NumCols(),
593
+ e->feature_idx, condition_feature);
594
+ atomicAddDouble(phis_interactions + phi_offset, x);
595
+ // Subtract effect from diagonal
596
+ auto phi_diag =
597
+ IndexPhiInteractions(row_idx, num_groups, e->group, X.NumCols(),
598
+ e->feature_idx, e->feature_idx);
599
+ atomicAddDouble(phis_interactions + phi_diag, -x);
600
+ }
601
+ }
602
+ }
603
+ }
604
+
605
// Host-side launcher for ShapInteractionsKernel. Same grid sizing scheme as
// ComputeShap, but with fewer rows per warp since each row does O(path^2) work.
template <typename DatasetT, typename SizeTAllocatorT, typename PathAllocatorT,
          typename SplitConditionT>
void ComputeShapInteractions(
    DatasetT X,
    const thrust::device_vector<size_t, SizeTAllocatorT>& bin_segments,
    const thrust::device_vector<PathElement<SplitConditionT>, PathAllocatorT>&
        path_elements,
    size_t num_groups, double* phis) {
  size_t bins_per_row = bin_segments.size() - 1;
  const int kBlockThreads = GPUTREESHAP_MAX_THREADS_PER_BLOCK;
  const int warps_per_block = kBlockThreads / 32;
  // Smaller than ComputeShap's 1024: interaction work per row is heavier.
  const int kRowsPerWarp = 100;
  size_t warps_needed = bins_per_row * DivRoundUp(X.NumRows(), kRowsPerWarp);

  const uint32_t grid_size = DivRoundUp(warps_needed, warps_per_block);

  ShapInteractionsKernel<DatasetT, kBlockThreads, kRowsPerWarp>
      <<<grid_size, kBlockThreads>>>(
          X, bins_per_row, path_elements.data().get(),
          bin_segments.data().get(), num_groups, phis);
}
626
+
627
// Shapley-Taylor interaction kernel: like ShapInteractionsKernel but using
// TaylorGroupPath weighting, and with the diagonal computed directly from a
// warp-wide product of zero fractions rather than by subtraction.
template <typename DatasetT, size_t kBlockSize, size_t kRowsPerWarp,
          typename SplitConditionT>
__global__ void __launch_bounds__(GPUTREESHAP_MAX_THREADS_PER_BLOCK)
    ShapTaylorInteractionsKernel(
        DatasetT X, size_t bins_per_row,
        const PathElement<SplitConditionT>* path_elements,
        const size_t* bin_segments, size_t num_groups,
        double* phis_interactions) {
  // Use shared memory for structs, otherwise nvcc puts in local memory
  __shared__ DatasetT s_X;
  if (threadIdx.x == 0) {
    s_X = X;
  }
  __syncthreads();
  __shared__ PathElement<SplitConditionT> s_elements[kBlockSize];
  PathElement<SplitConditionT>* e = &s_elements[threadIdx.x];

  size_t start_row, end_row;
  bool thread_active;
  ConfigureThread<DatasetT, kBlockSize, kRowsPerWarp>(
      s_X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, e,
      &thread_active);
  uint32_t mask = __ballot_sync(FULL_MASK, thread_active);
  if (!thread_active) return;

  auto labelled_group = active_labeled_partition(mask, e->path_idx);

  for (int64_t row_idx = start_row; row_idx < end_row; row_idx++) {
    for (auto condition_rank = 1ull; condition_rank < labelled_group.size();
         condition_rank++) {
      // Reset the element pointer; SwapConditionedElement may redirect it.
      e = &s_elements[threadIdx.x];
      // Compute the diagonal terms
      // TODO(Rory): this can be more efficient
      // Product over all other elements' zero fractions (root and the
      // conditioned element contribute the neutral factor 1).
      float reduce_input =
          e->IsRoot() || labelled_group.thread_rank() == condition_rank
              ? 1.0f
              : e->zero_fraction;
      float reduce =
          labelled_group.reduce(reduce_input, thrust::multiplies<float>());
      if (labelled_group.thread_rank() == condition_rank) {
        float one_fraction = e->split_condition.EvaluateSplit(
            X.GetElement(row_idx, e->feature_idx));
        auto phi_offset =
            IndexPhiInteractions(row_idx, num_groups, e->group, X.NumCols(),
                                 e->feature_idx, e->feature_idx);
        atomicAddDouble(phis_interactions + phi_offset,
                        reduce * (one_fraction - e->zero_fraction) * e->v);
      }

      // Broadcast the conditioned feature from the conditioned lane.
      int64_t condition_feature =
          labelled_group.shfl(e->feature_idx, condition_rank);

      SwapConditionedElement(&e, s_elements, condition_rank, labelled_group);

      float x = ComputePhiCondition<TaylorGroupPath>(
          *e, row_idx, X, labelled_group, condition_feature);
      if (!e->IsRoot()) {
        auto phi_offset =
            IndexPhiInteractions(row_idx, num_groups, e->group, X.NumCols(),
                                 e->feature_idx, condition_feature);
        atomicAddDouble(phis_interactions + phi_offset, x);
      }
    }
  }
}
692
+
693
// Host-side launcher for ShapTaylorInteractionsKernel; grid sizing mirrors
// ComputeShapInteractions.
template <typename DatasetT, typename SizeTAllocatorT, typename PathAllocatorT,
          typename SplitConditionT>
void ComputeShapTaylorInteractions(
    DatasetT X,
    const thrust::device_vector<size_t, SizeTAllocatorT>& bin_segments,
    const thrust::device_vector<PathElement<SplitConditionT>, PathAllocatorT>&
        path_elements,
    size_t num_groups, double* phis) {
  size_t bins_per_row = bin_segments.size() - 1;
  const int kBlockThreads = GPUTREESHAP_MAX_THREADS_PER_BLOCK;
  const int warps_per_block = kBlockThreads / 32;
  // Heavier per-row work than the plain SHAP kernel, hence fewer rows.
  const int kRowsPerWarp = 100;
  size_t warps_needed = bins_per_row * DivRoundUp(X.NumRows(), kRowsPerWarp);

  const uint32_t grid_size = DivRoundUp(warps_needed, warps_per_block);

  ShapTaylorInteractionsKernel<DatasetT, kBlockThreads, kRowsPerWarp>
      <<<grid_size, kBlockThreads>>>(
          X, bins_per_row, path_elements.data().get(),
          bin_segments.data().get(), num_groups, phis);
}
714
+
715
+
716
+ inline __host__ __device__ int64_t Factorial(int64_t x) {
717
+ int64_t y = 1;
718
+ for (auto i = 2; i <= x; i++) {
719
+ y *= i;
720
+ }
721
+ return y;
722
+ }
723
+
724
// Compute factorials in log space using lgamma to avoid overflow
// Shapley permutation weight s! * (n - s - 1)! / n!, evaluated as
// exp(lgamma(s + 1) - lgamma(n + 1) + lgamma(n - s)).
inline __host__ __device__ double W(double s, double n) {
  // Caller must guarantee s <= n - 1 (see the guard in the kernel's cache fill).
  assert(n - s - 1 >= 0);
  return exp(lgamma(s + 1) - lgamma(n + 1) + lgamma(n - s));
}
729
+
730
// Interventional SHAP kernel: computes contributions against a background
// (reference) dataset R, averaging over all background rows. Warp ballots
// encode, per path, which elements each foreground/background row satisfies.
template <typename DatasetT, size_t kBlockSize, size_t kRowsPerWarp,
          typename SplitConditionT>
__global__ void __launch_bounds__(GPUTREESHAP_MAX_THREADS_PER_BLOCK)
    ShapInterventionalKernel(DatasetT X, DatasetT R, size_t bins_per_row,
                             const PathElement<SplitConditionT>* path_elements,
                             const size_t* bin_segments, size_t num_groups,
                             double* phis) {
  // Cache W coefficients
  // 33x33 covers all (s, n) pairs for paths of depth < 32; entries with
  // n - s - 1 < 0 are invalid and zeroed.
  __shared__ float s_W[33][33];
  for (int i = threadIdx.x; i < 33 * 33; i += kBlockSize) {
    auto s = i % 33;
    auto n = i / 33;
    if (n - s - 1 >= 0) {
      s_W[s][n] = W(s, n);
    } else {
      s_W[s][n] = 0.0;
    }
  }

  __syncthreads();

  __shared__ PathElement<SplitConditionT> s_elements[kBlockSize];
  PathElement<SplitConditionT>& e = s_elements[threadIdx.x];

  size_t start_row, end_row;
  bool thread_active;
  ConfigureThread<DatasetT, kBlockSize, kRowsPerWarp>(
      X, bins_per_row, path_elements, bin_segments, &start_row, &end_row, &e,
      &thread_active);

  uint32_t mask = __ballot_sync(FULL_MASK, thread_active);
  if (!thread_active) return;

  auto labelled_group = active_labeled_partition(mask, e.path_idx);

  for (int64_t x_idx = start_row; x_idx < end_row; x_idx++) {
    float result = 0.0f;
    bool x_cond = e.EvaluateSplit(X, x_idx);
    uint32_t x_ballot = labelled_group.ballot(x_cond);
    for (int64_t r_idx = 0; r_idx < R.NumRows(); r_idx++) {
      bool r_cond = e.EvaluateSplit(R, r_idx);
      uint32_t r_ballot = labelled_group.ballot(r_cond);
      assert(!e.IsRoot() ||
             (x_cond == r_cond));  // These should be the same for the root
      // s: elements only the foreground row satisfies; n: elements where the
      // two rows disagree.
      uint32_t s = __popc(x_ballot & ~r_ballot);
      uint32_t n = __popc(x_ballot ^ r_ballot);
      float tmp = 0.0f;
      // Theorem 1
      // (x_cond && !r_cond implies s >= 1, so s - 1 cannot underflow here.)
      if (x_cond && !r_cond) {
        tmp += s_W[s - 1][n];
      }
      tmp -= s_W[s][n] * (r_cond && !x_cond);

      // No foreground samples make it to this leaf, increment bias
      if (e.IsRoot() && s == 0) {
        tmp += 1.0f;
      }
      // If neither foreground or background go down this path, ignore this path
      bool reached_leaf = !labelled_group.ballot(!x_cond && !r_cond);
      tmp *= reached_leaf;
      result += tmp;
    }

    if (result != 0.0) {
      // Average over the background rows.
      result /= R.NumRows();

      // Root writes bias
      auto feature = e.IsRoot() ? X.NumCols() : e.feature_idx;
      atomicAddDouble(
          &phis[IndexPhi(x_idx, num_groups, e.group, X.NumCols(), feature)],
          result * e.v);
    }
  }
}
804
+
805
// Host-side launcher for ShapInterventionalKernel; R is the background
// (reference) dataset averaged over inside the kernel.
template <typename DatasetT, typename SizeTAllocatorT, typename PathAllocatorT,
          typename SplitConditionT>
void ComputeShapInterventional(
    DatasetT X, DatasetT R,
    const thrust::device_vector<size_t, SizeTAllocatorT>& bin_segments,
    const thrust::device_vector<PathElement<SplitConditionT>, PathAllocatorT>&
        path_elements,
    size_t num_groups, double* phis) {
  size_t bins_per_row = bin_segments.size() - 1;
  const int kBlockThreads = GPUTREESHAP_MAX_THREADS_PER_BLOCK;
  const int warps_per_block = kBlockThreads / 32;
  // Each foreground row loops over all of R, so keep row chunks small.
  const int kRowsPerWarp = 100;
  size_t warps_needed = bins_per_row * DivRoundUp(X.NumRows(), kRowsPerWarp);

  const uint32_t grid_size = DivRoundUp(warps_needed, warps_per_block);

  ShapInterventionalKernel<DatasetT, kBlockThreads, kRowsPerWarp>
      <<<grid_size, kBlockThreads>>>(
          X, R, bins_per_row, path_elements.data().get(),
          bin_segments.data().get(), num_groups, phis);
}
826
+
827
// Build CSR-style segment offsets for the bins: bin_segments[i]..[i+1] bounds
// the elements of bin i once paths are sorted by bin. Implemented as an
// atomic histogram of per-bin counts followed by an exclusive scan.
template <typename PathVectorT, typename SizeVectorT, typename DeviceAllocatorT>
void GetBinSegments(const PathVectorT& paths, const SizeVectorT& bin_map,
                    SizeVectorT* bin_segments) {
  DeviceAllocatorT alloc;
  // Number of bins = max bin id + 1.
  size_t num_bins =
      thrust::reduce(thrust::cuda::par(alloc), bin_map.begin(), bin_map.end(),
                     size_t(0), thrust::maximum<size_t>()) +
      1;
  bin_segments->resize(num_bins + 1, 0);
  auto counting = thrust::make_counting_iterator(0llu);
  auto d_paths = paths.data().get();
  auto d_bin_segments = bin_segments->data().get();
  auto d_bin_map = bin_map.data();
  // Histogram: count how many path elements fall in each bin.
  thrust::for_each_n(counting, paths.size(), [=] __device__(size_t idx) {
    auto path_idx = d_paths[idx].path_idx;
    atomicAdd(reinterpret_cast<unsigned long long*>(d_bin_segments) +  // NOLINT
                  d_bin_map[path_idx],
              1);
  });
  // Counts -> starting offsets.
  thrust::exclusive_scan(thrust::cuda::par(alloc), bin_segments->begin(),
                         bin_segments->end(), bin_segments->begin());
}
849
+
850
// Key functor for the ReduceByKey in DeduplicatePaths: elements sharing the
// same (path, feature) pair are duplicates and get merged.
struct DeduplicateKeyTransformOp {
  template <typename SplitConditionT>
  __device__ thrust::pair<size_t, int64_t> operator()(
      const PathElement<SplitConditionT>& e) {
    return {e.path_idx, e.feature_idx};
  }
};
857
+
858
// Convert a failing CUDA status code into a C++ exception.
inline void CheckCuda(cudaError_t err) {
  if (err != cudaSuccess) {
    throw thrust::system_error(err, thrust::cuda_category());
  }
}
863
+
864
// discard_iterator that also advertises a concrete value_type.
// NOTE(review): apparently needed because cub::DeviceReduce::ReduceByKey
// inspects the key output iterator's value_type — confirm against cub docs.
template <typename Return>
class DiscardOverload : public thrust::discard_iterator<Return> {
 public:
  using value_type = Return;  // NOLINT
};
869
+
870
// Merge duplicate (path, feature) elements: sorts so duplicates are adjacent,
// then reduces each run into one element whose split condition is the merge of
// the duplicates and whose zero fraction is their product. Output goes to
// *deduplicated_paths; *device_paths is left sorted (its contents reordered).
template <typename PathVectorT, typename DeviceAllocatorT,
          typename SplitConditionT>
void DeduplicatePaths(PathVectorT* device_paths,
                      PathVectorT* deduplicated_paths) {
  DeviceAllocatorT alloc;
  // Sort by feature
  thrust::sort(thrust::cuda::par(alloc), device_paths->begin(),
               device_paths->end(),
               [=] __device__(const PathElement<SplitConditionT>& a,
                              const PathElement<SplitConditionT>& b) {
                 if (a.path_idx < b.path_idx) return true;
                 if (b.path_idx < a.path_idx) return false;

                 if (a.feature_idx < b.feature_idx) return true;
                 if (b.feature_idx < a.feature_idx) return false;
                 return false;
               });

  // Upper bound; shrunk to the real size after the reduction.
  deduplicated_paths->resize(device_paths->size());

  using Pair = thrust::pair<size_t, int64_t>;
  auto key_transform = thrust::make_transform_iterator(
      device_paths->begin(), DeduplicateKeyTransformOp());

  thrust::device_vector<size_t> d_num_runs_out(1);
  // Pinned host memory for the device->host copy of the run count.
  size_t* h_num_runs_out;
  CheckCuda(cudaMallocHost(&h_num_runs_out, sizeof(size_t)));

  auto combine = [] __device__(PathElement<SplitConditionT> a,
                               PathElement<SplitConditionT> b) {
    // Combine duplicate features
    a.split_condition.Merge(b.split_condition);
    a.zero_fraction *= b.zero_fraction;
    return a;
  };  // NOLINT
  // First call with nullptr only queries the temporary storage size.
  size_t temp_size = 0;
  CheckCuda(cub::DeviceReduce::ReduceByKey(
      nullptr, temp_size, key_transform, DiscardOverload<Pair>(),
      device_paths->begin(), deduplicated_paths->begin(),
      d_num_runs_out.begin(), combine, device_paths->size()));
  using TempAlloc = RebindVector<char, DeviceAllocatorT>;
  TempAlloc tmp(temp_size);
  // Second call performs the actual reduction.
  CheckCuda(cub::DeviceReduce::ReduceByKey(
      tmp.data().get(), temp_size, key_transform, DiscardOverload<Pair>(),
      device_paths->begin(), deduplicated_paths->begin(),
      d_num_runs_out.begin(), combine, device_paths->size()));

  CheckCuda(cudaMemcpy(h_num_runs_out, d_num_runs_out.data().get(),
                       sizeof(size_t), cudaMemcpyDeviceToHost));
  deduplicated_paths->resize(*h_num_runs_out);
  CheckCuda(cudaFreeHost(h_num_runs_out));
}
922
+
923
// Sort path elements lexicographically by (bin, path, feature) so that each
// bin's elements are contiguous, matching the segments from GetBinSegments.
template <typename PathVectorT, typename SplitConditionT, typename SizeVectorT,
          typename DeviceAllocatorT>
void SortPaths(PathVectorT* paths, const SizeVectorT& bin_map) {
  auto d_bin_map = bin_map.data();
  DeviceAllocatorT alloc;
  thrust::sort(thrust::cuda::par(alloc), paths->begin(), paths->end(),
               [=] __device__(const PathElement<SplitConditionT>& a,
                              const PathElement<SplitConditionT>& b) {
                 size_t a_bin = d_bin_map[a.path_idx];
                 size_t b_bin = d_bin_map[b.path_idx];
                 if (a_bin < b_bin) return true;
                 if (b_bin < a_bin) return false;

                 if (a.path_idx < b.path_idx) return true;
                 if (b.path_idx < a.path_idx) return false;

                 if (a.feature_idx < b.feature_idx) return true;
                 if (b.feature_idx < a.feature_idx) return false;
                 return false;
               });
}
944
+
945
+ using kv = std::pair<size_t, int>;
946
+
947
+ struct BFDCompare {
948
+ bool operator()(const kv& lhs, const kv& rhs) const {
949
+ if (lhs.second == rhs.second) {
950
+ return lhs.first < rhs.first;
951
+ }
952
+ return lhs.second < rhs.second;
953
+ }
954
+ };
955
+
956
// Best Fit Decreasing bin packing
// Efficient O(nlogn) implementation with balanced tree using std::set
// Packs paths (by length) into warp-sized bins; returns, for each path index,
// the bin it was assigned to. Items longer than bin_limit each get their own
// (over-full) bin.
template <typename IntVectorT>
std::vector<size_t> BFDBinPacking(const IntVectorT& counts,
                                  int bin_limit = 32) {
  thrust::host_vector<int> counts_host(counts);
  std::vector<kv> path_lengths(counts_host.size());
  for (auto i = 0ull; i < counts_host.size(); i++) {
    path_lengths[i] = {i, counts_host[i]};
  }

  // Process items longest-first (the "Decreasing" in BFD).
  std::sort(path_lengths.begin(), path_lengths.end(),
            [&](const kv& a, const kv& b) {
              std::greater<> op;
              return op(a.second, b.second);
            });

  // map unique_id -> bin
  std::vector<size_t> bin_map(counts_host.size());
  // Set of (bin index, remaining capacity), ordered by capacity (BFDCompare).
  std::set<kv, BFDCompare> bin_capacities;
  bin_capacities.insert({bin_capacities.size(), bin_limit});
  for (auto pair : path_lengths) {
    int new_size = pair.second;
    // Smallest bin whose remaining capacity is >= new_size ("best fit").
    auto itr = bin_capacities.lower_bound({0, new_size});
    // Does not fit in any bin
    if (itr == bin_capacities.end()) {
      size_t new_bin_idx = bin_capacities.size();
      bin_capacities.insert({new_bin_idx, bin_limit - new_size});
      bin_map[pair.first] = new_bin_idx;
    } else {
      // std::set keys are immutable: remove, adjust capacity, re-insert.
      kv entry = *itr;
      entry.second -= new_size;
      bin_map[pair.first] = entry.first;
      bin_capacities.erase(itr);
      bin_capacities.insert(entry);
    }
  }

  return bin_map;
}
996
+
997
+ // First Fit Decreasing bin packing
998
+ // Inefficient O(n^2) implementation
999
+ template <typename IntVectorT>
1000
+ std::vector<size_t> FFDBinPacking(const IntVectorT& counts,
1001
+ int bin_limit = 32) {
1002
+ thrust::host_vector<int> counts_host(counts);
1003
+ std::vector<kv> path_lengths(counts_host.size());
1004
+ for (auto i = 0ull; i < counts_host.size(); i++) {
1005
+ path_lengths[i] = {i, counts_host[i]};
1006
+ }
1007
+ std::sort(path_lengths.begin(), path_lengths.end(),
1008
+ [&](const kv& a, const kv& b) {
1009
+ std::greater<> op;
1010
+ return op(a.second, b.second);
1011
+ });
1012
+
1013
+ // map unique_id -> bin
1014
+ std::vector<size_t> bin_map(counts_host.size());
1015
+ std::vector<int> bin_capacities(path_lengths.size(), bin_limit);
1016
+ for (auto pair : path_lengths) {
1017
+ int new_size = pair.second;
1018
+ for (auto j = 0ull; j < bin_capacities.size(); j++) {
1019
+ int& capacity = bin_capacities[j];
1020
+
1021
+ if (capacity >= new_size) {
1022
+ capacity -= new_size;
1023
+ bin_map[pair.first] = j;
1024
+ break;
1025
+ }
1026
+ }
1027
+ }
1028
+
1029
+ return bin_map;
1030
+ }
1031
+
1032
+ // Next Fit bin packing
1033
+ // O(n) implementation
1034
+ template <typename IntVectorT>
1035
+ std::vector<size_t> NFBinPacking(const IntVectorT& counts, int bin_limit = 32) {
1036
+ thrust::host_vector<int> counts_host(counts);
1037
+ std::vector<size_t> bin_map(counts_host.size());
1038
+ size_t current_bin = 0;
1039
+ int current_capacity = bin_limit;
1040
+ for (auto i = 0ull; i < counts_host.size(); i++) {
1041
+ int new_size = counts_host[i];
1042
+ size_t path_idx = i;
1043
+ if (new_size <= current_capacity) {
1044
+ current_capacity -= new_size;
1045
+ bin_map[path_idx] = current_bin;
1046
+ } else {
1047
+ current_capacity = bin_limit - new_size;
1048
+ bin_map[path_idx] = ++current_bin;
1049
+ }
1050
+ }
1051
+ return bin_map;
1052
+ }
1053
+
1054
// Count the number of elements in each unique path via an atomic histogram.
// Assumes device_paths is non-empty and its last element carries the largest
// path_idx (i.e. the vector is sorted by path_idx, as after DeduplicatePaths).
template <typename DeviceAllocatorT, typename SplitConditionT,
          typename PathVectorT, typename LengthVectorT>
void GetPathLengths(const PathVectorT& device_paths,
                    LengthVectorT* path_lengths) {
  // One counter per path id, zero-initialised.
  path_lengths->resize(
      static_cast<PathElement<SplitConditionT>>(device_paths.back()).path_idx +
          1,
      0);
  auto counting = thrust::make_counting_iterator(0llu);
  auto d_paths = device_paths.data().get();
  auto d_lengths = path_lengths->data().get();
  thrust::for_each_n(counting, device_paths.size(), [=] __device__(size_t idx) {
    auto path_idx = d_paths[idx].path_idx;
    atomicAdd(d_lengths + path_idx, 1ull);
  });
}
1070
+
1071
// Predicate: 1 if a path has more than 32 elements (too long for one warp).
struct PathTooLongOp {
  __device__ size_t operator()(size_t length) { return length > 32; }
};
1074
+
1075
// Predicate over adjacent sorted elements: 1 if two elements of the same path
// disagree on the leaf value v, which is invalid input.
template <typename SplitConditionT>
struct IncorrectVOp {
  const PathElement<SplitConditionT>* paths;
  __device__ size_t operator()(size_t idx) {
    // Caller starts the counting iterator at 1, so idx - 1 is always valid.
    auto a = paths[idx - 1];
    auto b = paths[idx];
    return a.path_idx == b.path_idx && a.v != b.v;
  }
};
1084
+
1085
// Validate user-supplied paths: every path must fit in a warp (< 32 deep after
// deduplication) and all elements of a path must share the same leaf value v.
// Throws std::invalid_argument on violation.
template <typename DeviceAllocatorT, typename SplitConditionT,
          typename PathVectorT, typename LengthVectorT>
void ValidatePaths(const PathVectorT& device_paths,
                   const LengthVectorT& path_lengths) {
  DeviceAllocatorT alloc;
  PathTooLongOp too_long_op;
  auto invalid_length =
      thrust::any_of(thrust::cuda::par(alloc), path_lengths.begin(),
                     path_lengths.end(), too_long_op);

  if (invalid_length) {
    throw std::invalid_argument("Tree depth must be < 32");
  }

  // Compare each element with its predecessor; start at index 1.
  IncorrectVOp<SplitConditionT> incorrect_v_op{device_paths.data().get()};
  auto counting = thrust::counting_iterator<size_t>(0);
  auto incorrect_v =
      thrust::any_of(thrust::cuda::par(alloc), counting + 1,
                     counting + device_paths.size(), incorrect_v_op);

  if (incorrect_v) {
    throw std::invalid_argument(
        "Leaf value v should be the same across a single path");
  }
}
1110
+
1111
// Full path preprocessing pipeline: deduplicate (path, feature) elements,
// measure path lengths, bin-pack paths into warp-sized bins, validate, then
// sort elements by bin and compute per-bin segment offsets for the kernels.
template <typename DeviceAllocatorT, typename SplitConditionT,
          typename PathVectorT, typename SizeVectorT>
void PreprocessPaths(PathVectorT* device_paths, PathVectorT* deduplicated_paths,
                     SizeVectorT* bin_segments) {
  // Sort paths by length and feature
  detail::DeduplicatePaths<PathVectorT, DeviceAllocatorT, SplitConditionT>(
      device_paths, deduplicated_paths);
  using int_vector = RebindVector<int, DeviceAllocatorT>;
  int_vector path_lengths;
  detail::GetPathLengths<DeviceAllocatorT, SplitConditionT>(*deduplicated_paths,
                                                            &path_lengths);
  // Best Fit Decreasing packing keeps warp occupancy high.
  SizeVectorT device_bin_map = detail::BFDBinPacking(path_lengths);
  ValidatePaths<DeviceAllocatorT, SplitConditionT>(*deduplicated_paths,
                                                   path_lengths);
  detail::SortPaths<PathVectorT, SplitConditionT, SizeVectorT,
                    DeviceAllocatorT>(deduplicated_paths, device_bin_map);
  detail::GetBinSegments<PathVectorT, SizeVectorT, DeviceAllocatorT>(
      *deduplicated_paths, device_bin_map, bin_segments);
}
1130
+
1131
// Key functor: extract the path index, for reductions keyed by path.
struct PathIdxTransformOp {
  template <typename SplitConditionT>
  __device__ size_t operator()(const PathElement<SplitConditionT>& e) {
    return e.path_idx;
  }
};
1137
+
1138
// Key functor: extract the output group, for reductions keyed by group.
struct GroupIdxTransformOp {
  template <typename SplitConditionT>
  __device__ size_t operator()(const PathElement<SplitConditionT>& e) {
    return e.group;
  }
};
1144
+
1145
// Value functor: a path's bias term is its combined zero fraction times its
// leaf value (computed in double for accumulation accuracy).
struct BiasTransformOp {
  template <typename SplitConditionT>
  __device__ double operator()(const PathElement<SplitConditionT>& e) {
    return e.zero_fraction * e.v;
  }
};
1151
+
1152
// While it is possible to compute bias in the primary kernel, we do it here
// using double precision to avoid numerical stability issues.
// Pipeline: sort a copy of the paths by (group, path), collapse each path to
// one element carrying the product of its zero fractions, then sum
// zero_fraction * v per group and scatter the sums into *bias (one per group).
template <typename PathVectorT, typename DoubleVectorT,
          typename DeviceAllocatorT, typename SplitConditionT>
void ComputeBias(const PathVectorT& device_paths, DoubleVectorT* bias) {
  using double_vector = thrust::device_vector<
      double, typename DeviceAllocatorT::template rebind<double>::other>;
  // Work on a copy; the caller's path vector is left untouched.
  PathVectorT sorted_paths(device_paths);
  DeviceAllocatorT alloc;
  // Make sure groups are contiguous
  thrust::sort(thrust::cuda::par(alloc), sorted_paths.begin(),
               sorted_paths.end(),
               [=] __device__(const PathElement<SplitConditionT>& a,
                              const PathElement<SplitConditionT>& b) {
                 if (a.group < b.group) return true;
                 if (b.group < a.group) return false;

                 if (a.path_idx < b.path_idx) return true;
                 if (b.path_idx < a.path_idx) return false;

                 return false;
               });
  // Combine zero fraction for all paths
  auto path_key = thrust::make_transform_iterator(sorted_paths.begin(),
                                                  PathIdxTransformOp());
  PathVectorT combined(sorted_paths.size());
  auto combined_out = thrust::reduce_by_key(
      thrust::cuda::par(alloc), path_key, path_key + sorted_paths.size(),
      sorted_paths.begin(), thrust::make_discard_iterator(), combined.begin(),
      thrust::equal_to<size_t>(),
      [=] __device__(PathElement<SplitConditionT> a,
                     const PathElement<SplitConditionT>& b) {
        a.zero_fraction *= b.zero_fraction;
        return a;
      });
  size_t num_paths = combined_out.second - combined.begin();
  // Combine bias for each path, over each group
  using size_vector = thrust::device_vector<
      size_t, typename DeviceAllocatorT::template rebind<size_t>::other>;
  size_vector keys_out(num_paths);
  double_vector values_out(num_paths);
  auto group_key =
      thrust::make_transform_iterator(combined.begin(), GroupIdxTransformOp());
  auto values =
      thrust::make_transform_iterator(combined.begin(), BiasTransformOp());

  auto out_itr = thrust::reduce_by_key(thrust::cuda::par(alloc), group_key,
                                       group_key + num_paths, values,
                                       keys_out.begin(), values_out.begin());

  // Write result
  // Scatter the per-group sums into the caller's bias array by group id.
  size_t n = out_itr.first - keys_out.begin();
  auto counting = thrust::make_counting_iterator(0llu);
  auto d_keys_out = keys_out.data().get();
  auto d_values_out = values_out.data().get();
  auto d_bias = bias->data().get();
  thrust::for_each_n(counting, n, [=] __device__(size_t idx) {
    d_bias[d_keys_out[idx]] = d_values_out[idx];
  });
}
1212
+
1213
+ }; // namespace detail
1214
+
1215
/*!
 * Compute feature contributions on the GPU given a set of unique paths through
 * a tree ensemble and a dataset. Uses device memory proportional to the tree
 * ensemble size.
 *
 * \exception std::invalid_argument Thrown when an invalid argument error
 * condition occurs. \tparam PathIteratorT Thrust type iterator, may be
 * thrust::device_ptr for device memory, or stl iterator/raw pointer for host
 * memory. \tparam PhiIteratorT Thrust type iterator, may be
 * thrust::device_ptr for device memory, or stl iterator/raw pointer for host
 * memory. Value type must be floating point. \tparam DatasetT User-specified
 * dataset container. \tparam DeviceAllocatorT Optional thrust style
 * allocator.
 *
 * \param X          Thin wrapper over a dataset allocated in device memory. X
 * should be trivially copyable as a kernel parameter (i.e. contain only
 * pointers to actual data) and must implement the methods
 * NumRows()/NumCols()/GetElement(size_t row_idx, size_t col_idx) as __device__
 * functions. GetElement may return NaN where the feature value is missing.
 * \param begin      Iterator to paths, where separate paths are delineated by
 *                   PathElement.path_idx. Each unique path should contain 1
 * root with feature_idx = -1 and zero_fraction = 1.0. The ordering of path
 * elements inside a unique path does not matter - the result will be the same.
 * Paths may contain duplicate features. See the PathElement class for more
 * information. \param end        Path end iterator. \param num_groups Number
 * of output groups. In multiclass classification the algorithm outputs feature
 * contributions per output class. \param phis_begin Begin iterator for output
 * phis. \param phis_end   End iterator for output phis.
 */
template <typename DeviceAllocatorT = thrust::device_allocator<int>,
          typename DatasetT, typename PathIteratorT, typename PhiIteratorT>
void GPUTreeShap(DatasetT X, PathIteratorT begin, PathIteratorT end,
                 size_t num_groups, PhiIteratorT phis_begin,
                 PhiIteratorT phis_end) {
  // Enforce the documented "value type must be floating point" contract at
  // compile time, consistent with GPUTreeShapTaylorInteractions; integral
  // phis would silently truncate the accumulated contributions.
  using phis_type = typename std::iterator_traits<PhiIteratorT>::value_type;
  static_assert(std::is_floating_point<phis_type>::value,
                "Phis type must be floating point");

  // Nothing to do for an empty dataset or an empty path set.
  if (X.NumRows() == 0 || X.NumCols() == 0 || end - begin <= 0) return;

  // Output layout is one phi per (row, feature + bias slot, group).
  if (size_t(phis_end - phis_begin) <
      X.NumRows() * (X.NumCols() + 1) * num_groups) {
    throw std::invalid_argument(
        "phis_out must be at least of size X.NumRows() * (X.NumCols() + 1) * "
        "num_groups");
  }

  using size_vector = detail::RebindVector<size_t, DeviceAllocatorT>;
  using double_vector = detail::RebindVector<double, DeviceAllocatorT>;
  using path_vector = detail::RebindVector<
      typename std::iterator_traits<PathIteratorT>::value_type,
      DeviceAllocatorT>;
  using split_condition =
      typename std::iterator_traits<PathIteratorT>::value_type::split_type;

  // Accumulate in double precision on the device, then copy to phis_begin.
  double_vector temp_phi(phis_end - phis_begin, 0.0);
  path_vector device_paths(begin, end);

  // Compute the global bias and scatter it into every row's bias slot.
  double_vector bias(num_groups, 0.0);
  detail::ComputeBias<path_vector, double_vector, DeviceAllocatorT,
                      split_condition>(device_paths, &bias);
  auto d_bias = bias.data().get();
  auto d_temp_phi = temp_phi.data().get();
  thrust::for_each_n(thrust::make_counting_iterator(0llu),
                     X.NumRows() * num_groups, [=] __device__(size_t idx) {
                       size_t group = idx % num_groups;
                       size_t row_idx = idx / num_groups;
                       d_temp_phi[IndexPhi(row_idx, num_groups, group,
                                           X.NumCols(), X.NumCols())] +=
                           d_bias[group];
                     });

  // Deduplicate features within paths and bin paths for the kernel launch.
  path_vector deduplicated_paths;
  size_vector device_bin_segments;
  detail::PreprocessPaths<DeviceAllocatorT, split_condition>(
      &device_paths, &deduplicated_paths, &device_bin_segments);

  detail::ComputeShap(X, device_bin_segments, deduplicated_paths, num_groups,
                      temp_phi.data().get());
  thrust::copy(temp_phi.begin(), temp_phi.end(), phis_begin);
}
1292
+
1293
/*!
 * Compute feature interaction contributions on the GPU given a set of unique
 * paths through a tree ensemble and a dataset. Uses device memory
 * proportional to the tree ensemble size.
 *
 * \exception std::invalid_argument Thrown when an invalid argument error
 * condition occurs.
 * \tparam  DeviceAllocatorT  Optional thrust style allocator.
 * \tparam  DatasetT          User-specified dataset container.
 * \tparam  PathIteratorT     Thrust type iterator, may be thrust::device_ptr
 *                            for device memory, or stl iterator/raw pointer
 *                            for host memory.
 * \tparam  PhiIteratorT      Thrust type iterator, may be thrust::device_ptr
 *                            for device memory, or stl iterator/raw pointer
 *                            for host memory. Value type must be floating
 *                            point.
 *
 * \param X          Thin wrapper over a dataset allocated in device memory. X
 *                   should be trivially copyable as a kernel parameter (i.e.
 *                   contain only pointers to actual data) and must implement
 *                   the methods NumRows()/NumCols()/GetElement(size_t row_idx,
 *                   size_t col_idx) as __device__ functions. GetElement may
 *                   return NaN where the feature value is missing.
 * \param begin      Iterator to paths, where separate paths are delineated by
 *                   PathElement.path_idx. Each unique path should contain 1
 *                   root with feature_idx = -1 and zero_fraction = 1.0. The
 *                   ordering of path elements inside a unique path does not
 *                   matter - the result will be the same. Paths may contain
 *                   duplicate features. See the PathElement class for more
 *                   information.
 * \param end        Path end iterator.
 * \param num_groups Number of output groups. In multiclass classification the
 *                   algorithm outputs feature contributions per output class.
 * \param phis_begin Begin iterator for output phis.
 * \param phis_end   End iterator for output phis.
 */
template <typename DeviceAllocatorT = thrust::device_allocator<int>,
          typename DatasetT, typename PathIteratorT, typename PhiIteratorT>
void GPUTreeShapInteractions(DatasetT X, PathIteratorT begin, PathIteratorT end,
                             size_t num_groups, PhiIteratorT phis_begin,
                             PhiIteratorT phis_end) {
  // Enforce the documented "value type must be floating point" contract at
  // compile time, consistent with GPUTreeShapTaylorInteractions.
  using phis_type = typename std::iterator_traits<PhiIteratorT>::value_type;
  static_assert(std::is_floating_point<phis_type>::value,
                "Phis type must be floating point");

  if (X.NumRows() == 0 || X.NumCols() == 0 || end - begin <= 0) return;

  // Interactions use a square (feature + bias) x (feature + bias) layout.
  if (size_t(phis_end - phis_begin) <
      X.NumRows() * (X.NumCols() + 1) * (X.NumCols() + 1) * num_groups) {
    throw std::invalid_argument(
        "phis_out must be at least of size X.NumRows() * (X.NumCols() + 1) * "
        "(X.NumCols() + 1) * "
        "num_groups");
  }

  using size_vector = detail::RebindVector<size_t, DeviceAllocatorT>;
  using double_vector = detail::RebindVector<double, DeviceAllocatorT>;
  using path_vector = detail::RebindVector<
      typename std::iterator_traits<PathIteratorT>::value_type,
      DeviceAllocatorT>;
  using split_condition =
      typename std::iterator_traits<PathIteratorT>::value_type::split_type;

  // Accumulate in double precision on the device, then copy to phis_begin.
  double_vector temp_phi(phis_end - phis_begin, 0.0);
  path_vector device_paths(begin, end);

  // Compute the global bias and add it to each row's (bias, bias) cell.
  double_vector bias(num_groups, 0.0);
  detail::ComputeBias<path_vector, double_vector, DeviceAllocatorT,
                      split_condition>(device_paths, &bias);
  auto d_bias = bias.data().get();
  auto d_temp_phi = temp_phi.data().get();
  thrust::for_each_n(
      thrust::make_counting_iterator(0llu), X.NumRows() * num_groups,
      [=] __device__(size_t idx) {
        size_t group = idx % num_groups;
        size_t row_idx = idx / num_groups;
        d_temp_phi[IndexPhiInteractions(row_idx, num_groups, group, X.NumCols(),
                                        X.NumCols(), X.NumCols())] +=
            d_bias[group];
      });

  // Deduplicate features within paths and bin paths for the kernel launch.
  path_vector deduplicated_paths;
  size_vector device_bin_segments;
  detail::PreprocessPaths<DeviceAllocatorT, split_condition>(
      &device_paths, &deduplicated_paths, &device_bin_segments);

  detail::ComputeShapInteractions(X, device_bin_segments, deduplicated_paths,
                                  num_groups, temp_phi.data().get());
  thrust::copy(temp_phi.begin(), temp_phi.end(), phis_begin);
}
1377
+
1378
/*!
 * Compute feature interaction contributions using the Shapley Taylor index on
 * the GPU, given a set of unique paths through a tree ensemble and a dataset.
 * Uses device memory proportional to the tree ensemble size.
 *
 * \exception std::invalid_argument Thrown when an invalid argument error
 * condition occurs.
 * \tparam  PhiIteratorT   Thrust type iterator, may be thrust::device_ptr
 *                         for device memory, or stl iterator/raw pointer for
 *                         host memory. Value type must be floating point.
 * \tparam  PathIteratorT  Thrust type iterator, may be thrust::device_ptr
 *                         for device memory, or stl iterator/raw pointer for
 *                         host memory.
 * \tparam  DatasetT          User-specified dataset container.
 * \tparam  DeviceAllocatorT  Optional thrust style allocator.
 *
 * \param X          Thin wrapper over a dataset allocated in device memory. X
 *                   should be trivially copyable as a kernel parameter (i.e.
 *                   contain only pointers to actual data) and must implement
 *                   the methods NumRows()/NumCols()/GetElement(size_t row_idx,
 *                   size_t col_idx) as __device__ functions. GetElement may
 *                   return NaN where the feature value is missing.
 * \param begin      Iterator to paths, where separate paths are delineated by
 *                   PathElement.path_idx. Each unique path should contain 1
 *                   root with feature_idx = -1 and zero_fraction = 1.0. The
 *                   ordering of path elements inside a unique path does not
 *                   matter - the result will be the same. Paths may contain
 *                   duplicate features. See the PathElement class for more
 *                   information.
 * \param end        Path end iterator.
 * \param num_groups Number of output groups. In multiclass classification the
 *                   algorithm outputs feature contributions per output class.
 * \param phis_begin Begin iterator for output phis.
 * \param phis_end   End iterator for output phis.
 */
template <typename DeviceAllocatorT = thrust::device_allocator<int>,
          typename DatasetT, typename PathIteratorT, typename PhiIteratorT>
void GPUTreeShapTaylorInteractions(DatasetT X, PathIteratorT begin,
                                   PathIteratorT end, size_t num_groups,
                                   PhiIteratorT phis_begin,
                                   PhiIteratorT phis_end) {
  // Integral phi types would silently truncate accumulated contributions.
  using phis_type = typename std::iterator_traits<PhiIteratorT>::value_type;
  static_assert(std::is_floating_point<phis_type>::value,
                "Phis type must be floating point");

  // Nothing to do for an empty dataset or an empty path set.
  if (X.NumRows() == 0 || X.NumCols() == 0 || end - begin <= 0) return;

  // Interactions use a square (feature + bias) x (feature + bias) layout.
  if (size_t(phis_end - phis_begin) <
      X.NumRows() * (X.NumCols() + 1) * (X.NumCols() + 1) * num_groups) {
    throw std::invalid_argument(
        "phis_out must be at least of size X.NumRows() * (X.NumCols() + 1) * "
        "(X.NumCols() + 1) * "
        "num_groups");
  }

  using size_vector = detail::RebindVector<size_t, DeviceAllocatorT>;
  using double_vector = detail::RebindVector<double, DeviceAllocatorT>;
  using path_vector = detail::RebindVector<
      typename std::iterator_traits<PathIteratorT>::value_type,
      DeviceAllocatorT>;
  using split_condition =
      typename std::iterator_traits<PathIteratorT>::value_type::split_type;

  // Compute the global bias; accumulate in double precision on the device.
  double_vector temp_phi(phis_end - phis_begin, 0.0);
  path_vector device_paths(begin, end);
  double_vector bias(num_groups, 0.0);
  detail::ComputeBias<path_vector, double_vector, DeviceAllocatorT,
                      split_condition>(device_paths, &bias);
  auto d_bias = bias.data().get();
  auto d_temp_phi = temp_phi.data().get();
  // Scatter the per-group bias into every row's (bias, bias) cell.
  thrust::for_each_n(
      thrust::make_counting_iterator(0llu), X.NumRows() * num_groups,
      [=] __device__(size_t idx) {
        size_t group = idx % num_groups;
        size_t row_idx = idx / num_groups;
        d_temp_phi[IndexPhiInteractions(row_idx, num_groups, group, X.NumCols(),
                                        X.NumCols(), X.NumCols())] +=
            d_bias[group];
      });

  // Deduplicate features within paths and bin paths for the kernel launch.
  path_vector deduplicated_paths;
  size_vector device_bin_segments;
  detail::PreprocessPaths<DeviceAllocatorT, split_condition>(
      &device_paths, &deduplicated_paths, &device_bin_segments);

  detail::ComputeShapTaylorInteractions(X, device_bin_segments,
                                        deduplicated_paths, num_groups,
                                        temp_phi.data().get());
  thrust::copy(temp_phi.begin(), temp_phi.end(), phis_begin);
}
1469
+
1470
/*!
 * Compute feature contributions on the GPU given a set of unique paths through a tree ensemble
 * and a dataset. Uses device memory proportional to the tree ensemble size. This variant
 * implements the interventional tree shap algorithm described here:
 * https://drafts.distill.pub/HughChen/its_blog/
 *
 * It requires a background dataset R.
 *
 * \exception std::invalid_argument Thrown when an invalid argument error condition occurs.
 * \tparam  DeviceAllocatorT  Optional thrust style allocator.
 * \tparam  DatasetT          User-specified dataset container.
 * \tparam  PathIteratorT     Thrust type iterator, may be thrust::device_ptr for device memory, or
 *                            stl iterator/raw pointer for host memory.
 *
 * \param X          Thin wrapper over a dataset allocated in device memory. X should be trivially
 *                   copyable as a kernel parameter (i.e. contain only pointers to actual data) and
 *                   must implement the methods NumRows()/NumCols()/GetElement(size_t row_idx,
 *                   size_t col_idx) as __device__ functions. GetElement may return NaN where the
 *                   feature value is missing.
 * \param R          Background dataset.
 * \param begin      Iterator to paths, where separate paths are delineated by
 *                   PathElement.path_idx. Each unique path should contain 1 root with feature_idx =
 *                   -1 and zero_fraction = 1.0. The ordering of path elements inside a unique path
 *                   does not matter - the result will be the same. Paths may contain duplicate
 *                   features. See the PathElement class for more information.
 * \param end        Path end iterator.
 * \param num_groups Number of output groups. In multiclass classification the algorithm outputs
 *                   feature contributions per output class.
 * \param phis_begin Begin iterator for output phis.
 * \param phis_end   End iterator for output phis.
 */
template <typename DeviceAllocatorT = thrust::device_allocator<int>,
          typename DatasetT, typename PathIteratorT, typename PhiIteratorT>
void GPUTreeShapInterventional(DatasetT X, DatasetT R, PathIteratorT begin,
                               PathIteratorT end, size_t num_groups,
                               PhiIteratorT phis_begin, PhiIteratorT phis_end) {
  // Enforce a floating-point phi type at compile time, consistent with
  // GPUTreeShapTaylorInteractions; integral phis would silently truncate.
  using phis_type = typename std::iterator_traits<PhiIteratorT>::value_type;
  static_assert(std::is_floating_point<phis_type>::value,
                "Phis type must be floating point");

  if (X.NumRows() == 0 || X.NumCols() == 0 || end - begin <= 0) return;

  // Output layout is one phi per (row, feature + bias slot, group).
  if (size_t(phis_end - phis_begin) <
      X.NumRows() * (X.NumCols() + 1) * num_groups) {
    throw std::invalid_argument(
        "phis_out must be at least of size X.NumRows() * (X.NumCols() + 1) * "
        "num_groups");
  }

  using size_vector = detail::RebindVector<size_t, DeviceAllocatorT>;
  using double_vector = detail::RebindVector<double, DeviceAllocatorT>;
  using path_vector = detail::RebindVector<
      typename std::iterator_traits<PathIteratorT>::value_type,
      DeviceAllocatorT>;
  using split_condition =
      typename std::iterator_traits<PathIteratorT>::value_type::split_type;

  // Accumulate in double precision on the device, then copy to phis_begin.
  // Unlike the path-dependent variants, no explicit bias pass is needed here.
  double_vector temp_phi(phis_end - phis_begin, 0.0);
  path_vector device_paths(begin, end);

  path_vector deduplicated_paths;
  size_vector device_bin_segments;
  detail::PreprocessPaths<DeviceAllocatorT, split_condition>(
      &device_paths, &deduplicated_paths, &device_bin_segments);
  detail::ComputeShapInterventional(X, R, device_bin_segments,
                                    deduplicated_paths, num_groups,
                                    temp_phi.data().get());
  thrust::copy(temp_phi.begin(), temp_phi.end(), phis_begin);
}
1535
+ } // namespace gpu_treeshap
lib/shap/cext/tree_shap.h ADDED
@@ -0,0 +1,1460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Fast recursive computation of SHAP values in trees.
3
+ * See https://arxiv.org/abs/1802.03888 for details.
4
+ *
5
+ * Scott Lundberg, 2018 (independent algorithm courtesy of Hugh Chen 2018)
6
+ */
7
+
8
+ #include <algorithm>
9
+ #include <iostream>
10
+ #include <fstream>
11
+ #include <stdio.h>
12
+ #include <cmath>
13
+ #include <ctime>
14
+ #if defined(_WIN32) || defined(WIN32)
15
+ #include <malloc.h>
16
+ #elif defined(__MVS__)
17
+ #include <stdlib.h>
18
+ #else
19
+ #include <alloca.h>
20
+ #endif
21
// NOTE(review): `using namespace std;` in a header leaks into every
// translation unit that includes it; kept as-is to avoid changing name
// lookup for existing code.
using namespace std;

// Floating-point precision used throughout the tree SHAP routines.
typedef double tfloat;
// Signature shared by all output transforms: (margin, label) -> value.
typedef tfloat (* transform_f)(const tfloat margin, const tfloat y);

// Named constants selecting the feature-dependence assumption used when
// explaining a model.
namespace FEATURE_DEPENDENCE {
    const unsigned independent = 0;
    const unsigned tree_path_dependent = 1;
    const unsigned global_path_dependent = 2;
}
31
+
32
/**
 * Flat, array-based view of a tree ensemble.
 *
 * All per-node arrays are laid out as [tree][node] with `max_nodes` slots per
 * tree; `values` additionally has `num_outputs` entries per node. The struct
 * does not own its pointers unless they were created via allocate(), in which
 * case free() must be called explicitly (the destructor does not release
 * them).
 */
struct TreeEnsemble {
    int *children_left;         // left child index per node; < 0 marks a leaf
    int *children_right;        // right child index per node
    int *children_default;      // child taken when the split feature is missing
    int *features;              // split feature index per node
    tfloat *thresholds;         // split threshold per node (go left if x <= t)
    tfloat *values;             // per-node output values, num_outputs each
    tfloat *node_sample_weights; // training sample weight that reached each node
    unsigned max_depth;
    unsigned tree_limit;        // number of trees in this view
    tfloat *base_offset;        // per-output additive bias of the ensemble
    unsigned max_nodes;         // node capacity (stride) per tree
    unsigned num_outputs;

    TreeEnsemble() {}
    TreeEnsemble(int *children_left, int *children_right, int *children_default, int *features,
                 tfloat *thresholds, tfloat *values, tfloat *node_sample_weights,
                 unsigned max_depth, unsigned tree_limit, tfloat *base_offset,
                 unsigned max_nodes, unsigned num_outputs) :
        children_left(children_left), children_right(children_right),
        children_default(children_default), features(features), thresholds(thresholds),
        values(values), node_sample_weights(node_sample_weights),
        max_depth(max_depth), tree_limit(tree_limit),
        base_offset(base_offset), max_nodes(max_nodes), num_outputs(num_outputs) {}

    // Point `tree` at the i'th tree of this ensemble (non-owning view with
    // tree_limit = 1); no data is copied.
    void get_tree(TreeEnsemble &tree, const unsigned i) const {
        const unsigned d = i * max_nodes;

        tree.children_left = children_left + d;
        tree.children_right = children_right + d;
        tree.children_default = children_default + d;
        tree.features = features + d;
        tree.thresholds = thresholds + d;
        tree.values = values + d * num_outputs;
        tree.node_sample_weights = node_sample_weights + d;
        tree.max_depth = max_depth;
        tree.tree_limit = 1;
        tree.base_offset = base_offset;
        tree.max_nodes = max_nodes;
        tree.num_outputs = num_outputs;
    }

    // A node is a leaf iff its left-child index is negative.
    bool is_leaf(unsigned pos)const {
        return children_left[pos] < 0;
    }

    // Heap-allocate all per-node arrays for the given capacity. Does NOT
    // allocate base_offset; pair every allocate() with a free().
    void allocate(unsigned tree_limit_in, unsigned max_nodes_in, unsigned num_outputs_in) {
        tree_limit = tree_limit_in;
        max_nodes = max_nodes_in;
        num_outputs = num_outputs_in;
        children_left = new int[tree_limit * max_nodes];
        children_right = new int[tree_limit * max_nodes];
        children_default = new int[tree_limit * max_nodes];
        features = new int[tree_limit * max_nodes];
        thresholds = new tfloat[tree_limit * max_nodes];
        values = new tfloat[tree_limit * max_nodes * num_outputs];
        node_sample_weights = new tfloat[tree_limit * max_nodes];
    }

    // Release the arrays created by allocate(). Only call on an ensemble
    // that owns its storage.
    void free() {
        delete[] children_left;
        delete[] children_right;
        delete[] children_default;
        delete[] features;
        delete[] thresholds;
        delete[] values;
        delete[] node_sample_weights;
    }
};
101
+
102
/**
 * Non-owning view of the data being explained.
 *
 * X is a num_X x M row-major matrix of feature values with a parallel
 * boolean missingness mask X_missing; y holds optional per-row labels.
 * R / R_missing describe an optional background dataset of num_R rows
 * (used by the interventional variants).
 */
struct ExplanationDataset {
    tfloat *X;        // foreground samples, num_X rows of M features
    bool *X_missing;  // true where the corresponding X entry is missing
    tfloat *y;        // optional labels (may be NULL)
    tfloat *R;        // optional background samples, num_R rows of M features
    bool *R_missing;  // missingness mask for R
    unsigned num_X;   // number of foreground rows
    unsigned M;       // number of features per row
    unsigned num_R;   // number of background rows

    ExplanationDataset() {}
    ExplanationDataset(tfloat *X, bool *X_missing, tfloat *y, tfloat *R, bool *R_missing, unsigned num_X,
                       unsigned M, unsigned num_R) :
        X(X), X_missing(X_missing), y(y), R(R), R_missing(R_missing), num_X(num_X), M(M), num_R(num_R) {}

    // Point `instance` at row i of X as a single-row dataset (no copy).
    // Only X / X_missing / M / num_X are set; the other fields of
    // `instance` are left untouched.
    void get_x_instance(ExplanationDataset &instance, const unsigned i) const {
        instance.M = M;
        instance.X = X + i * M;
        instance.X_missing = X_missing + i * M;
        instance.num_X = 1;
    }
};
124
+
125
+
126
// data we keep about our decision path
// note that pweight is included for convenience and is not tied with the other attributes
// the pweight of the i'th path element is the permutation weight of paths with i-1 ones in them
struct PathElement {
    int feature_index;     // feature split on at this step (-1 for the root)
    tfloat zero_fraction;  // fraction of samples that flow down this branch
    tfloat one_fraction;   // 1.0 if the explained sample follows this branch, else 0.0
    tfloat pweight;        // permutation weight (see note above)
    PathElement() {}
    PathElement(int i, tfloat z, tfloat o, tfloat w) :
        feature_index(i), zero_fraction(z), one_fraction(o), pweight(w) {}
};
138
+
139
+ inline tfloat logistic_transform(const tfloat margin, const tfloat y) {
140
+ return 1 / (1 + exp(-margin));
141
+ }
142
+
143
+ inline tfloat logistic_nlogloss_transform(const tfloat margin, const tfloat y) {
144
+ return log(1 + exp(margin)) - y * margin; // y is in {0, 1}
145
+ }
146
+
147
+ inline tfloat squared_loss_transform(const tfloat margin, const tfloat y) {
148
+ return (margin - y) * (margin - y);
149
+ }
150
+
151
// Named constants selecting the output transform applied to raw margins;
// see get_transform() for the mapping to functions.
namespace MODEL_TRANSFORM {
    const unsigned identity = 0;          // raw margin, no transform
    const unsigned logistic = 1;          // sigmoid(margin)
    const unsigned logistic_nlogloss = 2; // logistic negative log-loss vs. y
    const unsigned squared_loss = 3;      // (margin - y)^2
}
157
+
158
+ inline transform_f get_transform(unsigned model_transform) {
159
+ transform_f transform = NULL;
160
+ switch (model_transform) {
161
+ case MODEL_TRANSFORM::logistic:
162
+ transform = logistic_transform;
163
+ break;
164
+
165
+ case MODEL_TRANSFORM::logistic_nlogloss:
166
+ transform = logistic_nlogloss_transform;
167
+ break;
168
+
169
+ case MODEL_TRANSFORM::squared_loss:
170
+ transform = squared_loss_transform;
171
+ break;
172
+ }
173
+
174
+ return transform;
175
+ }
176
+
177
+ inline tfloat *tree_predict(unsigned i, const TreeEnsemble &trees, const tfloat *x, const bool *x_missing) {
178
+ const unsigned offset = i * trees.max_nodes;
179
+ unsigned node = 0;
180
+ while (true) {
181
+ const unsigned pos = offset + node;
182
+ const unsigned feature = trees.features[pos];
183
+
184
+ // we hit a leaf so return a pointer to the values
185
+ if (trees.is_leaf(pos)) {
186
+ return trees.values + pos * trees.num_outputs;
187
+ }
188
+
189
+ // otherwise we are at an internal node and need to recurse
190
+ if (x_missing[feature]) {
191
+ node = trees.children_default[pos];
192
+ } else if (x[feature] <= trees.thresholds[pos]) {
193
+ node = trees.children_left[pos];
194
+ } else {
195
+ node = trees.children_right[pos];
196
+ }
197
+ }
198
+ }
199
+
200
// Predict every row of `data` with the full ensemble, writing
// num_outputs values per row into `out` (which the caller must have
// zero-initialized — values are accumulated with +=). The optional
// model_transform is applied to the summed margins per row.
inline void dense_tree_predict(tfloat *out, const TreeEnsemble &trees, const ExplanationDataset &data, unsigned model_transform) {
    tfloat *row_out = out;
    const tfloat *x = data.X;
    const bool *x_missing = data.X_missing;

    // see what transform (if any) we have
    transform_f transform = get_transform(model_transform);

    for (unsigned i = 0; i < data.num_X; ++i) {

        // add the base offset
        for (unsigned k = 0; k < trees.num_outputs; ++k) {
            row_out[k] += trees.base_offset[k];
        }

        // add the leaf values from each tree
        for (unsigned j = 0; j < trees.tree_limit; ++j) {
            const tfloat *leaf_value = tree_predict(j, trees, x, x_missing);

            for (unsigned k = 0; k < trees.num_outputs; ++k) {
                row_out[k] += leaf_value[k];
            }
        }

        // apply any needed transform; a missing label array is treated as y = 0
        if (transform != NULL) {
            const tfloat y_i = data.y == NULL ? 0 : data.y[i];
            for (unsigned k = 0; k < trees.num_outputs; ++k) {
                row_out[k] = transform(row_out[k], y_i);
            }
        }

        // advance to the next row in X and in the output buffer
        x += data.M;
        x_missing += data.M;
        row_out += trees.num_outputs;
    }
}
237
+
238
+ inline void tree_update_weights(unsigned i, TreeEnsemble &trees, const tfloat *x, const bool *x_missing) {
239
+ const unsigned offset = i * trees.max_nodes;
240
+ unsigned node = 0;
241
+ while (true) {
242
+ const unsigned pos = offset + node;
243
+ const unsigned feature = trees.features[pos];
244
+
245
+ // Record that a sample passed through this node
246
+ trees.node_sample_weights[pos] += 1.0;
247
+
248
+ // we hit a leaf so return a pointer to the values
249
+ if (trees.children_left[pos] < 0) break;
250
+
251
+ // otherwise we are at an internal node and need to recurse
252
+ if (x_missing[feature]) {
253
+ node = trees.children_default[pos];
254
+ } else if (x[feature] <= trees.thresholds[pos]) {
255
+ node = trees.children_left[pos];
256
+ } else {
257
+ node = trees.children_right[pos];
258
+ }
259
+ }
260
+ }
261
+
262
+ inline void dense_tree_update_weights(TreeEnsemble &trees, const ExplanationDataset &data) {
263
+ const tfloat *x = data.X;
264
+ const bool *x_missing = data.X_missing;
265
+
266
+ for (unsigned i = 0; i < data.num_X; ++i) {
267
+
268
+ // add the leaf values from each tree
269
+ for (unsigned j = 0; j < trees.tree_limit; ++j) {
270
+ tree_update_weights(j, trees, x, x_missing);
271
+ }
272
+
273
+ x += data.M;
274
+ x_missing += data.M;
275
+ }
276
+ }
277
+
278
// Saabas-style attribution for a single instance on a single tree: walk the
// instance's decision path and, at every split, credit the split feature
// with the change in node value between the current node and the child
// taken. Credits are accumulated into `out`, laid out [feature][output].
// `data` must be a single-row view (see ExplanationDataset::get_x_instance).
inline void tree_saabas(tfloat *out, const TreeEnsemble &tree, const ExplanationDataset &data) {
    unsigned curr_node = 0;
    unsigned next_node = 0;
    while (true) {

        // we hit a leaf and are done
        if (tree.children_left[curr_node] < 0) return;

        // otherwise we are at an internal node and need to recurse
        const unsigned feature = tree.features[curr_node];
        if (data.X_missing[feature]) {
            next_node = tree.children_default[curr_node];
        } else if (data.X[feature] <= tree.thresholds[curr_node]) {
            next_node = tree.children_left[curr_node];
        } else {
            next_node = tree.children_right[curr_node];
        }

        // assign credit to this feature as the difference in values at the current node vs. the next node
        for (unsigned i = 0; i < tree.num_outputs; ++i) {
            out[feature * tree.num_outputs + i] += tree.values[next_node * tree.num_outputs + i] - tree.values[curr_node * tree.num_outputs + i];
        }

        curr_node = next_node;
    }
}
304
+
305
/**
 * Saabas attributions for every sample against every tree, summed across
 * trees by the linearity of the attribution.
 *
 * NOTE(review): the original comment described this as "Tree SHAP with a per
 * tree path conditional dependence assumption"; the per-tree routine called
 * here is tree_saabas (path value-difference credits), not the SHAP
 * recursion — confirm intended wording upstream.
 *
 * Output layout: one block of (M + 1) * num_outputs values per sample, the
 * extra slot holding the bias term (base_offset).
 */
inline void dense_tree_saabas(tfloat *out_contribs, const TreeEnsemble& trees, const ExplanationDataset &data) {
    tfloat *instance_out_contribs;
    TreeEnsemble tree;
    ExplanationDataset instance;

    // build explanation for each sample
    for (unsigned i = 0; i < data.num_X; ++i) {
        instance_out_contribs = out_contribs + i * (data.M + 1) * trees.num_outputs;
        data.get_x_instance(instance, i);

        // aggregate the effect of explaining each tree
        // (this works because of the linearity property of Shapley values)
        for (unsigned j = 0; j < trees.tree_limit; ++j) {
            trees.get_tree(tree, j);
            tree_saabas(instance_out_contribs, tree, instance);
        }

        // apply the base offset to the bias term
        for (unsigned j = 0; j < trees.num_outputs; ++j) {
            instance_out_contribs[data.M * trees.num_outputs + j] += trees.base_offset[j];
        }
    }
}
331
+
332
+
333
// extend our decision path with a fraction of one and zero extensions
//
// Appends a new element at unique_path[unique_depth] and updates the
// permutation weights of all earlier elements in place. The recurrence is
// order-sensitive: pweights are updated from the deepest element backwards
// so each step reads the not-yet-updated value of unique_path[i].
// (See the Tree SHAP paper referenced in the file header for the weights.)
inline void extend_path(PathElement *unique_path, unsigned unique_depth,
                        tfloat zero_fraction, tfloat one_fraction, int feature_index) {
    unique_path[unique_depth].feature_index = feature_index;
    unique_path[unique_depth].zero_fraction = zero_fraction;
    unique_path[unique_depth].one_fraction = one_fraction;
    // the first element starts with full weight; later appends start at 0
    unique_path[unique_depth].pweight = (unique_depth == 0 ? 1.0f : 0.0f);
    for (int i = unique_depth - 1; i >= 0; i--) {
        unique_path[i + 1].pweight += one_fraction * unique_path[i].pweight * (i + 1)
                                      / static_cast<tfloat>(unique_depth + 1);
        unique_path[i].pweight = zero_fraction * unique_path[i].pweight * (unique_depth - i)
                                 / static_cast<tfloat>(unique_depth + 1);
    }
}
347
+
348
// undo a previous extension of the decision path
//
// Inverts the pweight recurrence of extend_path for the element at
// path_index, then shifts the remaining elements down one slot so the path
// array stays contiguous. The one_fraction == 0 case needs a separate
// formula because the general inversion divides by one_fraction.
inline void unwind_path(PathElement *unique_path, unsigned unique_depth, unsigned path_index) {
    const tfloat one_fraction = unique_path[path_index].one_fraction;
    const tfloat zero_fraction = unique_path[path_index].zero_fraction;
    tfloat next_one_portion = unique_path[unique_depth].pweight;

    for (int i = unique_depth - 1; i >= 0; --i) {
        if (one_fraction != 0) {
            const tfloat tmp = unique_path[i].pweight;
            unique_path[i].pweight = next_one_portion * (unique_depth + 1)
                                     / static_cast<tfloat>((i + 1) * one_fraction);
            next_one_portion = tmp - unique_path[i].pweight * zero_fraction * (unique_depth - i)
                               / static_cast<tfloat>(unique_depth + 1);
        } else {
            unique_path[i].pweight = (unique_path[i].pweight * (unique_depth + 1))
                                     / static_cast<tfloat>(zero_fraction * (unique_depth - i));
        }
    }

    // close the gap left by the removed element
    for (unsigned i = path_index; i < unique_depth; ++i) {
        unique_path[i].feature_index = unique_path[i+1].feature_index;
        unique_path[i].zero_fraction = unique_path[i+1].zero_fraction;
        unique_path[i].one_fraction = unique_path[i+1].one_fraction;
    }
}
373
+
374
// determine what the total permutation weight would be if
// we unwound a previous extension in the decision path
//
// Like unwind_path, but read-only: accumulates the total weight without
// mutating the path. The one_fraction == 0 branch mirrors the special
// case in unwind_path (the general formula would divide by zero).
inline tfloat unwound_path_sum(const PathElement *unique_path, unsigned unique_depth,
                               unsigned path_index) {
    const tfloat one_fraction = unique_path[path_index].one_fraction;
    const tfloat zero_fraction = unique_path[path_index].zero_fraction;
    tfloat next_one_portion = unique_path[unique_depth].pweight;
    tfloat total = 0;

    if (one_fraction != 0) {
        for (int i = unique_depth - 1; i >= 0; --i) {
            const tfloat tmp = next_one_portion / static_cast<tfloat>((i + 1) * one_fraction);
            total += tmp;
            next_one_portion = unique_path[i].pweight - tmp * zero_fraction * (unique_depth - i);
        }
    } else {
        for (int i = unique_depth - 1; i >= 0; --i) {
            total += unique_path[i].pweight / (zero_fraction * (unique_depth - i));
        }
    }
    return total * (unique_depth + 1);
}
396
+
397
// recursive computation of SHAP values for a decision tree
//
// Implements the polynomial-time Tree SHAP recursion: a path of unique
// features is extended on the way down the tree and, at each leaf, the
// permutation weights accumulated along the path are unwound one feature at a
// time to attribute the leaf value to every feature on the path. phi is an
// accumulator with one slot per feature (plus a bias slot) per output.
// condition/condition_feature optionally hold one feature "on" (condition > 0)
// or "off" (condition < 0) for conditional explanations.
inline void tree_shap_recursive(const unsigned num_outputs, const int *children_left,
                                const int *children_right,
                                const int *children_default, const int *features,
                                const tfloat *thresholds, const tfloat *values,
                                const tfloat *node_sample_weight,
                                const tfloat *x, const bool *x_missing, tfloat *phi,
                                unsigned node_index, unsigned unique_depth,
                                PathElement *parent_unique_path, tfloat parent_zero_fraction,
                                tfloat parent_one_fraction, int parent_feature_index,
                                int condition, unsigned condition_feature,
                                tfloat condition_fraction) {

    // stop if we have no weight coming down to us
    if (condition_fraction == 0) return;

    // extend the unique path (each recursion depth owns its own copy of the
    // path inside the triangular buffer allocated by tree_shap)
    PathElement *unique_path = parent_unique_path + unique_depth + 1;
    std::copy(parent_unique_path, parent_unique_path + unique_depth + 1, unique_path);

    if (condition == 0 || condition_feature != static_cast<unsigned>(parent_feature_index)) {
        extend_path(unique_path, unique_depth, parent_zero_fraction,
                    parent_one_fraction, parent_feature_index);
    }
    const unsigned split_index = features[node_index];

    // leaf node
    if (children_right[node_index] < 0) {
        for (unsigned i = 1; i <= unique_depth; ++i) {
            const tfloat w = unwound_path_sum(unique_path, unique_depth, i);
            const PathElement &el = unique_path[i];
            const unsigned phi_offset = el.feature_index * num_outputs;
            const unsigned values_offset = node_index * num_outputs;
            const tfloat scale = w * (el.one_fraction - el.zero_fraction) * condition_fraction;
            for (unsigned j = 0; j < num_outputs; ++j) {
                phi[phi_offset + j] += scale * values[values_offset + j];
            }
        }

    // internal node
    } else {
        // find which branch is "hot" (meaning x would follow it)
        unsigned hot_index = 0;
        if (x_missing[split_index]) {
            hot_index = children_default[node_index];
        } else if (x[split_index] <= thresholds[node_index]) {
            hot_index = children_left[node_index];
        } else {
            hot_index = children_right[node_index];
        }
        const unsigned cold_index = (static_cast<int>(hot_index) == children_left[node_index] ?
                                     children_right[node_index] : children_left[node_index]);
        const tfloat w = node_sample_weight[node_index];
        const tfloat hot_zero_fraction = node_sample_weight[hot_index] / w;
        const tfloat cold_zero_fraction = node_sample_weight[cold_index] / w;
        tfloat incoming_zero_fraction = 1;
        tfloat incoming_one_fraction = 1;

        // see if we have already split on this feature,
        // if so we undo that split so we can redo it for this node
        unsigned path_index = 0;
        for (; path_index <= unique_depth; ++path_index) {
            if (static_cast<unsigned>(unique_path[path_index].feature_index) == split_index) break;
        }
        if (path_index != unique_depth + 1) {
            incoming_zero_fraction = unique_path[path_index].zero_fraction;
            incoming_one_fraction = unique_path[path_index].one_fraction;
            unwind_path(unique_path, unique_depth, path_index);
            unique_depth -= 1;
        }

        // divide up the condition_fraction among the recursive calls
        tfloat hot_condition_fraction = condition_fraction;
        tfloat cold_condition_fraction = condition_fraction;
        if (condition > 0 && split_index == condition_feature) {
            // conditioned "on": the cold branch gets no weight and the
            // conditioned feature is excluded from the path
            cold_condition_fraction = 0;
            unique_depth -= 1;
        } else if (condition < 0 && split_index == condition_feature) {
            // conditioned "off": weight both branches by their cover fractions
            hot_condition_fraction *= hot_zero_fraction;
            cold_condition_fraction *= cold_zero_fraction;
            unique_depth -= 1;
        }

        tree_shap_recursive(
            num_outputs, children_left, children_right, children_default, features, thresholds, values,
            node_sample_weight, x, x_missing, phi, hot_index, unique_depth + 1, unique_path,
            hot_zero_fraction * incoming_zero_fraction, incoming_one_fraction,
            split_index, condition, condition_feature, hot_condition_fraction
        );

        tree_shap_recursive(
            num_outputs, children_left, children_right, children_default, features, thresholds, values,
            node_sample_weight, x, x_missing, phi, cold_index, unique_depth + 1, unique_path,
            cold_zero_fraction * incoming_zero_fraction, 0,
            split_index, condition, condition_feature, cold_condition_fraction
        );
    }
}
495
+
496
+ inline int compute_expectations(TreeEnsemble &tree, int i = 0, int depth = 0) {
497
+ unsigned max_depth = 0;
498
+
499
+ if (tree.children_right[i] >= 0) {
500
+ const unsigned li = tree.children_left[i];
501
+ const unsigned ri = tree.children_right[i];
502
+ const unsigned depth_left = compute_expectations(tree, li, depth + 1);
503
+ const unsigned depth_right = compute_expectations(tree, ri, depth + 1);
504
+ const tfloat left_weight = tree.node_sample_weights[li];
505
+ const tfloat right_weight = tree.node_sample_weights[ri];
506
+ const unsigned li_offset = li * tree.num_outputs;
507
+ const unsigned ri_offset = ri * tree.num_outputs;
508
+ const unsigned i_offset = i * tree.num_outputs;
509
+ for (unsigned j = 0; j < tree.num_outputs; ++j) {
510
+ if ((left_weight == 0) && (right_weight == 0)) {
511
+ tree.values[i_offset + j] = 0.0;
512
+ } else {
513
+ const tfloat v = (left_weight * tree.values[li_offset + j] + right_weight * tree.values[ri_offset + j]) / (left_weight + right_weight);
514
+ tree.values[i_offset + j] = v;
515
+ }
516
+ }
517
+ max_depth = std::max(depth_left, depth_right) + 1;
518
+ }
519
+
520
+ if (depth == 0) tree.max_depth = max_depth;
521
+
522
+ return max_depth;
523
+ }
524
+
525
+ inline void tree_shap(const TreeEnsemble& tree, const ExplanationDataset &data,
526
+ tfloat *out_contribs, int condition, unsigned condition_feature) {
527
+
528
+ // update the reference value with the expected value of the tree's predictions
529
+ if (condition == 0) {
530
+ for (unsigned j = 0; j < tree.num_outputs; ++j) {
531
+ out_contribs[data.M * tree.num_outputs + j] += tree.values[j];
532
+ }
533
+ }
534
+
535
+ // Pre-allocate space for the unique path data
536
+ const unsigned maxd = tree.max_depth + 2; // need a bit more space than the max depth
537
+ PathElement *unique_path_data = new PathElement[(maxd * (maxd + 1)) / 2];
538
+
539
+ tree_shap_recursive(
540
+ tree.num_outputs, tree.children_left, tree.children_right, tree.children_default,
541
+ tree.features, tree.thresholds, tree.values, tree.node_sample_weights, data.X,
542
+ data.X_missing, out_contribs, 0, 0, unique_path_data, 1, 1, -1, condition,
543
+ condition_feature, 1
544
+ );
545
+
546
+ delete[] unique_path_data;
547
+ }
548
+
549
+
550
// Recursively build one merged tree equivalent to running every tree of the
// ensemble in sequence, restricted to the data rows listed in data_inds.
// Rows are partitioned in place at each split; negative entries in data_inds
// mark rows that are not counted as background samples (see build_merged_tree).
// Leaf values accumulate across trees via leaf_value. Returns the index of the
// last node written so the caller can place subsequent nodes after the subtree.
inline unsigned build_merged_tree_recursive(TreeEnsemble &out_tree, const TreeEnsemble &trees,
                                            const tfloat *data, const bool *data_missing, int *data_inds,
                                            const unsigned num_background_data_inds, unsigned num_data_inds,
                                            unsigned M, unsigned row = 0, unsigned i = 0, unsigned pos = 0,
                                            tfloat *leaf_value = NULL) {
    //tfloat new_leaf_value[trees.num_outputs];
    // NOTE(review): alloca grows the stack on every recursion level; assumes
    // tree depth times num_outputs stays small — confirm for deep ensembles
    tfloat *new_leaf_value = (tfloat *) alloca(sizeof(tfloat) * trees.num_outputs); // allocate on the stack
    unsigned row_offset = row * trees.max_nodes;

    // we have hit a terminal leaf!!! (leaf of the last tree in the ensemble)
    if (trees.children_left[row_offset + i] < 0 && row + 1 == trees.tree_limit) {

        // create the leaf node
        const tfloat *vals = trees.values + (row * trees.max_nodes + i) * trees.num_outputs;
        if (leaf_value == NULL) {
            for (unsigned j = 0; j < trees.num_outputs; ++j) {
                out_tree.values[pos * trees.num_outputs + j] = vals[j];
            }
        } else {
            for (unsigned j = 0; j < trees.num_outputs; ++j) {
                out_tree.values[pos * trees.num_outputs + j] = leaf_value[j] + vals[j];
            }
        }
        out_tree.children_left[pos] = -1;
        out_tree.children_right[pos] = -1;
        out_tree.children_default[pos] = -1;
        out_tree.features[pos] = -1;
        out_tree.thresholds[pos] = 0;
        out_tree.node_sample_weights[pos] = num_background_data_inds;

        return pos;
    }

    // we hit an intermediate leaf (so just add the value to our accumulator and move to the next tree)
    if (trees.children_left[row_offset + i] < 0) {

        // accumulate the value of this original leaf so it will land on all eventual terminal leaves
        const tfloat *vals = trees.values + (row * trees.max_nodes + i) * trees.num_outputs;
        if (leaf_value == NULL) {
            for (unsigned j = 0; j < trees.num_outputs; ++j) {
                new_leaf_value[j] = vals[j];
            }
        } else {
            for (unsigned j = 0; j < trees.num_outputs; ++j) {
                new_leaf_value[j] = leaf_value[j] + vals[j];
            }
        }
        leaf_value = new_leaf_value;

        // move forward to the next tree
        row += 1;
        row_offset += trees.max_nodes;
        i = 0;
    }

    // split the data inds by this node's threshold
    // (Hoare-style in-place partition: rows going left end up at the front)
    const tfloat t = trees.thresholds[row_offset + i];
    const int f = trees.features[row_offset + i];
    const bool right_default = trees.children_default[row_offset + i] == trees.children_right[row_offset + i];
    int low_ptr = 0;
    int high_ptr = num_data_inds - 1;
    unsigned num_left_background_data_inds = 0;
    int low_data_ind;
    while (low_ptr <= high_ptr) {
        low_data_ind = data_inds[low_ptr];
        const int data_ind = std::abs(low_data_ind) * M + f;
        const bool is_missing = data_missing[data_ind];
        if ((!is_missing && data[data_ind] > t) || (right_default && is_missing)) {
            data_inds[low_ptr] = data_inds[high_ptr];
            data_inds[high_ptr] = low_data_ind;
            high_ptr -= 1;
        } else {
            if (low_data_ind >= 0) ++num_left_background_data_inds; // negative data_inds are not background samples
            low_ptr += 1;
        }
    }
    int *left_data_inds = data_inds;
    const unsigned num_left_data_inds = low_ptr;
    int *right_data_inds = data_inds + low_ptr;
    const unsigned num_right_data_inds = num_data_inds - num_left_data_inds;
    const unsigned num_right_background_data_inds = num_background_data_inds - num_left_background_data_inds;

    // all the data went right, so we skip creating this node and just recurse right
    if (num_left_data_inds == 0) {
        return build_merged_tree_recursive(
            out_tree, trees, data, data_missing, data_inds,
            num_background_data_inds, num_data_inds, M, row,
            trees.children_right[row_offset + i], pos, leaf_value
        );

    // all the data went left, so we skip creating this node and just recurse left
    } else if (num_right_data_inds == 0) {
        return build_merged_tree_recursive(
            out_tree, trees, data, data_missing, data_inds,
            num_background_data_inds, num_data_inds, M, row,
            trees.children_left[row_offset + i], pos, leaf_value
        );

    // data went both ways so we create this node and recurse down both paths
    } else {

        // build the left subtree
        const unsigned new_pos = build_merged_tree_recursive(
            out_tree, trees, data, data_missing, left_data_inds,
            num_left_background_data_inds, num_left_data_inds, M, row,
            trees.children_left[row_offset + i], pos + 1, leaf_value
        );

        // fill in the data for this node
        out_tree.children_left[pos] = pos + 1;
        out_tree.children_right[pos] = new_pos + 1;
        if (trees.children_left[row_offset + i] == trees.children_default[row_offset + i]) {
            out_tree.children_default[pos] = pos + 1;
        } else {
            out_tree.children_default[pos] = new_pos + 1;
        }

        out_tree.features[pos] = trees.features[row_offset + i];
        out_tree.thresholds[pos] = trees.thresholds[row_offset + i];
        out_tree.node_sample_weights[pos] = num_background_data_inds;

        // build the right subtree
        return build_merged_tree_recursive(
            out_tree, trees, data, data_missing, right_data_inds,
            num_right_background_data_inds, num_right_data_inds, M, row,
            trees.children_right[row_offset + i], new_pos + 1, leaf_value
        );
    }
}
679
+
680
+
681
+ inline void build_merged_tree(TreeEnsemble &out_tree, const ExplanationDataset &data, const TreeEnsemble &trees) {
682
+
683
+ // create a joint data matrix from both X and R matrices
684
+ tfloat *joined_data = new tfloat[(data.num_X + data.num_R) * data.M];
685
+ std::copy(data.X, data.X + data.num_X * data.M, joined_data);
686
+ std::copy(data.R, data.R + data.num_R * data.M, joined_data + data.num_X * data.M);
687
+ bool *joined_data_missing = new bool[(data.num_X + data.num_R) * data.M];
688
+ std::copy(data.X_missing, data.X_missing + data.num_X * data.M, joined_data_missing);
689
+ std::copy(data.R_missing, data.R_missing + data.num_R * data.M, joined_data_missing + data.num_X * data.M);
690
+
691
+ // create an starting array of data indexes we will recursively sort
692
+ int *data_inds = new int[data.num_X + data.num_R];
693
+ for (unsigned i = 0; i < data.num_X; ++i) data_inds[i] = i;
694
+ for (unsigned i = data.num_X; i < data.num_X + data.num_R; ++i) {
695
+ data_inds[i] = -i; // a negative index means it won't be recorded as a background sample
696
+ }
697
+
698
+ build_merged_tree_recursive(
699
+ out_tree, trees, joined_data, joined_data_missing, data_inds, data.num_R,
700
+ data.num_X + data.num_R, data.M
701
+ );
702
+
703
+ delete[] joined_data;
704
+ delete[] joined_data_missing;
705
+ delete[] data_inds;
706
+ }
707
+
708
+
709
+ // Independent Tree SHAP functions below here
710
+ // ------------------------------------------
711
// Compact per-node record used by the independence-assuming Tree SHAP code
// below (tree_shap_indep / dense_independent).
struct Node {
    // cl/cr/cd: left/right/default child indices; pnode: parent node index;
    // feat: this node's split feature; pfeat: the parent's split feature.
    // NOTE(review): shorts cap node/feature indices at 32767 — confirm trees
    // stay below that size
    short cl, cr, cd, pnode, feat, pfeat; // uint_16
    float thres, value; // split threshold and (single-output) node value
    char from_flag;     // FROM_NEITHER / FROM_X_NOT_R / FROM_R_NOT_X marker
};
716
+
717
+ #define FROM_NEITHER 0
718
+ #define FROM_X_NOT_R 1
719
+ #define FROM_R_NOT_X 2
720
+
721
// https://www.geeksforgeeks.org/space-and-time-efficient-binomial-coefficient/
// Binomial coefficient C(n, k) using the multiplicative formula; each partial
// product is exactly divisible by i, so integer division is safe. Returns 1
// when k <= 0 (including the bin_coeff(n-1, m) calls with n == 0 used below).
inline int bin_coeff(int n, int k) {
    // exploit symmetry C(n, k) == C(n, n-k) to shorten the loop
    if (k > n - k) {
        k = n - k;
    }
    int res = 1;
    for (int i = 1; i <= k; ++i) {
        res *= (n - (i - 1));
        res /= i;
    }
    return res;
}
732
+
733
// note this only handles single output models, so multi-output models get explained using multiple passes
//
// Iterative (explicit-stack) Independent Tree SHAP for one tree, one
// foreground sample x and one background reference r. Contributions are
// accumulated into out_contribs, which the caller must zero-initialize for the
// feature slots; slot num_feats receives the bias. pos_lst / neg_lst /
// feat_hist / node_stack are caller-provided scratch buffers sized per
// dense_independent; memoized_weights holds precomputed Shapley permutation
// weights indexed by [N + max_depth * M], where N counts features split
// differently by x and r on the current path and M counts those where the
// path followed x. num_nodes is currently unused.
inline void tree_shap_indep(const unsigned max_depth, const unsigned num_feats,
                            const unsigned num_nodes, const tfloat *x,
                            const bool *x_missing, const tfloat *r,
                            const bool *r_missing, tfloat *out_contribs,
                            float *pos_lst, float *neg_lst, signed short *feat_hist,
                            float *memoized_weights, int *node_stack, Node *mytree) {
    int ns_ctr = 0;
    std::fill_n(feat_hist, num_feats, 0);
    short node = 0, feat, cl, cr, cd, pnode, pfeat = -1;
    short next_xnode = -1, next_rnode = -1;
    short next_node = -1, from_child = -1;
    float thres, pos_x = 0, neg_x = 0, pos_r = 0, neg_r = 0;
    char from_flag;
    unsigned M = 0, N = 0;

    Node curr_node = mytree[node];
    feat = curr_node.feat;
    thres = curr_node.thres;
    cl = curr_node.cl;
    cr = curr_node.cr;
    cd = curr_node.cd;

    // short circuit when this is a stump tree (with no splits)
    if (cl < 0) {
        out_contribs[num_feats] += curr_node.value;
        return;
    }

    // route x through the root split (default child when missing)
    if (x_missing[feat]) {
        next_xnode = cd;
    } else if (x[feat] > thres) {
        next_xnode = cr;
    } else if (x[feat] <= thres) {
        next_xnode = cl;
    }

    // route r through the root split
    if (r_missing[feat]) {
        next_rnode = cd;
    } else if (r[feat] > thres) {
        next_rnode = cr;
    } else if (r[feat] <= thres) {
        next_rnode = cl;
    }

    // tag children with which sample(s) would reach them
    if (next_xnode != next_rnode) {
        mytree[next_xnode].from_flag = FROM_X_NOT_R;
        mytree[next_rnode].from_flag = FROM_R_NOT_X;
    } else {
        mytree[next_xnode].from_flag = FROM_NEITHER;
    }

    // Check if x and r go the same way
    if (next_xnode == next_rnode) {
        next_node = next_xnode;
    }

    // If not, go left (and record whether the left branch is the x- or r-path)
    if (next_node < 0) {
        next_node = cl;
        if (next_rnode == next_node) { // rpath
            N = N+1;
            feat_hist[feat] -= 1;
        } else if (next_xnode == next_node) { // xpath
            M = M+1;
            N = N+1;
            feat_hist[feat] += 1;
        }
    }
    node_stack[ns_ctr] = node;
    ns_ctr += 1;
    while (true) {
        node = next_node;
        curr_node = mytree[node];
        feat = curr_node.feat;
        thres = curr_node.thres;
        cl = curr_node.cl;
        cr = curr_node.cr;
        cd = curr_node.cd;
        pnode = curr_node.pnode;
        pfeat = curr_node.pfeat;
        from_flag = curr_node.from_flag;

        // At a leaf
        if (cl < 0) {
            // M == 0 means the whole path followed r, so the leaf value goes
            // straight to the bias term
            if (M == 0) {
                out_contribs[num_feats] += mytree[node].value;
            }

            // Currently assuming a single output
            if (N != 0) {
                if (M != 0) {
                    pos_lst[node] = mytree[node].value * memoized_weights[N + max_depth * (M-1)];
                }
                if (M != N) {
                    neg_lst[node] = -mytree[node].value * memoized_weights[N + max_depth * M];
                }
            }
            // Pop from node_stack
            ns_ctr -= 1;
            next_node = node_stack[ns_ctr];
            from_child = node;
            // Unwind the feature-history count for the parent's split feature
            if (feat_hist[pfeat] > 0) {
                feat_hist[pfeat] -= 1;
            } else if (feat_hist[pfeat] < 0) {
                feat_hist[pfeat] += 1;
            }
            if (feat_hist[pfeat] == 0) {
                if (from_flag == FROM_X_NOT_R) {
                    N = N-1;
                    M = M-1;
                } else if (from_flag == FROM_R_NOT_X) {
                    N = N-1;
                }
            }
            continue;
        }

        const bool x_right = x[feat] > thres;
        const bool r_right = r[feat] > thres;

        if (x_missing[feat]) {
            next_xnode = cd;
        } else if (x_right) {
            next_xnode = cr;
        } else if (!x_right) {
            next_xnode = cl;
        }

        if (r_missing[feat]) {
            next_rnode = cd;
        } else if (r_right) {
            next_rnode = cr;
        } else if (!r_right) {
            next_rnode = cl;
        }

        if (next_xnode >= 0) {
            if (next_xnode != next_rnode) {
                mytree[next_xnode].from_flag = FROM_X_NOT_R;
                mytree[next_rnode].from_flag = FROM_R_NOT_X;
            } else {
                mytree[next_xnode].from_flag = FROM_NEITHER;
            }
        }

        // Arriving at node from parent
        if (from_child == -1) {
            node_stack[ns_ctr] = node;
            ns_ctr += 1;
            next_node = -1;

            // Feature is set upstream: stay on whichever path set it
            if (feat_hist[feat] > 0) {
                next_node = next_xnode;
                feat_hist[feat] += 1;
            } else if (feat_hist[feat] < 0) {
                next_node = next_rnode;
                feat_hist[feat] -= 1;
            }

            // x and r go the same way
            if (next_node < 0) {
                if (next_xnode == next_rnode) {
                    next_node = next_xnode;
                }
            }

            // Go down one path
            if (next_node >= 0) {
                continue;
            }

            // Go down both paths, but go left first
            next_node = cl;
            if (next_rnode == next_node) {
                N = N+1;
                feat_hist[feat] -= 1;
            } else if (next_xnode == next_node) {
                M = M+1;
                N = N+1;
                feat_hist[feat] += 1;
            }
            from_child = -1;
            continue;
        }

        // Arriving at node from child
        if (from_child != -1) {
            next_node = -1;
            // Check if we should unroll immediately
            if ((next_rnode == next_xnode) || (feat_hist[feat] != 0)) {
                next_node = pnode;
            }

            // Came from a single path, so unroll
            if (next_node >= 0) {
                // At the root node
                if (node == 0) {
                    break;
                }
                // Update and unroll: propagate the child's partial sums up
                pos_lst[node] = pos_lst[from_child];
                neg_lst[node] = neg_lst[from_child];

                from_child = node;
                ns_ctr -= 1;

                // Unwind
                if (feat_hist[pfeat] > 0) {
                    feat_hist[pfeat] -= 1;
                } else if (feat_hist[pfeat] < 0) {
                    feat_hist[pfeat] += 1;
                }
                if (feat_hist[pfeat] == 0) {
                    if (from_flag == FROM_X_NOT_R) {
                        N = N-1;
                        M = M-1;
                    } else if (from_flag == FROM_R_NOT_X) {
                        N = N-1;
                    }
                }
                continue;
            // Go right - Arriving from the left child
            } else if (from_child == cl) {
                node_stack[ns_ctr] = node;
                ns_ctr += 1;
                next_node = cr;
                if (next_xnode == next_node) {
                    M = M+1;
                    N = N+1;
                    feat_hist[feat] += 1;
                } else if (next_rnode == next_node) {
                    N = N+1;
                    feat_hist[feat] -= 1;
                }
                from_child = -1;
                continue;
            // Compute stuff and unroll - Arriving from the right child
            } else if (from_child == cr) {
                pos_x = 0;
                neg_x = 0;
                pos_r = 0;
                neg_r = 0;
                // pick up the partial sums of whichever child is on the
                // x-path vs the r-path
                if ((next_xnode == cr) && (next_rnode == cl)) {
                    pos_x = pos_lst[cr];
                    neg_x = neg_lst[cr];
                    pos_r = pos_lst[cl];
                    neg_r = neg_lst[cl];
                } else if ((next_xnode == cl) && (next_rnode == cr)) {
                    pos_x = pos_lst[cl];
                    neg_x = neg_lst[cl];
                    pos_r = pos_lst[cr];
                    neg_r = neg_lst[cr];
                }
                // out_contribs needs to have been initialized as all zeros
                out_contribs[feat] += pos_x + neg_r;
                pos_lst[node] = pos_x + pos_r;
                neg_lst[node] = neg_x + neg_r;

                // Check if at root
                if (node == 0) {
                    break;
                }

                // Pop
                ns_ctr -= 1;
                next_node = node_stack[ns_ctr];
                from_child = node;

                // Unwind
                if (feat_hist[pfeat] > 0) {
                    feat_hist[pfeat] -= 1;
                } else if (feat_hist[pfeat] < 0) {
                    feat_hist[pfeat] += 1;
                }
                if (feat_hist[pfeat] == 0) {
                    if (from_flag == FROM_X_NOT_R) {
                        N = N-1;
                        M = M-1;
                    } else if (from_flag == FROM_R_NOT_X) {
                        N = N-1;
                    }
                }
                continue;
            }
        }
    }
}
1089
+
1090
+
1091
// Write a tqdm-style progress bar to Python's sys.stderr. Stays silent for the
// first 10 seconds of the computation and then refreshes at most every half
// second; last_print carries the timestamp of the previous refresh between
// calls. i/total_count give the current progress fraction.
// NOTE(review): the "=" fill and padding string literals may have been
// whitespace-mangled in this copy — confirm against upstream before editing.
inline void print_progress_bar(tfloat &last_print, tfloat start_time, unsigned i, unsigned total_count) {
    const tfloat elapsed_seconds = difftime(time(NULL), start_time);

    if (elapsed_seconds > 10 && elapsed_seconds - last_print > 0.5) {
        const tfloat fraction = static_cast<tfloat>(i) / total_count;
        // extrapolate total runtime from the completed fraction for the ETA
        const double total_seconds = elapsed_seconds / fraction;
        last_print = elapsed_seconds;

        PySys_WriteStderr(
            "\r%3.0f%%|%.*s%.*s| %d/%d [%02d:%02d<%02d:%02d] ",
            fraction * 100, int(0.5 + fraction*20), "===================",
            20-int(0.5 + fraction*20), " ",
            i, total_count,
            int(elapsed_seconds/60), int(elapsed_seconds) % 60,
            int((total_seconds - elapsed_seconds)/60), int(total_seconds - elapsed_seconds) % 60
        );

        // Get handle to python stderr file and flush it (https://mail.python.org/pipermail/python-list/2004-November/294912.html)
        PyObject *pyStderr = PySys_GetObject("stderr");
        if (pyStderr) {
            PyObject *result = PyObject_CallMethod(pyStderr, "flush", NULL);
            Py_XDECREF(result);
        }
    }
}
1116
+
1117
/**
 * Runs Tree SHAP with feature independence assumptions on dense data.
 *
 * For every output, every foreground sample x and every background reference
 * r, tree_shap_indep is run over all trees; the per-reference contributions
 * are optionally rescaled through `transform` (to explain a transformed
 * margin, e.g. a loss) and then averaged over the references into
 * out_contribs, which is laid out as [num_X, M + 1, num_outputs].
 */
inline void dense_independent(const TreeEnsemble& trees, const ExplanationDataset &data,
                              tfloat *out_contribs, tfloat transform(const tfloat, const tfloat)) {

    // reformat the trees for faster access (parent links are needed by the
    // iterative traversal in tree_shap_indep)
    Node *node_trees = new Node[trees.tree_limit * trees.max_nodes];
    for (unsigned i = 0; i < trees.tree_limit; ++i) {
        Node *node_tree = node_trees + i * trees.max_nodes;
        for (unsigned j = 0; j < trees.max_nodes; ++j) {
            const unsigned en_ind = i * trees.max_nodes + j;
            node_tree[j].cl = trees.children_left[en_ind];
            node_tree[j].cr = trees.children_right[en_ind];
            node_tree[j].cd = trees.children_default[en_ind];
            if (j == 0) {
                node_tree[j].pnode = 0;
            }
            if (trees.children_left[en_ind] >= 0) { // relies on all unused entries having negative values in them
                node_tree[trees.children_left[en_ind]].pnode = j;
                node_tree[trees.children_left[en_ind]].pfeat = trees.features[en_ind];
            }
            if (trees.children_right[en_ind] >= 0) { // relies on all unused entries having negative values in them
                node_tree[trees.children_right[en_ind]].pnode = j;
                node_tree[trees.children_right[en_ind]].pfeat = trees.features[en_ind];
            }

            node_tree[j].thres = trees.thresholds[en_ind];
            node_tree[j].feat = trees.features[en_ind];
        }
    }

    // preallocate arrays needed by the algorithm
    float *pos_lst = new float[trees.max_nodes];
    float *neg_lst = new float[trees.max_nodes];
    int *node_stack = new int[(unsigned) trees.max_depth];
    signed short *feat_hist = new signed short[data.M];
    tfloat *tmp_out_contribs = new tfloat[(data.M + 1)];

    // precompute all the weight coefficients
    // NOTE(review): the n == 0 row divides by zero (inf), but those entries
    // are only read with N >= 1 in tree_shap_indep — confirm
    float *memoized_weights = new float[(trees.max_depth+1) * (trees.max_depth+1)];
    for (unsigned n = 0; n <= trees.max_depth; ++n) {
        for (unsigned m = 0; m <= trees.max_depth; ++m) {
            memoized_weights[n + trees.max_depth * m] = 1.0 / (n * bin_coeff(n-1, m));
        }
    }

    // compute the explanations for each sample
    tfloat *instance_out_contribs;
    tfloat rescale_factor = 1.0;
    tfloat margin_x = 0;
    tfloat margin_r = 0;
    time_t start_time = time(NULL);
    tfloat last_print = 0;
    for (unsigned oind = 0; oind < trees.num_outputs; ++oind) {
        // set the values in the reformatted tree to the current output index
        for (unsigned i = 0; i < trees.tree_limit; ++i) {
            Node *node_tree = node_trees + i * trees.max_nodes;
            for (unsigned j = 0; j < trees.max_nodes; ++j) {
                const unsigned en_ind = i * trees.max_nodes + j;
                node_tree[j].value = trees.values[en_ind * trees.num_outputs + oind];
            }
        }

        // loop over all the samples
        for (unsigned i = 0; i < data.num_X; ++i) {
            const tfloat *x = data.X + i * data.M;
            const bool *x_missing = data.X_missing + i * data.M;
            instance_out_contribs = out_contribs + i * (data.M + 1) * trees.num_outputs;
            const tfloat y_i = data.y == NULL ? 0 : data.y[i];

            print_progress_bar(last_print, start_time, oind * data.num_X + i, data.num_X * trees.num_outputs);

            // compute the model's margin output for x
            if (transform != NULL) {
                margin_x = trees.base_offset[oind];
                for (unsigned k = 0; k < trees.tree_limit; ++k) {
                    margin_x += tree_predict(k, trees, x, x_missing)[oind];
                }
            }

            for (unsigned j = 0; j < data.num_R; ++j) {
                const tfloat *r = data.R + j * data.M;
                const bool *r_missing = data.R_missing + j * data.M;
                std::fill_n(tmp_out_contribs, (data.M + 1), 0);

                // compute the model's margin output for r
                if (transform != NULL) {
                    margin_r = trees.base_offset[oind];
                    for (unsigned k = 0; k < trees.tree_limit; ++k) {
                        margin_r += tree_predict(k, trees, r, r_missing)[oind];
                    }
                }

                // explain this (x, r) pair with every tree in the ensemble
                for (unsigned k = 0; k < trees.tree_limit; ++k) {
                    tree_shap_indep(
                        trees.max_depth, data.M, trees.max_nodes, x, x_missing, r, r_missing,
                        tmp_out_contribs, pos_lst, neg_lst, feat_hist, memoized_weights,
                        node_stack, node_trees + k * trees.max_nodes
                    );
                }

                // compute the rescale factor (finite-difference slope of the
                // transform between the two margins)
                if (transform != NULL) {
                    if (margin_x == margin_r) {
                        rescale_factor = 1.0;
                    } else {
                        rescale_factor = (*transform)(margin_x, y_i) - (*transform)(margin_r, y_i);
                        rescale_factor /= margin_x - margin_r;
                    }
                }

                // add the effect of the current reference to our running total
                // this is where we can do per reference scaling for non-linear transformations
                for (unsigned k = 0; k < data.M; ++k) {
                    instance_out_contribs[k * trees.num_outputs + oind] += tmp_out_contribs[k] * rescale_factor;
                }

                // Add the base offset
                if (transform != NULL) {
                    instance_out_contribs[data.M * trees.num_outputs + oind] += (*transform)(trees.base_offset[oind] + tmp_out_contribs[data.M], 0);
                } else {
                    instance_out_contribs[data.M * trees.num_outputs + oind] += trees.base_offset[oind] + tmp_out_contribs[data.M];
                }
            }

            // average the results over all the references.
            for (unsigned j = 0; j < (data.M + 1); ++j) {
                instance_out_contribs[j * trees.num_outputs + oind] /= data.num_R;
            }

            // apply the base offset to the bias term
            // for (unsigned j = 0; j < trees.num_outputs; ++j) {
            //     instance_out_contribs[data.M * trees.num_outputs + j] += (*transform)(trees.base_offset[j], 0);
            // }
        }
    }

    delete[] tmp_out_contribs;
    delete[] node_trees;
    delete[] pos_lst;
    delete[] neg_lst;
    delete[] node_stack;
    delete[] feat_hist;
    delete[] memoized_weights;
}
1263
+
1264
+
1265
+ /**
1266
+ * This runs Tree SHAP with a per tree path conditional dependence assumption.
1267
+ */
1268
+ inline void dense_tree_path_dependent(const TreeEnsemble& trees, const ExplanationDataset &data,
1269
+ tfloat *out_contribs, tfloat transform(const tfloat, const tfloat)) {
1270
+ tfloat *instance_out_contribs;
1271
+ TreeEnsemble tree;
1272
+ ExplanationDataset instance;
1273
+
1274
+ // build explanation for each sample
1275
+ for (unsigned i = 0; i < data.num_X; ++i) {
1276
+ instance_out_contribs = out_contribs + i * (data.M + 1) * trees.num_outputs;
1277
+ data.get_x_instance(instance, i);
1278
+
1279
+ // aggregate the effect of explaining each tree
1280
+ // (this works because of the linearity property of Shapley values)
1281
+ for (unsigned j = 0; j < trees.tree_limit; ++j) {
1282
+ trees.get_tree(tree, j);
1283
+ tree_shap(tree, instance, instance_out_contribs, 0, 0);
1284
+ }
1285
+
1286
+ // apply the base offset to the bias term
1287
+ for (unsigned j = 0; j < trees.num_outputs; ++j) {
1288
+ instance_out_contribs[data.M * trees.num_outputs + j] += trees.base_offset[j];
1289
+ }
1290
+ }
1291
+ }
1292
+
1293
+ // phi = np.zeros((self._current_X.shape[1] + 1, self._current_X.shape[1] + 1, self.n_outputs))
1294
+ // phi_diag = np.zeros((self._current_X.shape[1] + 1, self.n_outputs))
1295
+ // for t in range(self.tree_limit):
1296
+ // self.tree_shap(self.trees[t], self._current_X[i,:], self._current_x_missing, phi_diag)
1297
+ // for j in self.trees[t].unique_features:
1298
+ // phi_on = np.zeros((self._current_X.shape[1] + 1, self.n_outputs))
1299
+ // phi_off = np.zeros((self._current_X.shape[1] + 1, self.n_outputs))
1300
+ // self.tree_shap(self.trees[t], self._current_X[i,:], self._current_x_missing, phi_on, 1, j)
1301
+ // self.tree_shap(self.trees[t], self._current_X[i,:], self._current_x_missing, phi_off, -1, j)
1302
+ // phi[j] += np.true_divide(np.subtract(phi_on,phi_off),2.0)
1303
+ // phi_diag[j] -= np.sum(np.true_divide(np.subtract(phi_on,phi_off),2.0))
1304
+ // for j in range(self._current_X.shape[1]+1):
1305
+ // phi[j][j] = phi_diag[j]
1306
+ // phi /= self.tree_limit
1307
+ // return phi
1308
+
1309
/**
 * Tree SHAP interaction values under the per-tree path-dependent assumption.
 *
 * For each sample the output is an (M+1) x (M+1) x num_outputs tensor flattened into
 * out_contribs: row i holds the interaction of feature i with every feature (plus the
 * bias slot), and the diagonal holds each feature's main effect. The `transform`
 * argument is accepted for signature uniformity with the other handlers but is not
 * applied here (interaction values are computed on the raw margin).
 */
inline void dense_tree_interactions_path_dependent(const TreeEnsemble& trees, const ExplanationDataset &data,
                                                   tfloat *out_contribs,
                                                   tfloat transform(const tfloat, const tfloat)) {

    // build a list of all the unique features in each tree
    // (a tree can reference at most min(M, max_nodes) distinct features)
    int amount_of_unique_features = min(data.M, trees.max_nodes);
    int *unique_features = new int[trees.tree_limit * amount_of_unique_features];
    std::fill(unique_features, unique_features + trees.tree_limit * amount_of_unique_features, -1);
    for (unsigned j = 0; j < trees.tree_limit; ++j) {
        const int *features_row = trees.features + j * trees.max_nodes;
        int *unique_features_row = unique_features + j * amount_of_unique_features;
        for (unsigned k = 0; k < trees.max_nodes; ++k) {
            // linear scan of the per-tree list: stop on a duplicate, or append
            // into the first unused (-1) slot
            for (unsigned l = 0; l < amount_of_unique_features; ++l) {
                if (features_row[k] == unique_features_row[l]) break;
                if (unique_features_row[l] < 0) {
                    unique_features_row[l] = features_row[k];
                    break;
                }
            }
        }
    }

    // build an interaction explanation for each sample
    tfloat *instance_out_contribs;
    TreeEnsemble tree;
    ExplanationDataset instance;
    // one "row" of the interaction tensor: (M+1) feature slots x num_outputs
    const unsigned contrib_row_size = (data.M + 1) * trees.num_outputs;
    tfloat *diag_contribs = new tfloat[contrib_row_size];
    tfloat *on_contribs = new tfloat[contrib_row_size];
    tfloat *off_contribs = new tfloat[contrib_row_size];
    for (unsigned i = 0; i < data.num_X; ++i) {
        // each sample owns (M+1) rows of contrib_row_size entries
        instance_out_contribs = out_contribs + i * (data.M + 1) * contrib_row_size;
        data.get_x_instance(instance, i);

        // aggregate the effect of explaining each tree
        // (this works because of the linearity property of Shapley values)
        std::fill(diag_contribs, diag_contribs + contrib_row_size, 0);
        for (unsigned j = 0; j < trees.tree_limit; ++j) {
            trees.get_tree(tree, j);
            // unconditioned SHAP values accumulate into the (future) diagonal
            tree_shap(tree, instance, diag_contribs, 0, 0);

            const int *unique_features_row = unique_features + j * amount_of_unique_features;
            for (unsigned k = 0; k < amount_of_unique_features; ++k) {
                const int ind = unique_features_row[k];
                if (ind < 0) break; // < 0 means we have seen all the features for this tree

                // compute the shap value with this feature held on and off
                std::fill(on_contribs, on_contribs + contrib_row_size, 0);
                std::fill(off_contribs, off_contribs + contrib_row_size, 0);
                tree_shap(tree, instance, on_contribs, 1, ind);
                tree_shap(tree, instance, off_contribs, -1, ind);

                // save the difference between on and off as the interaction value;
                // subtracting it from the diagonal keeps the totals additive
                for (unsigned l = 0; l < contrib_row_size; ++l) {
                    const tfloat val = (on_contribs[l] - off_contribs[l]) / 2;
                    instance_out_contribs[ind * contrib_row_size + l] += val;
                    diag_contribs[l] -= val;
                }
            }
        }

        // set the diagonal (main effects left over after removing all interactions)
        for (unsigned j = 0; j < data.M + 1; ++j) {
            const unsigned offset = j * contrib_row_size + j * trees.num_outputs;
            for (unsigned k = 0; k < trees.num_outputs; ++k) {
                instance_out_contribs[offset + k] = diag_contribs[j * trees.num_outputs + k];
            }
        }

        // apply the base offset to the bias term (the [M][M] diagonal cell)
        const unsigned last_ind = (data.M * (data.M + 1) + data.M) * trees.num_outputs;
        for (unsigned j = 0; j < trees.num_outputs; ++j) {
            instance_out_contribs[last_ind + j] += trees.base_offset[j];
        }
    }

    delete[] diag_contribs;
    delete[] on_contribs;
    delete[] off_contribs;
    delete[] unique_features;
}
1390
+
1391
+ /**
1392
+ * This runs Tree SHAP with a global path conditional dependence assumption.
1393
+ *
1394
+ * By first merging all the trees in a tree ensemble into an equivalent single tree
1395
+ * this method allows arbitrary marginal transformations and also ensures that all the
1396
+ * evaluations of the model are consistent with some training data point.
1397
+ */
1398
+ inline void dense_global_path_dependent(const TreeEnsemble& trees, const ExplanationDataset &data,
1399
+ tfloat *out_contribs, tfloat transform(const tfloat, const tfloat)) {
1400
+
1401
+ // allocate space for our new merged tree (we save enough room to totally split all samples if need be)
1402
+ TreeEnsemble merged_tree;
1403
+ merged_tree.allocate(1, (data.num_X + data.num_R) * 2, trees.num_outputs);
1404
+
1405
+ // collapse the ensemble of trees into a single tree that has the same behavior
1406
+ // for all the X and R samples in the dataset
1407
+ build_merged_tree(merged_tree, data, trees);
1408
+
1409
+ // compute the expected value and depth of the new merged tree
1410
+ compute_expectations(merged_tree);
1411
+
1412
+ // explain each sample using our new merged tree
1413
+ ExplanationDataset instance;
1414
+ tfloat *instance_out_contribs;
1415
+ for (unsigned i = 0; i < data.num_X; ++i) {
1416
+ instance_out_contribs = out_contribs + i * (data.M + 1) * trees.num_outputs;
1417
+ data.get_x_instance(instance, i);
1418
+
1419
+ // since we now just have a single merged tree we can just use the tree_path_dependent algorithm
1420
+ tree_shap(merged_tree, instance, instance_out_contribs, 0, 0);
1421
+
1422
+ // apply the base offset to the bias term
1423
+ for (unsigned j = 0; j < trees.num_outputs; ++j) {
1424
+ instance_out_contribs[data.M * trees.num_outputs + j] += trees.base_offset[j];
1425
+ }
1426
+ }
1427
+
1428
+ merged_tree.free();
1429
+ }
1430
+
1431
+
1432
+ /**
1433
+ * The main method for computing Tree SHAP on models using dense data.
1434
+ */
1435
+ inline void dense_tree_shap(const TreeEnsemble& trees, const ExplanationDataset &data, tfloat *out_contribs,
1436
+ const int feature_dependence, unsigned model_transform, bool interactions) {
1437
+
1438
+ // see what transform (if any) we have
1439
+ transform_f transform = get_transform(model_transform);
1440
+
1441
+ // dispatch to the correct algorithm handler
1442
+ switch (feature_dependence) {
1443
+ case FEATURE_DEPENDENCE::independent:
1444
+ if (interactions) {
1445
+ std::cerr << "FEATURE_DEPENDENCE::independent does not support interactions!\n";
1446
+ } else dense_independent(trees, data, out_contribs, transform);
1447
+ return;
1448
+
1449
+ case FEATURE_DEPENDENCE::tree_path_dependent:
1450
+ if (interactions) dense_tree_interactions_path_dependent(trees, data, out_contribs, transform);
1451
+ else dense_tree_path_dependent(trees, data, out_contribs, transform);
1452
+ return;
1453
+
1454
+ case FEATURE_DEPENDENCE::global_path_dependent:
1455
+ if (interactions) {
1456
+ std::cerr << "FEATURE_DEPENDENCE::global_path_dependent does not support interactions!\n";
1457
+ } else dense_global_path_dependent(trees, data, out_contribs, transform);
1458
+ return;
1459
+ }
1460
+ }
lib/shap/datasets.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from urllib.request import urlretrieve
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import sklearn.datasets
7
+
8
+ import shap
9
+
10
+ github_data_url = "https://github.com/shap/shap/raw/master/data/"
11
+
12
+
13
def imagenet50(display=False, resolution=224, n_points=None):
    """ This is a set of 50 images representative of ImageNet images.

    This dataset was collected by randomly finding a working ImageNet link and then pasting the
    original ImageNet image into Google image search restricted to images licensed for reuse. A
    similar image (now with rights to reuse) was downloaded as a rough replacement for the original
    ImageNet image. The point is to have a random sample of ImageNet for use as a background
    distribution for explaining models trained on ImageNet data.

    Note that because the images are only rough replacements the labels might no longer be correct.
    """
    prefix = github_data_url + "imagenet50_"
    images = np.load(cache(f"{prefix}{resolution}x{resolution}.npy")).astype(np.float32)
    labels = np.loadtxt(cache(f"{prefix}labels.csv"))

    # optionally subsample to a fixed number of points (deterministic)
    if n_points is not None:
        images = shap.utils.sample(images, n_points, random_state=0)
        labels = shap.utils.sample(labels, n_points, random_state=0)

    return images, labels
34
+
35
+
36
def california(display=False, n_points=None):
    """ Return the california housing data in a nice package. """
    bunch = sklearn.datasets.fetch_california_housing()
    features = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
    labels = bunch.target

    if n_points is None:
        return features, labels

    # deterministic subsample keeps X and y aligned (same random_state)
    return (shap.utils.sample(features, n_points, random_state=0),
            shap.utils.sample(labels, n_points, random_state=0))
48
+
49
+
50
def linnerud(display=False, n_points=None):
    """ Return the linnerud data in a nice package (multi-target regression). """
    bunch = sklearn.datasets.load_linnerud()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    targets = pd.DataFrame(bunch.target, columns=bunch.target_names)

    if n_points is None:
        return features, targets

    # deterministic subsample keeps X and y aligned (same random_state)
    return (shap.utils.sample(features, n_points, random_state=0),
            shap.utils.sample(targets, n_points, random_state=0))
62
+
63
+
64
def imdb(display=False, n_points=None):
    """ Return the classic IMDB sentiment analysis training data in a nice package.

    Full data is at: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    Paper to cite when using the data is: http://www.aclweb.org/anthology/P11-1015
    """
    path = cache(github_data_url + "imdb_train.txt")
    with open(path, encoding="utf-8") as handle:
        corpus = handle.readlines()

    # first 12500 reviews are negative (label 0), the rest positive (label 1)
    labels = np.ones(25000, dtype=bool)
    labels[:12500] = 0

    if n_points is not None:
        corpus = shap.utils.sample(corpus, n_points, random_state=0)
        labels = shap.utils.sample(labels, n_points, random_state=0)

    return corpus, labels
81
+
82
+
83
def communitiesandcrime(display=False, n_points=None):
    """ Predict total number of non-violent crimes per 100K popuation.

    This dataset is from the classic UCI Machine Learning repository:
    https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime+Unnormalized
    """
    raw = pd.read_csv(
        cache(github_data_url + "CommViolPredUnnormalizedData.txt"),
        na_values="?"
    )

    # keep only the rows where the target (second-to-last column) is known
    valid_inds = np.where(~np.isnan(raw.iloc[:, -2]))[0]

    if n_points is not None:
        valid_inds = shap.utils.sample(valid_inds, n_points, random_state=0)

    y = np.array(raw.iloc[valid_inds, -2], dtype=float)

    # predictive features live in columns 5:-18; drop any column with missing values
    X = raw.iloc[valid_inds, 5:-18]
    complete_cols = np.where(np.isnan(X.values).sum(0) == 0)[0]
    X = X.iloc[:, complete_cols]

    return X, y
109
+
110
+
111
def diabetes(display=False, n_points=None):
    """ Return the diabetes data in a nice package. """
    bunch = sklearn.datasets.load_diabetes()
    features = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
    labels = bunch.target

    if n_points is None:
        return features, labels

    # deterministic subsample keeps X and y aligned (same random_state)
    return (shap.utils.sample(features, n_points, random_state=0),
            shap.utils.sample(labels, n_points, random_state=0))
123
+
124
+
125
def iris(display=False, n_points=None):
    """ Return the classic iris data in a nice package. """
    bunch = sklearn.datasets.load_iris()
    features = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
    labels = bunch.target

    if n_points is not None:
        features = shap.utils.sample(features, n_points, random_state=0)
        labels = shap.utils.sample(labels, n_points, random_state=0)

    # in display mode return human-readable species names instead of integer codes
    if display:
        return features, [bunch.target_names[v] for v in labels]
    return features, labels
139
+
140
+
141
def adult(display=False, n_points=None):
    """ Return the Adult census data in a nice package.

    Returns (X, y): X is a numerically encoded DataFrame (categoricals mapped to
    integer codes, "Education" and "fnlwgt" dropped) and y is a boolean array for
    income > 50K. With display=True, X keeps its original human-readable values.
    """
    # (column name, pandas dtype) pairs driving both read_csv parsing and the
    # categorical re-encoding loop below
    dtypes = [
        ("Age", "float32"), ("Workclass", "category"), ("fnlwgt", "float32"),
        ("Education", "category"), ("Education-Num", "float32"), ("Marital Status", "category"),
        ("Occupation", "category"), ("Relationship", "category"), ("Race", "category"),
        ("Sex", "category"), ("Capital Gain", "float32"), ("Capital Loss", "float32"),
        ("Hours per week", "float32"), ("Country", "category"), ("Target", "category")
    ]
    raw_data = pd.read_csv(
        cache(github_data_url + "adult.data"),
        names=[d[0] for d in dtypes],
        na_values="?",
        dtype=dict(dtypes)
    )

    # subsample BEFORE encoding so display and encoded outputs stay row-aligned
    if n_points is not None:
        raw_data = shap.utils.sample(raw_data, n_points, random_state=0)

    data = raw_data.drop(["Education"], axis=1) # redundant with Education-Num
    filt_dtypes = list(filter(lambda x: x[0] not in ["Target", "Education"], dtypes))
    # NOTE: values in the raw file carry a leading space, hence " >50K"
    data["Target"] = data["Target"] == " >50K"
    # fixed code map for Relationship so its encoding is stable across samples
    rcode = {
        "Not-in-family": 0,
        "Unmarried": 1,
        "Other-relative": 2,
        "Own-child": 3,
        "Husband": 4,
        "Wife": 5
    }
    for k, dtype in filt_dtypes:
        if dtype == "category":
            if k == "Relationship":
                data[k] = np.array([rcode[v.strip()] for v in data[k]])
            else:
                # other categoricals use pandas' own category codes
                data[k] = data[k].cat.codes

    if display:
        # human-readable features, but the boolean encoded target
        return raw_data.drop(["Education", "Target", "fnlwgt"], axis=1), data["Target"].values
    return data.drop(["Target", "fnlwgt"], axis=1), data["Target"].values
181
+
182
+
183
def nhanesi(display=False, n_points=None):
    """ A nicely packaged version of NHANES I data with surivival times as labels.
    """
    X = pd.read_csv(cache(github_data_url + "NHANESI_X.csv"), index_col=0)
    y = pd.read_csv(cache(github_data_url + "NHANESI_y.csv"), index_col=0)["y"]

    if n_points is not None:
        X = shap.utils.sample(X, n_points, random_state=0)
        y = shap.utils.sample(y, n_points, random_state=0)

    labels = np.array(y)
    if display:
        # display mode currently returns an unmodified copy of the features
        return X.copy(), labels
    return X, labels
198
+
199
+
200
def corrgroups60(display=False, n_points=1_000):
    """ Correlated Groups 60

    A simulated dataset with tight correlations among distinct groups of features.

    Returns (X, y): X is an (n_points, 60) DataFrame whose first 30 columns form
    ten groups of 3 features with pairwise sample correlation 0.99, and y is a
    linear signal on one feature per group plus small Gaussian noise.
    """
    # BUGFIX: the old code did `old_seed = np.random.seed()` which stores None
    # (np.random.seed returns None), so "restoring" the seed actually re-seeded
    # the global RNG from OS entropy. Save and restore the full RNG state instead,
    # and guard with try/finally so the caller's state survives any exception.
    rng_state = np.random.get_state()
    np.random.seed(0)
    try:
        # generate dataset with known correlation
        N, M = n_points, 60

        # set one coefficient from each group of 3 to 1
        beta = np.zeros(M)
        beta[0:30:3] = 1

        # build a correlation matrix with groups of 3 tightly correlated features
        C = np.eye(M)
        for i in range(0, 30, 3):
            C[i, i + 1] = C[i + 1, i] = 0.99
            C[i, i + 2] = C[i + 2, i] = 0.99
            C[i + 1, i + 2] = C[i + 2, i + 1] = 0.99

        def f(X):
            return np.matmul(X, beta)

        # whiten the raw sample so its empirical correlation is exactly identity,
        # then color it with C so the SAMPLE correlation matches C perfectly
        X_start = np.random.randn(N, M)
        X_centered = X_start - X_start.mean(0)
        Sigma = np.matmul(X_centered.T, X_centered) / X_centered.shape[0]
        W = np.linalg.cholesky(np.linalg.inv(Sigma)).T
        X_white = np.matmul(X_centered, W.T)
        assert np.linalg.norm(np.corrcoef(np.matmul(X_centered, W.T).T) - np.eye(M)) < 1e-6 # ensure this decorrelates the data

        # create the final data
        X = np.matmul(X_white, np.linalg.cholesky(C).T)
        y = f(X) + np.random.randn(N) * 1e-2
    finally:
        # restore the caller's global RNG state
        np.random.set_state(rng_state)

    return pd.DataFrame(X), y
243
+
244
+
245
def independentlinear60(display=False, n_points=1_000):
    """ A simulated dataset with 60 independent features and a sparse linear signal.

    Returns (X, y): X is an (n_points, 60) centered DataFrame of i.i.d. Gaussians
    and y is a linear signal on one feature out of every group of 3 (first 30
    columns) plus small Gaussian noise.
    """
    # BUGFIX: the old code did `old_seed = np.random.seed()` which stores None
    # (np.random.seed returns None), so "restoring" the seed actually re-seeded
    # the global RNG from OS entropy. Save and restore the full RNG state instead,
    # and guard with try/finally so the caller's state survives any exception.
    rng_state = np.random.get_state()
    np.random.seed(0)
    try:
        # generate dataset with known correlation
        N, M = n_points, 60

        # set one coefficient from each group of 3 to 1
        beta = np.zeros(M)
        beta[0:30:3] = 1

        def f(X):
            return np.matmul(X, beta)

        # center the sample so each column has exactly zero mean
        X_start = np.random.randn(N, M)
        X = X_start - X_start.mean(0)
        y = f(X) + np.random.randn(N) * 1e-2
    finally:
        # restore the caller's global RNG state
        np.random.set_state(rng_state)

    return pd.DataFrame(X), y
271
+
272
+
273
def a1a(n_points=None):
    """ A sparse dataset in scipy csr matrix format.
    """
    data, target = sklearn.datasets.load_svmlight_file(cache(github_data_url + 'a1a.svmlight'))

    if n_points is None:
        return data, target

    # deterministic subsample keeps X and y aligned (same random_state)
    return (shap.utils.sample(data, n_points, random_state=0),
            shap.utils.sample(target, n_points, random_state=0))
283
+
284
+
285
def rank():
    """ Ranking datasets from lightgbm repository.

    Returns (x_train, y_train, x_test, y_test, q_train, q_test) where the q_*
    arrays give per-query group sizes for the lambdarank task.
    """
    base_url = 'https://raw.githubusercontent.com/Microsoft/LightGBM/master/examples/lambdarank/'
    x_train, y_train = sklearn.datasets.load_svmlight_file(cache(base_url + 'rank.train'))
    x_test, y_test = sklearn.datasets.load_svmlight_file(cache(base_url + 'rank.test'))
    q_train = np.loadtxt(cache(base_url + 'rank.train.query'))
    q_test = np.loadtxt(cache(base_url + 'rank.test.query'))
    return x_train, y_train, x_test, y_test, q_train, q_test
295
+
296
+
297
def cache(url, file_name=None):
    """ Loads a file from the URL and caches it locally.

    Downloads happen only on the first call for a given file name; later calls
    return the already-cached path.
    """
    target_name = os.path.basename(url) if file_name is None else file_name

    # cache files live next to this module in a "cached_data" directory
    data_dir = os.path.join(os.path.dirname(__file__), "cached_data")
    os.makedirs(data_dir, exist_ok=True)

    file_path = os.path.join(data_dir, target_name)
    if not os.path.isfile(file_path):
        urlretrieve(url, file_path)

    return file_path
lib/shap/explainers/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ._additive import AdditiveExplainer
2
+ from ._deep import DeepExplainer
3
+ from ._exact import ExactExplainer
4
+ from ._gpu_tree import GPUTreeExplainer
5
+ from ._gradient import GradientExplainer
6
+ from ._kernel import KernelExplainer
7
+ from ._linear import LinearExplainer
8
+ from ._partition import PartitionExplainer
9
+ from ._permutation import PermutationExplainer
10
+ from ._sampling import SamplingExplainer
11
+ from ._tree import TreeExplainer
12
+
13
+ # Alternative legacy "short-form" aliases, which are kept here for backwards-compatibility
14
+ Additive = AdditiveExplainer
15
+ Deep = DeepExplainer
16
+ Exact = ExactExplainer
17
+ GPUTree = GPUTreeExplainer
18
+ Gradient = GradientExplainer
19
+ Kernel = KernelExplainer
20
+ Linear = LinearExplainer
21
+ Partition = PartitionExplainer
22
+ Permutation = PermutationExplainer
23
+ Sampling = SamplingExplainer
24
+ Tree = TreeExplainer
25
+
26
+ __all__ = [
27
+ "AdditiveExplainer",
28
+ "DeepExplainer",
29
+ "ExactExplainer",
30
+ "GPUTreeExplainer",
31
+ "GradientExplainer",
32
+ "KernelExplainer",
33
+ "LinearExplainer",
34
+ "PartitionExplainer",
35
+ "PermutationExplainer",
36
+ "SamplingExplainer",
37
+ "TreeExplainer",
38
+ ]
lib/shap/explainers/_additive.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ from ..utils import MaskedModel, safe_isinstance
4
+ from ._explainer import Explainer
5
+
6
+
7
+ class AdditiveExplainer(Explainer):
8
+ """ Computes SHAP values for generalized additive models.
9
+
10
+ This assumes that the model only has first-order effects. Extending this to
11
+ second- and third-order effects is future work (if you apply this to those models right now
12
+ you will get incorrect answers that fail additivity).
13
+ """
14
+
15
+ def __init__(self, model, masker, link=None, feature_names=None, linearize_link=True):
16
+ """ Build an Additive explainer for the given model using the given masker object.
17
+
18
+ Parameters
19
+ ----------
20
+ model : function
21
+ A callable python object that executes the model given a set of input data samples.
22
+
23
+ masker : function or numpy.array or pandas.DataFrame
24
+ A callable python object used to "mask" out hidden features of the form `masker(mask, *fargs)`.
25
+ It takes a single a binary mask and an input sample and returns a matrix of masked samples. These
26
+ masked samples are evaluated using the model function and the outputs are then averaged.
27
+ As a shortcut for the standard masking used by SHAP you can pass a background data matrix
28
+ instead of a function and that matrix will be used for masking. To use a clustering
29
+ game structure you can pass a shap.maskers.Tabular(data, hclustering=\"correlation\") object, but
30
+ note that this structure information has no effect on the explanations of additive models.
31
+ """
32
+ super().__init__(model, masker, feature_names=feature_names, linearize_link=linearize_link)
33
+
34
+ if safe_isinstance(model, "interpret.glassbox.ExplainableBoostingClassifier"):
35
+ self.model = model.decision_function
36
+
37
+ if self.masker is None:
38
+ self._expected_value = model.intercept_
39
+ # num_features = len(model.additive_terms_)
40
+
41
+ # fm = MaskedModel(self.model, self.masker, self.link, np.zeros(num_features))
42
+ # masks = np.ones((1, num_features), dtype=bool)
43
+ # outputs = fm(masks)
44
+ # self.model(np.zeros(num_features))
45
+ # self._zero_offset = self.model(np.zeros(num_features))#model.intercept_#outputs[0]
46
+ # self._input_offsets = np.zeros(num_features) #* self._zero_offset
47
+ raise NotImplementedError("Masker not given and we don't yet support pulling the distribution centering directly from the EBM model!")
48
+ return
49
+
50
+ # here we need to compute the offsets ourselves because we can't pull them directly from a model we know about
51
+ assert safe_isinstance(self.masker, "shap.maskers.Independent"), "The Additive explainer only supports the Tabular masker at the moment!"
52
+
53
+ # pre-compute per-feature offsets
54
+ fm = MaskedModel(self.model, self.masker, self.link, self.linearize_link, np.zeros(self.masker.shape[1]))
55
+ masks = np.ones((self.masker.shape[1]+1, self.masker.shape[1]), dtype=bool)
56
+ for i in range(1, self.masker.shape[1]+1):
57
+ masks[i,i-1] = False
58
+ outputs = fm(masks)
59
+ self._zero_offset = outputs[0]
60
+ self._input_offsets = np.zeros(masker.shape[1])
61
+ for i in range(1, self.masker.shape[1]+1):
62
+ self._input_offsets[i-1] = outputs[i] - self._zero_offset
63
+
64
+ self._expected_value = self._input_offsets.sum() + self._zero_offset
65
+
66
+ def __call__(self, *args, max_evals=None, silent=False):
67
+ """ Explains the output of model(*args), where args represents one or more parallel iterable args.
68
+ """
69
+
70
+ # we entirely rely on the general call implementation, we override just to remove **kwargs
71
+ # from the function signature
72
+ return super().__call__(*args, max_evals=max_evals, silent=silent)
73
+
74
+ @staticmethod
75
+ def supports_model_with_masker(model, masker):
76
+ """ Determines if this explainer can handle the given model.
77
+
78
+ This is an abstract static method meant to be implemented by each subclass.
79
+ """
80
+ if safe_isinstance(model, "interpret.glassbox.ExplainableBoostingClassifier"):
81
+ if model.interactions != 0:
82
+ raise NotImplementedError("Need to add support for interaction effects!")
83
+ return True
84
+
85
+ return False
86
+
87
+ def explain_row(self, *row_args, max_evals, main_effects, error_bounds, batch_size, outputs, silent):
88
+ """ Explains a single row and returns the tuple (row_values, row_expected_values, row_mask_shapes).
89
+ """
90
+
91
+ x = row_args[0]
92
+ inputs = np.zeros((len(x), len(x)))
93
+ for i in range(len(x)):
94
+ inputs[i,i] = x[i]
95
+
96
+ phi = self.model(inputs) - self._zero_offset - self._input_offsets
97
+
98
+ return {
99
+ "values": phi,
100
+ "expected_values": self._expected_value,
101
+ "mask_shapes": [a.shape for a in row_args],
102
+ "main_effects": phi,
103
+ "clustering": getattr(self.masker, "clustering", None)
104
+ }
105
+
106
+ # class AdditiveExplainer(Explainer):
107
+ # """ Computes SHAP values for generalized additive models.
108
+
109
+ # This assumes that the model only has first order effects. Extending this to
110
+ # 2nd and third order effects is future work (if you apply this to those models right now
111
+ # you will get incorrect answers that fail additivity).
112
+
113
+ # Parameters
114
+ # ----------
115
+ # model : function or ExplainableBoostingRegressor
116
+ # User supplied additive model either as either a function or a model object.
117
+
118
+ # data : numpy.array, pandas.DataFrame
119
+ # The background dataset to use for computing conditional expectations.
120
+ # feature_perturbation : "interventional"
121
+ # Only the standard interventional SHAP values are supported by AdditiveExplainer right now.
122
+ # """
123
+
124
+ # def __init__(self, model, data, feature_perturbation="interventional"):
125
+ # if feature_perturbation != "interventional":
126
+ # raise Exception("Unsupported type of feature_perturbation provided: " + feature_perturbation)
127
+
128
+ # if safe_isinstance(model, "interpret.glassbox.ebm.ebm.ExplainableBoostingRegressor"):
129
+ # self.f = model.predict
130
+ # elif callable(model):
131
+ # self.f = model
132
+ # else:
133
+ # raise ValueError("The passed model must be a recognized object or a function!")
134
+
135
+ # # convert dataframes
136
+ # if isinstance(data, (pd.Series, pd.DataFrame)):
137
+ # data = data.values
138
+ # self.data = data
139
+
140
+ # # compute the expected value of the model output
141
+ # self.expected_value = self.f(data).mean()
142
+
143
+ # # pre-compute per-feature offsets
144
+ # tmp = np.zeros(data.shape)
145
+ # self._zero_offset = self.f(tmp).mean()
146
+ # self._feature_offset = np.zeros(data.shape[1])
147
+ # for i in range(data.shape[1]):
148
+ # tmp[:,i] = data[:,i]
149
+ # self._feature_offset[i] = self.f(tmp).mean() - self._zero_offset
150
+ # tmp[:,i] = 0
151
+
152
+
153
+ # def shap_values(self, X):
154
+ # """ Estimate the SHAP values for a set of samples.
155
+
156
+ # Parameters
157
+ # ----------
158
+ # X : numpy.array, pandas.DataFrame or scipy.csr_matrix
159
+ # A matrix of samples (# samples x # features) on which to explain the model's output.
160
+
161
+ # Returns
162
+ # -------
163
+ # For models with a single output this returns a matrix of SHAP values
164
+ # (# samples x # features). Each row sums to the difference between the model output for that
165
+ # sample and the expected value of the model output (which is stored as expected_value
166
+ # attribute of the explainer).
167
+ # """
168
+
169
+ # # convert dataframes
170
+ # if isinstance(X, (pd.Series, pd.DataFrame)):
171
+ # X = X.values
172
+
173
+ # # assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))
174
+ # assert len(X.shape) == 1 or len(X.shape) == 2, "Instance must have 1 or 2 dimensions!"
175
+
176
+ # # convert dataframes
177
+ # if isinstance(X, (pd.Series, pd.DataFrame)):
178
+ # X = X.values
179
+
180
+ # phi = np.zeros(X.shape)
181
+ # tmp = np.zeros(X.shape)
182
+ # for i in range(X.shape[1]):
183
+ # tmp[:,i] = X[:,i]
184
+ # phi[:,i] = self.f(tmp) - self._zero_offset - self._feature_offset[i]
185
+ # tmp[:,i] = 0
186
+
187
+ # return phi
lib/shap/explainers/_deep/__init__.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .._explainer import Explainer
2
+ from .deep_pytorch import PyTorchDeep
3
+ from .deep_tf import TFDeep
4
+
5
+
6
class DeepExplainer(Explainer):
    """ Meant to approximate SHAP values for deep learning models.

    This is an enhanced version of the DeepLIFT algorithm (Deep SHAP) where, similar to Kernel SHAP, we
    approximate the conditional expectations of SHAP values using a selection of background samples.
    Lundberg and Lee, NIPS 2017 showed that the per node attribution rules in DeepLIFT (Shrikumar,
    Greenside, and Kundaje, arXiv 2017) can be chosen to approximate Shapley values. By integrating
    over many background samples Deep estimates approximate SHAP values such that they sum
    up to the difference between the expected model output on the passed background samples and the
    current model output (f(x) - E[f(x)]).

    Examples
    --------
    See :ref:`Deep Explainer Examples <deep_explainer_examples>`
    """

    def __init__(self, model, data, session=None, learning_phase_flags=None):
        """ An explainer object for a differentiable model using a given background dataset.

        Note that the complexity of the method scales linearly with the number of background data
        samples. Passing the entire training dataset as `data` will give very accurate expected
        values, but be unreasonably expensive. The variance of the expectation estimates scale by
        roughly 1/sqrt(N) for N background data samples. So 100 samples will give a good estimate,
        and 1000 samples a very good estimate of the expected values.

        Parameters
        ----------
        model : if framework == 'tensorflow', (input : [tf.Tensor], output : tf.Tensor)
            A pair of TensorFlow tensors (or a list and a tensor) that specifies the input and
            output of the model to be explained. Note that SHAP values are specific to a single
            output value, so the output tf.Tensor should be a single dimensional output (,1).

            if framework == 'pytorch', an nn.Module object (model), or a tuple (model, layer),
            where both are nn.Module objects
            The model is an nn.Module object which takes as input a tensor (or list of tensors) of
            shape data, and returns a single dimensional output.
            If the input is a tuple, the returned shap values will be for the input of the
            layer argument. layer must be a layer in the model, i.e. model.conv2

        data :
            if framework == 'tensorflow': [numpy.array] or [pandas.DataFrame]
            if framework == 'pytorch': [torch.tensor]
            The background dataset to use for integrating out features. Deep integrates
            over these samples. The data passed here must match the input tensors given in the
            first argument. Note that since these samples are integrated over for each sample you
            should only something like 100 or 1000 random background samples, not the whole training
            dataset.

        if framework == 'tensorflow':

        session : None or tensorflow.Session
            The TensorFlow session that has the model we are explaining. If None is passed then
            we do our best to find the right session, first looking for a keras session, then
            falling back to the default TensorFlow session.

        learning_phase_flags : None or list of tensors
            If you have your own custom learning phase flags pass them here. When explaining a prediction
            we need to ensure we are not in training mode, since this changes the behavior of ops like
            batch norm or dropout. If None is passed then we look for tensors in the graph that look like
            learning phase flags (this works for Keras models). Note that we assume all the flags should
            have a value of False during predictions (and hence explanations).
        """
        # first, we need to find the framework
        # NOTE(review): detection is duck-typed — anything exposing a working
        # named_parameters() is treated as PyTorch; everything else (including
        # any object that raises from that call) falls back to TensorFlow.
        if type(model) is tuple:
            a, b = model
            try:
                a.named_parameters()
                framework = 'pytorch'
            except Exception:
                framework = 'tensorflow'
        else:
            try:
                model.named_parameters()
                framework = 'pytorch'
            except Exception:
                framework = 'tensorflow'

        # delegate to the framework-specific implementation; session and
        # learning_phase_flags are only meaningful for TensorFlow models
        if framework == 'tensorflow':
            self.explainer = TFDeep(model, data, session, learning_phase_flags)
        elif framework == 'pytorch':
            self.explainer = PyTorchDeep(model, data)

        self.expected_value = self.explainer.expected_value
        self.explainer.framework = framework

    def shap_values(self, X, ranked_outputs=None, output_rank_order='max', check_additivity=True):
        """ Return approximate SHAP values for the model applied to the data given by X.

        Parameters
        ----------
        X : list,
            if framework == 'tensorflow': numpy.array, or pandas.DataFrame
            if framework == 'pytorch': torch.tensor
            A tensor (or list of tensors) of samples (where X.shape[0] == # samples) on which to
            explain the model's output.

        ranked_outputs : None or int
            If ranked_outputs is None then we explain all the outputs in a multi-output model. If
            ranked_outputs is a positive integer then we only explain that many of the top model
            outputs (where "top" is determined by output_rank_order). Note that this causes a pair
            of values to be returned (shap_values, indexes), where shap_values is a list of numpy
            arrays for each of the output ranks, and indexes is a matrix that indicates for each sample
            which output indexes were choses as "top".

        output_rank_order : "max", "min", or "max_abs"
            How to order the model outputs when using ranked_outputs, either by maximum, minimum, or
            maximum absolute value.

        check_additivity : bool
            Whether to validate that the SHAP values sum to the model output delta.

        Returns
        -------
        array or list
            For a models with a single output this returns a tensor of SHAP values with the same shape
            as X. For a model with multiple outputs this returns a list of SHAP value tensors, each of
            which are the same shape as X. If ranked_outputs is None then this list of tensors matches
            the number of model outputs. If ranked_outputs is a positive integer a pair is returned
            (shap_values, indexes), where shap_values is a list of tensors with a length of
            ranked_outputs, and indexes is a matrix that indicates for each sample which output indexes
            were chosen as "top".
        """
        # pure delegation — all work happens in the framework-specific explainer
        return self.explainer.shap_values(X, ranked_outputs, output_rank_order, check_additivity=check_additivity)
lib/shap/explainers/_deep/deep_pytorch.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ import numpy as np
4
+ from packaging import version
5
+
6
+ from .._explainer import Explainer
7
+ from .deep_utils import _check_additivity
8
+
9
+ torch = None
10
+
11
+
12
class PyTorchDeep(Explainer):
    """DeepLIFT-style (Deep SHAP) explainer for PyTorch models.

    For each explained sample, the sample is tiled against the background
    data, a backward pass is run with patched gradient hooks (see
    ``op_handler`` in this module), and the attribution is the mean of
    multiplier * (x - background) over the background set.
    """

    def __init__(self, model, data):
        """Build the explainer from an ``nn.Module`` (or ``(model, layer)``
        tuple for interim-layer explanations) and background data.

        Parameters
        ----------
        model : nn.Module or (nn.Module, nn.Module)
            When a tuple is passed, SHAP values are computed for the inputs
            of the given interim layer rather than the model inputs.
        data : torch.Tensor or [torch.Tensor]
            Background samples to integrate over.
        """
        # try and import pytorch (lazily, module-level `torch` starts as None)
        global torch
        if torch is None:
            import torch
            if version.parse(torch.__version__) < version.parse("0.4"):
                warnings.warn("Your PyTorch version is older than 0.4 and not supported.")

        # check if we have multiple inputs; normalize `data` to a list either way
        self.multi_input = False
        if isinstance(data, list):
            self.multi_input = True
        if not isinstance(data, list):
            data = [data]
        self.data = data
        self.layer = None
        self.input_handle = None
        self.interim = False
        self.interim_inputs_shape = None
        self.expected_value = None  # to keep the DeepExplainer base happy
        if type(model) == tuple:
            self.interim = True
            model, layer = model
            model = model.eval()
            self.layer = layer
            self.add_target_handle(self.layer)

            # if we are taking an interim layer, the 'data' is going to be the input
            # of the interim layer; we will capture this using a forward hook
            with torch.no_grad():
                _ = model(*data)
                interim_inputs = self.layer.target_input
                if type(interim_inputs) is tuple:
                    # this should always be true, but just to be safe
                    self.interim_inputs_shape = [i.shape for i in interim_inputs]
                else:
                    self.interim_inputs_shape = [interim_inputs.shape]
            # the shape-probing hook is no longer needed
            self.target_handle.remove()
            del self.layer.target_input
        self.model = model.eval()

        self.multi_output = False
        self.num_outputs = 1
        with torch.no_grad():
            outputs = model(*data)

            # also get the device everything is running on
            self.device = outputs.device
            if outputs.shape[1] > 1:
                self.multi_output = True
                self.num_outputs = outputs.shape[1]
            # E[f(x)] over the background data, one value per output
            self.expected_value = outputs.mean(0).cpu().numpy()

    def add_target_handle(self, layer):
        # forward hook that captures the interim layer's input (graph-attached)
        input_handle = layer.register_forward_hook(get_target_input)
        self.target_handle = input_handle

    def add_handles(self, model, forward_handle, backward_handle):
        """
        Add handles to all non-container layers in the model.
        Recursively for non-container layers
        """
        handles_list = []
        model_children = list(model.children())
        if model_children:
            for child in model_children:
                handles_list.extend(self.add_handles(child, forward_handle, backward_handle))
        else:  # leaves
            handles_list.append(model.register_forward_hook(forward_handle))
            handles_list.append(model.register_full_backward_hook(backward_handle))
        return handles_list

    def remove_attributes(self, model):
        """
        Removes the x and y attributes which were added by the forward handles
        Recursively searches for non-container layers
        """
        for child in model.children():
            if 'nn.modules.container' in str(type(child)):
                self.remove_attributes(child)
            else:
                try:
                    del child.x
                except AttributeError:
                    pass
                try:
                    del child.y
                except AttributeError:
                    pass

    def gradient(self, idx, inputs):
        """Gradient of output column ``idx`` w.r.t. ``inputs`` (or w.r.t. the
        interim layer inputs when ``self.interim`` is set), as numpy arrays.
        """
        self.model.zero_grad()
        X = [x.requires_grad_() for x in inputs]
        outputs = self.model(*X)
        # list of per-sample scalars for the selected output column
        selected = [val for val in outputs[:, idx]]
        grads = []
        if self.interim:
            interim_inputs = self.layer.target_input
            for idx, input in enumerate(interim_inputs):
                # retain the graph while more inputs remain to differentiate
                grad = torch.autograd.grad(selected, input,
                                           retain_graph=True if idx + 1 < len(interim_inputs) else None,
                                           allow_unused=True)[0]
                if grad is not None:
                    grad = grad.cpu().numpy()
                else:
                    # unused input: zero gradient of matching shape
                    grad = torch.zeros_like(X[idx]).cpu().numpy()
                grads.append(grad)
            del self.layer.target_input
            return grads, [i.detach().cpu().numpy() for i in interim_inputs]
        else:
            for idx, x in enumerate(X):
                grad = torch.autograd.grad(selected, x,
                                           retain_graph=True if idx + 1 < len(X) else None,
                                           allow_unused=True)[0]
                if grad is not None:
                    grad = grad.cpu().numpy()
                else:
                    grad = torch.zeros_like(X[idx]).cpu().numpy()
                grads.append(grad)
            return grads

    def shap_values(self, X, ranked_outputs=None, output_rank_order="max", check_additivity=True):
        """Compute SHAP values for the samples in ``X``.

        See ``DeepExplainer.shap_values`` for the parameter contract.
        """
        # X ~ self.model_input
        # X_data ~ self.data

        # check if we have multiple inputs
        if not self.multi_input:
            assert not isinstance(X, list), "Expected a single tensor model input!"
            X = [X]
        else:
            assert isinstance(X, list), "Expected a list of model inputs!"

        X = [x.detach().to(self.device) for x in X]

        model_output_values = None

        if ranked_outputs is not None and self.multi_output:
            with torch.no_grad():
                model_output_values = self.model(*X)
            # rank and determine the model outputs that we will explain
            if output_rank_order == "max":
                _, model_output_ranks = torch.sort(model_output_values, descending=True)
            elif output_rank_order == "min":
                _, model_output_ranks = torch.sort(model_output_values, descending=False)
            elif output_rank_order == "max_abs":
                _, model_output_ranks = torch.sort(torch.abs(model_output_values), descending=True)
            else:
                emsg = "output_rank_order must be max, min, or max_abs!"
                raise ValueError(emsg)
            model_output_ranks = model_output_ranks[:, :ranked_outputs]
        else:
            # explain every output in its natural order
            model_output_ranks = (torch.ones((X[0].shape[0], self.num_outputs)).int() *
                                  torch.arange(0, self.num_outputs).int())

        # add the gradient handles (forward saves x/y, backward applies DeepLIFT rules)
        handles = self.add_handles(self.model, add_interim_values, deeplift_grad)
        if self.interim:
            self.add_target_handle(self.layer)

        # compute the attributions
        output_phis = []
        for i in range(model_output_ranks.shape[1]):
            phis = []
            if self.interim:
                for k in range(len(self.interim_inputs_shape)):
                    phis.append(np.zeros((X[0].shape[0], ) + self.interim_inputs_shape[k][1: ]))
            else:
                for k in range(len(X)):
                    phis.append(np.zeros(X[k].shape))
            for j in range(X[0].shape[0]):
                # tile the inputs to line up with the background data samples
                tiled_X = [X[t][j:j + 1].repeat(
                                   (self.data[t].shape[0],) + tuple([1 for k in range(len(X[t].shape) - 1)])) for t
                           in range(len(X))]
                # stacked batch: first half = tiled sample, second half = background
                joint_x = [torch.cat((tiled_X[t], self.data[t]), dim=0) for t in range(len(X))]
                # run attribution computation graph
                feature_ind = model_output_ranks[j, i]
                sample_phis = self.gradient(feature_ind, joint_x)
                # assign the attributions to the right part of the output arrays
                if self.interim:
                    sample_phis, output = sample_phis
                    x, data = [], []
                    for k in range(len(output)):
                        x_temp, data_temp = np.split(output[k], 2)
                        x.append(x_temp)
                        data.append(data_temp)
                    for t in range(len(self.interim_inputs_shape)):
                        phis[t][j] = (sample_phis[t][self.data[t].shape[0]:] * (x[t] - data[t])).mean(0)
                else:
                    for t in range(len(X)):
                        phis[t][j] = (torch.from_numpy(sample_phis[t][self.data[t].shape[0]:]).to(self.device) * (X[t][j: j + 1] - self.data[t])).cpu().detach().numpy().mean(0)
            output_phis.append(phis[0] if not self.multi_input else phis)
        # cleanup; remove all gradient handles
        for handle in handles:
            handle.remove()
        self.remove_attributes(self.model)
        if self.interim:
            self.target_handle.remove()

        # check that the SHAP values sum up to the model output
        if check_additivity:
            if model_output_values is None:
                with torch.no_grad():
                    model_output_values = self.model(*X)

            _check_additivity(self, model_output_values.cpu(), output_phis)

        if not self.multi_output:
            return output_phis[0]
        elif ranked_outputs is not None:
            return output_phis, model_output_ranks
        else:
            return output_phis
227
+
228
+ # Module hooks
229
+
230
+
231
def deeplift_grad(module, grad_input, grad_output):
    """Backward hook that applies the DeepLIFT multiplier rule for ``module``.

    The rule is looked up by class name in ``op_handler``. Passthrough and
    linear ops keep their ordinary gradient; all other registered handlers
    replace it. Unregistered module types trigger a warning and keep their
    ordinary gradient unchanged.
    """
    mtype = module.__class__.__name__
    if mtype not in op_handler:
        warnings.warn(f'unrecognized nn.Module: {mtype}')
        return grad_input
    handler = op_handler[mtype]
    if handler.__name__ in ('passthrough', 'linear_1d'):
        # the ordinary gradient is already the correct multiplier
        return grad_input
    return handler(module, grad_input, grad_output)
244
+
245
+
246
def add_interim_values(module, input, output):
    """The forward hook used to save interim tensors, detached
    from the graph. Used to calculate the multipliers
    """
    # clear any stale tensors left over from a previous forward pass
    try:
        del module.x
    except AttributeError:
        pass
    try:
        del module.y
    except AttributeError:
        pass
    module_type = module.__class__.__name__
    if module_type in op_handler:
        func_name = op_handler[module_type].__name__
        # First, check for cases where we don't need to save the x and y tensors
        if func_name == 'passthrough':
            pass
        else:
            # check only the 0th input varies
            for i in range(len(input)):
                if i != 0 and type(output) is tuple:
                    assert input[i] == output[i], "Only the 0th input may vary!"
            # if a new method is added, it must be added here too. This ensures tensors
            # are only saved if necessary
            if func_name in ['maxpool', 'nonlinear_1d']:
                # only save tensors if necessary
                if type(input) is tuple:
                    setattr(module, 'x', torch.nn.Parameter(input[0].detach()))
                else:
                    setattr(module, 'x', torch.nn.Parameter(input.detach()))
                if type(output) is tuple:
                    setattr(module, 'y', torch.nn.Parameter(output[0].detach()))
                else:
                    setattr(module, 'y', torch.nn.Parameter(output.detach()))
281
+
282
+
283
def get_target_input(module, input, output):
    """Forward hook that stores the module's input — still attached to the
    autograd graph — as ``module.target_input``, replacing any stale value.
    Used if we want to explain the interim outputs of a model.
    """
    if hasattr(module, 'target_input'):
        del module.target_input
    module.target_input = input
292
+
293
+
294
def passthrough(module, grad_input, grad_output):
    """Identity DeepLIFT rule: returning None tells PyTorch to keep the
    module's ordinary gradient unchanged.
    """
    return None
297
+
298
+
299
def maxpool(module, grad_input, grad_output):
    """DeepLIFT backward rule for max-pooling layers.

    ``module.x`` / ``module.y`` are the pooling input/output saved by
    ``add_interim_values`` for the stacked batch (first half: explained
    sample copies, second half: background references — the halves come from
    the ``torch.cat`` in ``PyTorchDeep.shap_values``).
    """
    pool_to_unpool = {
        'MaxPool1d': torch.nn.functional.max_unpool1d,
        'MaxPool2d': torch.nn.functional.max_unpool2d,
        'MaxPool3d': torch.nn.functional.max_unpool3d
    }
    pool_to_function = {
        'MaxPool1d': torch.nn.functional.max_pool1d,
        'MaxPool2d': torch.nn.functional.max_pool2d,
        'MaxPool3d': torch.nn.functional.max_pool3d
    }
    # difference between the sample half and the reference half of the input
    delta_in = module.x[: int(module.x.shape[0] / 2)] - module.x[int(module.x.shape[0] / 2):]
    dup0 = [2] + [1 for i in delta_in.shape[1:]]
    # we also need to check if the output is a tuple
    y, ref_output = torch.chunk(module.y, 2)
    cross_max = torch.max(y, ref_output)
    diffs = torch.cat([cross_max - ref_output, y - cross_max], 0)

    # all of this just to unpool the outputs
    with torch.no_grad():
        # re-run pooling with return_indices=True to recover the argmax indices
        _, indices = pool_to_function[module.__class__.__name__](
            module.x, module.kernel_size, module.stride, module.padding,
            module.dilation, module.ceil_mode, True)
        xmax_pos, rmax_pos = torch.chunk(pool_to_unpool[module.__class__.__name__](
            grad_output[0] * diffs, indices, module.kernel_size, module.stride,
            module.padding, list(module.x.shape)), 2)

    grad_input = [None for _ in grad_input]
    # guard against division by near-zero delta_in (numerical stability)
    grad_input[0] = torch.where(torch.abs(delta_in) < 1e-7, torch.zeros_like(delta_in),
                                (xmax_pos + rmax_pos) / delta_in).repeat(dup0)

    return tuple(grad_input)
331
+
332
+
333
def linear_1d(module, grad_input, grad_output):
    """DeepLIFT rule for ops linear in their varying input: the ordinary
    gradient is already the exact multiplier, so return None to keep it.
    """
    return None
336
+
337
+
338
def nonlinear_1d(module, grad_input, grad_output):
    """DeepLIFT backward rule for elementwise nonlinearities: replaces the
    local gradient with the secant slope delta_out / delta_in between each
    sample and its background reference.

    ``module.x`` / ``module.y`` are the saved pre-/post-activation tensors
    for the stacked batch (first half: samples, second half: references),
    stored by ``add_interim_values``.
    """
    delta_out = module.y[: int(module.y.shape[0] / 2)] - module.y[int(module.y.shape[0] / 2):]

    delta_in = module.x[: int(module.x.shape[0] / 2)] - module.x[int(module.x.shape[0] / 2):]
    dup0 = [2] + [1 for i in delta_in.shape[1:]]
    # handles numerical instabilities where delta_in is very small by
    # just taking the gradient in those cases
    grads = [None for _ in grad_input]
    grads[0] = torch.where(torch.abs(delta_in.repeat(dup0)) < 1e-6, grad_input[0],
                           grad_output[0] * (delta_out / delta_in).repeat(dup0))
    return tuple(grads)
349
+
350
+
351
# Maps an nn.Module class name to the backward-hook rule applied by
# deeplift_grad (and consulted by add_interim_values to decide what to save).
op_handler = {}

# passthrough ops, where we make no change to the gradient
for _op in ('Dropout3d', 'Dropout2d', 'Dropout', 'AlphaDropout'):
    op_handler[_op] = passthrough

# ops that are linear in their varying input: the ordinary gradient is exact
for _op in ('Conv1d', 'Conv2d', 'Conv3d',
            'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d',
            'Linear',
            'AvgPool1d', 'AvgPool2d', 'AvgPool3d',
            'AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d',
            'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d'):
    op_handler[_op] = linear_1d

# elementwise nonlinearities, handled with the secant-slope rule
for _op in ('LeakyReLU', 'ReLU', 'ELU', 'Sigmoid', 'Tanh', 'Softplus', 'Softmax'):
    op_handler[_op] = nonlinear_1d

# max pooling needs its own unpooling-based rule
for _op in ('MaxPool1d', 'MaxPool2d', 'MaxPool3d'):
    op_handler[_op] = maxpool
lib/shap/explainers/_deep/deep_tf.py ADDED
@@ -0,0 +1,763 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ import numpy as np
4
+ from packaging import version
5
+
6
+ from ...utils._exceptions import DimensionError
7
+ from .._explainer import Explainer
8
+ from ..tf_utils import _get_graph, _get_model_inputs, _get_model_output, _get_session
9
+ from .deep_utils import _check_additivity
10
+
11
+ tf = None
12
+ tf_ops = None
13
+ tf_backprop = None
14
+ tf_execute = None
15
+ tf_gradients_impl = None
16
+
17
def custom_record_gradient(op_name, inputs, attrs, results):
    """ This overrides tensorflow.python.eager.backprop._record_gradient.

    ResourceGather ops take an integer index tensor, which TensorFlow would
    prune from gradient backprop. We temporarily "lie" and mark that tensor
    as float32 so the node is kept, record the gradient under the
    "shap_"-prefixed op name, then restore the true integer dtype.
    """
    spoofed_dtype = op_name == "ResourceGather" and inputs[1].dtype == tf.int32
    if spoofed_dtype:
        inputs[1].__dict__["_dtype"] = tf.float32

    try:
        out = tf_backprop._record_gradient("shap_"+op_name, inputs, attrs, results)
    except AttributeError:
        # newer TF versions expose record_gradient without the underscore
        out = tf_backprop.record_gradient("shap_"+op_name, inputs, attrs, results)

    if spoofed_dtype:
        inputs[1].__dict__["_dtype"] = tf.int32

    return out
+ return out
39
+
40
+ class TFDeep(Explainer):
41
+ """
42
+ Using tf.gradients to implement the backpropagation was
43
+ inspired by the gradient-based implementation approach proposed by Ancona et al, ICLR 2018. Note
44
+ that this package does not currently use the reveal-cancel rule for ReLu units proposed in DeepLIFT.
45
+ """
46
+
47
+ def __init__(self, model, data, session=None, learning_phase_flags=None):
48
+ """ An explainer object for a deep model using a given background dataset.
49
+
50
+ Note that the complexity of the method scales linearly with the number of background data
51
+ samples. Passing the entire training dataset as `data` will give very accurate expected
52
+ values, but will be computationally expensive. The variance of the expectation estimates scales by
53
+ roughly 1/sqrt(N) for N background data samples. So 100 samples will give a good estimate,
54
+ and 1000 samples a very good estimate of the expected values.
55
+
56
+ Parameters
57
+ ----------
58
+ model : tf.keras.Model or (input : [tf.Operation], output : tf.Operation)
59
+ A keras model object or a pair of TensorFlow operations (or a list and an op) that
60
+ specifies the input and output of the model to be explained. Note that SHAP values
61
+ are specific to a single output value, so you get an explanation for each element of
62
+ the output tensor (which must be a flat rank one vector).
63
+
64
+ data : [numpy.array] or [pandas.DataFrame] or function
65
+ The background dataset to use for integrating out features. DeepExplainer integrates
66
+ over all these samples for each explanation. The data passed here must match the input
67
+ operations given to the model. If a function is supplied, it must be a function that
68
+ takes a particular input example and generates the background dataset for that example
69
+ session : None or tensorflow.Session
70
+ The TensorFlow session that has the model we are explaining. If None is passed then
71
+ we do our best to find the right session, first looking for a keras session, then
72
+ falling back to the default TensorFlow session.
73
+
74
+ learning_phase_flags : None or list of tensors
75
+ If you have your own custom learning phase flags pass them here. When explaining a prediction
76
+ we need to ensure we are not in training mode, since this changes the behavior of ops like
77
+ batch norm or dropout. If None is passed then we look for tensors in the graph that look like
78
+ learning phase flags (this works for Keras models). Note that we assume all the flags should
79
+ have a value of False during predictions (and hence explanations).
80
+
81
+ """
82
+ # try to import tensorflow
83
+ global tf, tf_ops, tf_backprop, tf_execute, tf_gradients_impl
84
+ if tf is None:
85
+ from tensorflow.python.eager import backprop as tf_backprop
86
+ from tensorflow.python.eager import execute as tf_execute
87
+ from tensorflow.python.framework import (
88
+ ops as tf_ops,
89
+ )
90
+ from tensorflow.python.ops import (
91
+ gradients_impl as tf_gradients_impl,
92
+ )
93
+ if not hasattr(tf_gradients_impl, "_IsBackpropagatable"):
94
+ from tensorflow.python.ops import gradients_util as tf_gradients_impl
95
+ import tensorflow as tf
96
+ if version.parse(tf.__version__) < version.parse("1.4.0"):
97
+ warnings.warn("Your TensorFlow version is older than 1.4.0 and not supported.")
98
+
99
+ if version.parse(tf.__version__) >= version.parse("2.4.0"):
100
+ warnings.warn("Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.")
101
+
102
+ # determine the model inputs and outputs
103
+ self.model_inputs = _get_model_inputs(model)
104
+ self.model_output = _get_model_output(model)
105
+ assert not isinstance(self.model_output, list), "The model output to be explained must be a single tensor!"
106
+ assert len(self.model_output.shape) < 3, "The model output must be a vector or a single value!"
107
+ self.multi_output = True
108
+ if len(self.model_output.shape) == 1:
109
+ self.multi_output = False
110
+
111
+ if tf.executing_eagerly():
112
+ if isinstance(model, tuple) or isinstance(model, list):
113
+ assert len(model) == 2, "When a tuple is passed it must be of the form (inputs, outputs)"
114
+ from tensorflow.keras import Model
115
+ self.model = Model(model[0], model[1])
116
+ else:
117
+ self.model = model
118
+
119
+ # check if we have multiple inputs
120
+ self.multi_input = True
121
+ if not isinstance(self.model_inputs, list) or len(self.model_inputs) == 1:
122
+ self.multi_input = False
123
+ if not isinstance(self.model_inputs, list):
124
+ self.model_inputs = [self.model_inputs]
125
+ if not isinstance(data, list) and (hasattr(data, "__call__") is False):
126
+ data = [data]
127
+ self.data = data
128
+
129
+ self._vinputs = {} # used to track what op inputs depends on the model inputs
130
+ self.orig_grads = {}
131
+
132
+ if not tf.executing_eagerly():
133
+ self.session = _get_session(session)
134
+
135
+ self.graph = _get_graph(self)
136
+
137
+ # if no learning phase flags were given we go looking for them
138
+ # ...this will catch the one that keras uses
139
+ # we need to find them since we want to make sure learning phase flags are set to False
140
+ if learning_phase_flags is None:
141
+ self.learning_phase_ops = []
142
+ for op in self.graph.get_operations():
143
+ if 'learning_phase' in op.name and op.type == "Const" and len(op.outputs[0].shape) == 0:
144
+ if op.outputs[0].dtype == tf.bool:
145
+ self.learning_phase_ops.append(op)
146
+ self.learning_phase_flags = [op.outputs[0] for op in self.learning_phase_ops]
147
+ else:
148
+ self.learning_phase_ops = [t.op for t in learning_phase_flags]
149
+
150
+ # save the expected output of the model
151
+ # if self.data is a function, set self.expected_value to None
152
+ if (hasattr(self.data, '__call__')):
153
+ self.expected_value = None
154
+ else:
155
+ if self.data[0].shape[0] > 5000:
156
+ warnings.warn("You have provided over 5k background samples! For better performance consider using smaller random sample.")
157
+ if not tf.executing_eagerly():
158
+ self.expected_value = self.run(self.model_output, self.model_inputs, self.data).mean(0)
159
+ else:
160
+ #if type(self.model)is tuple:
161
+ # self.fModel(cnn.inputs, cnn.get_layer(theNameYouWant).outputs)
162
+ self.expected_value = tf.reduce_mean(self.model(self.data), 0)
163
+
164
+ if not tf.executing_eagerly():
165
+ self._init_between_tensors(self.model_output.op, self.model_inputs)
166
+
167
+ # make a blank array that will get lazily filled in with the SHAP value computation
168
+ # graphs for each output. Lazy is important since if there are 1000 outputs and we
169
+ # only explain the top 5 it would be a waste to build graphs for the other 995
170
+ if not self.multi_output:
171
+ self.phi_symbolics = [None]
172
+ else:
173
+ noutputs = self.model_output.shape.as_list()[1]
174
+ if noutputs is not None:
175
+ self.phi_symbolics = [None for i in range(noutputs)]
176
+ else:
177
+ raise DimensionError("The model output tensor to be explained cannot have a static shape in dim 1 of None!")
178
+
179
+ def _get_model_output(self, model):
180
+ if len(model.layers[-1]._inbound_nodes) == 0:
181
+ if len(model.outputs) > 1:
182
+ warnings.warn("Only one model output supported.")
183
+ return model.outputs[0]
184
+ else:
185
+ return model.layers[-1].output
186
+
187
+ def _init_between_tensors(self, out_op, model_inputs):
188
+ # find all the operations in the graph between our inputs and outputs
189
+ tensor_blacklist = tensors_blocked_by_false(self.learning_phase_ops) # don't follow learning phase branches
190
+ dependence_breakers = [k for k in op_handlers if op_handlers[k] == break_dependence]
191
+ back_ops = backward_walk_ops(
192
+ [out_op], tensor_blacklist,
193
+ dependence_breakers
194
+ )
195
+ start_ops = []
196
+ for minput in model_inputs:
197
+ for op in minput.consumers():
198
+ start_ops.append(op)
199
+ self.between_ops = forward_walk_ops(
200
+ start_ops,
201
+ tensor_blacklist, dependence_breakers,
202
+ within_ops=back_ops
203
+ )
204
+
205
+ # note all the tensors that are on the path between the inputs and the output
206
+ self.between_tensors = {}
207
+ for op in self.between_ops:
208
+ for t in op.outputs:
209
+ self.between_tensors[t.name] = True
210
+ for t in model_inputs:
211
+ self.between_tensors[t.name] = True
212
+
213
+ # save what types are being used
214
+ self.used_types = {}
215
+ for op in self.between_ops:
216
+ self.used_types[op.type] = True
217
+
218
+ def _variable_inputs(self, op):
219
+ """ Return which inputs of this operation are variable (i.e. depend on the model inputs).
220
+ """
221
+ if op not in self._vinputs:
222
+ out = np.zeros(len(op.inputs), dtype=bool)
223
+ for i,t in enumerate(op.inputs):
224
+ out[i] = t.name in self.between_tensors
225
+ self._vinputs[op] = out
226
+ return self._vinputs[op]
227
+
228
+ def phi_symbolic(self, i):
229
+ """ Get the SHAP value computation graph for a given model output.
230
+ """
231
+ if self.phi_symbolics[i] is None:
232
+
233
+ if not tf.executing_eagerly():
234
+ def anon():
235
+ out = self.model_output[:,i] if self.multi_output else self.model_output
236
+ return tf.gradients(out, self.model_inputs)
237
+
238
+ self.phi_symbolics[i] = self.execute_with_overridden_gradients(anon)
239
+ else:
240
+ @tf.function
241
+ def grad_graph(shap_rAnD):
242
+ phase = tf.keras.backend.learning_phase()
243
+ tf.keras.backend.set_learning_phase(0)
244
+
245
+ with tf.GradientTape(watch_accessed_variables=False) as tape:
246
+ tape.watch(shap_rAnD)
247
+ out = self.model(shap_rAnD)
248
+ if self.multi_output:
249
+ out = out[:,i]
250
+
251
+ self._init_between_tensors(out.op, shap_rAnD)
252
+ x_grad = tape.gradient(out, shap_rAnD)
253
+ tf.keras.backend.set_learning_phase(phase)
254
+ return x_grad
255
+
256
+ self.phi_symbolics[i] = grad_graph
257
+
258
+ return self.phi_symbolics[i]
259
+
260
+ def shap_values(self, X, ranked_outputs=None, output_rank_order="max", check_additivity=True):
261
+ # check if we have multiple inputs
262
+ if not self.multi_input:
263
+ if isinstance(X, list) and len(X) != 1:
264
+ raise ValueError("Expected a single tensor as model input!")
265
+ elif not isinstance(X, list):
266
+ X = [X]
267
+ else:
268
+ assert isinstance(X, list), "Expected a list of model inputs!"
269
+ assert len(self.model_inputs) == len(X), "Number of model inputs (%d) does not match the number given (%d)!" % (len(self.model_inputs), len(X))
270
+
271
+ # rank and determine the model outputs that we will explain
272
+ if ranked_outputs is not None and self.multi_output:
273
+ if not tf.executing_eagerly():
274
+ model_output_values = self.run(self.model_output, self.model_inputs, X)
275
+ else:
276
+ model_output_values = self.model(X)
277
+
278
+ if output_rank_order == "max":
279
+ model_output_ranks = np.argsort(-model_output_values)
280
+ elif output_rank_order == "min":
281
+ model_output_ranks = np.argsort(model_output_values)
282
+ elif output_rank_order == "max_abs":
283
+ model_output_ranks = np.argsort(np.abs(model_output_values))
284
+ else:
285
+ emsg = "output_rank_order must be max, min, or max_abs!"
286
+ raise ValueError(emsg)
287
+ model_output_ranks = model_output_ranks[:,:ranked_outputs]
288
+ else:
289
+ model_output_ranks = np.tile(np.arange(len(self.phi_symbolics)), (X[0].shape[0], 1))
290
+
291
+ # compute the attributions
292
+ output_phis = []
293
+ for i in range(model_output_ranks.shape[1]):
294
+ phis = []
295
+ for k in range(len(X)):
296
+ phis.append(np.zeros(X[k].shape))
297
+ for j in range(X[0].shape[0]):
298
+ if (hasattr(self.data, '__call__')):
299
+ bg_data = self.data([X[t][j] for t in range(len(X))])
300
+ if not isinstance(bg_data, list):
301
+ bg_data = [bg_data]
302
+ else:
303
+ bg_data = self.data
304
+
305
+ # tile the inputs to line up with the background data samples
306
+ tiled_X = [np.tile(X[t][j:j+1], (bg_data[t].shape[0],) + tuple([1 for k in range(len(X[t].shape)-1)])) for t in range(len(X))]
307
+
308
+ # we use the first sample for the current sample and the rest for the references
309
+ joint_input = [np.concatenate([tiled_X[t], bg_data[t]], 0) for t in range(len(X))]
310
+
311
+ # run attribution computation graph
312
+ feature_ind = model_output_ranks[j,i]
313
+ sample_phis = self.run(self.phi_symbolic(feature_ind), self.model_inputs, joint_input)
314
+
315
+ # assign the attributions to the right part of the output arrays
316
+ for t in range(len(X)):
317
+ phis[t][j] = (sample_phis[t][bg_data[t].shape[0]:] * (X[t][j] - bg_data[t])).mean(0)
318
+
319
+ output_phis.append(phis[0] if not self.multi_input else phis)
320
+
321
+ # check that the SHAP values sum up to the model output
322
+ if check_additivity:
323
+ if not tf.executing_eagerly():
324
+ model_output = self.run(self.model_output, self.model_inputs, X)
325
+ else:
326
+ model_output = self.model(X)
327
+
328
+ _check_additivity(self, model_output, output_phis)
329
+
330
+ if not self.multi_output:
331
+ return output_phis[0]
332
+ elif ranked_outputs is not None:
333
+ return output_phis, model_output_ranks
334
+ else:
335
+ return output_phis
336
+
337
+ def run(self, out, model_inputs, X):
338
+ """ Runs the model while also setting the learning phase flags to False.
339
+ """
340
+ if not tf.executing_eagerly():
341
+ feed_dict = dict(zip(model_inputs, X))
342
+ for t in self.learning_phase_flags:
343
+ feed_dict[t] = False
344
+ return self.session.run(out, feed_dict)
345
+ else:
346
+ def anon():
347
+ tf_execute.record_gradient = custom_record_gradient
348
+
349
+ # build inputs that are correctly shaped, typed, and tf-wrapped
350
+ inputs = []
351
+ for i in range(len(X)):
352
+ shape = list(self.model_inputs[i].shape)
353
+ shape[0] = -1
354
+ data = X[i].reshape(shape)
355
+ v = tf.constant(data, dtype=self.model_inputs[i].dtype)
356
+ inputs.append(v)
357
+ final_out = out(inputs)
358
+ try:
359
+ tf_execute.record_gradient = tf_backprop._record_gradient
360
+ except AttributeError:
361
+ tf_execute.record_gradient = tf_backprop.record_gradient
362
+
363
+ return final_out
364
+ return self.execute_with_overridden_gradients(anon)
365
+
366
+ def custom_grad(self, op, *grads):
367
+ """ Passes a gradient op creation request to the correct handler.
368
+ """
369
+ type_name = op.type[5:] if op.type.startswith("shap_") else op.type
370
+ out = op_handlers[type_name](self, op, *grads) # we cut off the shap_ prefix before the lookup
371
+ return out
372
+
373
+ def execute_with_overridden_gradients(self, f):
374
+ # replace the gradients for all the non-linear activations
375
+ # we do this by hacking our way into the registry (TODO: find a public API for this if it exists)
376
+ reg = tf_ops._gradient_registry._registry
377
+ ops_not_in_registry = ['TensorListReserve']
378
+ # NOTE: location_tag taken from tensorflow source for None type ops
379
+ location_tag = ("UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN", "UNKNOWN")
380
+ # TODO: unclear why some ops are not in the registry with TF 2.0 like TensorListReserve
381
+ for non_reg_ops in ops_not_in_registry:
382
+ reg[non_reg_ops] = {'type': None, 'location': location_tag}
383
+ for n in op_handlers:
384
+ if n in reg:
385
+ self.orig_grads[n] = reg[n]["type"]
386
+ reg["shap_"+n] = {
387
+ "type": self.custom_grad,
388
+ "location": reg[n]["location"]
389
+ }
390
+ reg[n]["type"] = self.custom_grad
391
+
392
+ # In TensorFlow 1.10 they started pruning out nodes that they think can't be backpropped
393
+ # unfortunately that includes the index of embedding layers so we disable that check here
394
+ if hasattr(tf_gradients_impl, "_IsBackpropagatable"):
395
+ orig_IsBackpropagatable = tf_gradients_impl._IsBackpropagatable
396
+ tf_gradients_impl._IsBackpropagatable = lambda tensor: True
397
+
398
+ # define the computation graph for the attribution values using a custom gradient-like computation
399
+ try:
400
+ out = f()
401
+ finally:
402
+ # reinstate the backpropagatable check
403
+ if hasattr(tf_gradients_impl, "_IsBackpropagatable"):
404
+ tf_gradients_impl._IsBackpropagatable = orig_IsBackpropagatable
405
+
406
+ # restore the original gradient definitions
407
+ for n in op_handlers:
408
+ if n in reg:
409
+ del reg["shap_"+n]
410
+ reg[n]["type"] = self.orig_grads[n]
411
+ for non_reg_ops in ops_not_in_registry:
412
+ del reg[non_reg_ops]
413
+ if not tf.executing_eagerly():
414
+ return out
415
+ else:
416
+ return [v.numpy() for v in out]
417
+
418
+ def tensors_blocked_by_false(ops):
419
+ """ Follows a set of ops assuming their value is False and find blocked Switch paths.
420
+
421
+ This is used to prune away parts of the model graph that are only used during the training
422
+ phase (like dropout, batch norm, etc.).
423
+ """
424
+ blocked = []
425
+ def recurse(op):
426
+ if op.type == "Switch":
427
+ blocked.append(op.outputs[1]) # the true path is blocked since we assume the ops we trace are False
428
+ else:
429
+ for out in op.outputs:
430
+ for c in out.consumers():
431
+ recurse(c)
432
+ for op in ops:
433
+ recurse(op)
434
+
435
+ return blocked
436
+
437
+ def backward_walk_ops(start_ops, tensor_blacklist, op_type_blacklist):
438
+ found_ops = []
439
+ op_stack = [op for op in start_ops]
440
+ while len(op_stack) > 0:
441
+ op = op_stack.pop()
442
+ if op.type not in op_type_blacklist and op not in found_ops:
443
+ found_ops.append(op)
444
+ for input in op.inputs:
445
+ if input not in tensor_blacklist:
446
+ op_stack.append(input.op)
447
+ return found_ops
448
+
449
+ def forward_walk_ops(start_ops, tensor_blacklist, op_type_blacklist, within_ops):
450
+ found_ops = []
451
+ op_stack = [op for op in start_ops]
452
+ while len(op_stack) > 0:
453
+ op = op_stack.pop()
454
+ if op.type not in op_type_blacklist and op in within_ops and op not in found_ops:
455
+ found_ops.append(op)
456
+ for out in op.outputs:
457
+ if out not in tensor_blacklist:
458
+ for c in out.consumers():
459
+ op_stack.append(c)
460
+ return found_ops
461
+
462
+
463
+ def softmax(explainer, op, *grads):
464
+ """ Just decompose softmax into its components and recurse, we can handle all of them :)
465
+
466
+ We assume the 'axis' is the last dimension because the TF codebase swaps the 'axis' to
467
+ the last dimension before the softmax op if 'axis' is not already the last dimension.
468
+ We also don't subtract the max before tf.exp for numerical stability since that might
469
+ mess up the attributions and it seems like TensorFlow doesn't define softmax that way
470
+ (according to the docs)
471
+ """
472
+ in0 = op.inputs[0]
473
+ in0_max = tf.reduce_max(in0, axis=-1, keepdims=True, name="in0_max")
474
+ in0_centered = in0 - in0_max
475
+ evals = tf.exp(in0_centered, name="custom_exp")
476
+ rsum = tf.reduce_sum(evals, axis=-1, keepdims=True)
477
+ div = evals / rsum
478
+
479
+ # mark these as in-between the inputs and outputs
480
+ for op in [evals.op, rsum.op, div.op, in0_centered.op]:
481
+ for t in op.outputs:
482
+ if t.name not in explainer.between_tensors:
483
+ explainer.between_tensors[t.name] = False
484
+
485
+ out = tf.gradients(div, in0_centered, grad_ys=grads[0])[0]
486
+
487
+ # remove the names we just added
488
+ for op in [evals.op, rsum.op, div.op, in0_centered.op]:
489
+ for t in op.outputs:
490
+ if explainer.between_tensors[t.name] is False:
491
+ del explainer.between_tensors[t.name]
492
+
493
+ # rescale to account for our shift by in0_max (which we did for numerical stability)
494
+ xin0,rin0 = tf.split(in0, 2)
495
+ xin0_centered,rin0_centered = tf.split(in0_centered, 2)
496
+ delta_in0 = xin0 - rin0
497
+ dup0 = [2] + [1 for i in delta_in0.shape[1:]]
498
+ return tf.where(
499
+ tf.tile(tf.abs(delta_in0), dup0) < 1e-6,
500
+ out,
501
+ out * tf.tile((xin0_centered - rin0_centered) / delta_in0, dup0)
502
+ )
503
+
504
+ def maxpool(explainer, op, *grads):
505
+ xin0,rin0 = tf.split(op.inputs[0], 2)
506
+ xout,rout = tf.split(op.outputs[0], 2)
507
+ delta_in0 = xin0 - rin0
508
+ dup0 = [2] + [1 for i in delta_in0.shape[1:]]
509
+ cross_max = tf.maximum(xout, rout)
510
+ diffs = tf.concat([cross_max - rout, xout - cross_max], 0)
511
+ if op.type.startswith("shap_"):
512
+ op.type = op.type[5:]
513
+ xmax_pos,rmax_pos = tf.split(explainer.orig_grads[op.type](op, grads[0] * diffs), 2)
514
+ return tf.tile(tf.where(
515
+ tf.abs(delta_in0) < 1e-7,
516
+ tf.zeros_like(delta_in0),
517
+ (xmax_pos + rmax_pos) / delta_in0
518
+ ), dup0)
519
+
520
+ def gather(explainer, op, *grads):
521
+ #params = op.inputs[0]
522
+ indices = op.inputs[1]
523
+ #axis = op.inputs[2]
524
+ var = explainer._variable_inputs(op)
525
+ if var[1] and not var[0]:
526
+ assert len(indices.shape) == 2, "Only scalar indices supported right now in GatherV2!"
527
+
528
+ xin1,rin1 = tf.split(tf.cast(op.inputs[1], tf.float32), 2)
529
+ xout,rout = tf.split(op.outputs[0], 2)
530
+ dup_in1 = [2] + [1 for i in xin1.shape[1:]]
531
+ dup_out = [2] + [1 for i in xout.shape[1:]]
532
+ delta_in1_t = tf.tile(xin1 - rin1, dup_in1)
533
+ out_sum = tf.reduce_sum(grads[0] * tf.tile(xout - rout, dup_out), list(range(len(indices.shape), len(grads[0].shape))))
534
+ if op.type == "ResourceGather":
535
+ return [None, tf.where(
536
+ tf.abs(delta_in1_t) < 1e-6,
537
+ tf.zeros_like(delta_in1_t),
538
+ out_sum / delta_in1_t
539
+ )]
540
+ return [None, tf.where(
541
+ tf.abs(delta_in1_t) < 1e-6,
542
+ tf.zeros_like(delta_in1_t),
543
+ out_sum / delta_in1_t
544
+ ), None]
545
+ elif var[0] and not var[1]:
546
+ if op.type.startswith("shap_"):
547
+ op.type = op.type[5:]
548
+ return [explainer.orig_grads[op.type](op, grads[0]), None] # linear in this case
549
+ else:
550
+ raise ValueError("Axis not yet supported to be varying for gather op!")
551
+
552
+
553
+ def linearity_1d_nonlinearity_2d(input_ind0, input_ind1, op_func):
554
+ def handler(explainer, op, *grads):
555
+ var = explainer._variable_inputs(op)
556
+ if var[input_ind0] and not var[input_ind1]:
557
+ return linearity_1d_handler(input_ind0, explainer, op, *grads)
558
+ elif var[input_ind1] and not var[input_ind0]:
559
+ return linearity_1d_handler(input_ind1, explainer, op, *grads)
560
+ elif var[input_ind0] and var[input_ind1]:
561
+ return nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op, *grads)
562
+ else:
563
+ return [None for _ in op.inputs] # no inputs vary, we must be hidden by a switch function
564
+ return handler
565
+
566
+ def nonlinearity_1d_nonlinearity_2d(input_ind0, input_ind1, op_func):
567
+ def handler(explainer, op, *grads):
568
+ var = explainer._variable_inputs(op)
569
+ if var[input_ind0] and not var[input_ind1]:
570
+ return nonlinearity_1d_handler(input_ind0, explainer, op, *grads)
571
+ elif var[input_ind1] and not var[input_ind0]:
572
+ return nonlinearity_1d_handler(input_ind1, explainer, op, *grads)
573
+ elif var[input_ind0] and var[input_ind1]:
574
+ return nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op, *grads)
575
+ else:
576
+ return [None for _ in op.inputs] # no inputs vary, we must be hidden by a switch function
577
+ return handler
578
+
579
+ def nonlinearity_1d(input_ind):
580
+ def handler(explainer, op, *grads):
581
+ return nonlinearity_1d_handler(input_ind, explainer, op, *grads)
582
+ return handler
583
+
584
+ def nonlinearity_1d_handler(input_ind, explainer, op, *grads):
585
+ # make sure only the given input varies
586
+ op_inputs = op.inputs
587
+ if op_inputs is None:
588
+ op_inputs = op.outputs[0].op.inputs
589
+
590
+ for i in range(len(op_inputs)):
591
+ if i != input_ind:
592
+ assert not explainer._variable_inputs(op)[i], str(i) + "th input to " + op.name + " cannot vary!"
593
+
594
+ xin0, rin0 = tf.split(op_inputs[input_ind], 2)
595
+ xout, rout = tf.split(op.outputs[input_ind], 2)
596
+ delta_in0 = xin0 - rin0
597
+ if delta_in0.shape is None:
598
+ dup0 = [2, 1]
599
+ else:
600
+ dup0 = [2] + [1 for i in delta_in0.shape[1:]]
601
+ out = [None for _ in op_inputs]
602
+ if op.type.startswith("shap_"):
603
+ op.type = op.type[5:]
604
+ orig_grad = explainer.orig_grads[op.type](op, grads[0])
605
+ out[input_ind] = tf.where(
606
+ tf.tile(tf.abs(delta_in0), dup0) < 1e-6,
607
+ orig_grad[input_ind] if len(op_inputs) > 1 else orig_grad,
608
+ grads[0] * tf.tile((xout - rout) / delta_in0, dup0)
609
+ )
610
+ return out
611
+
612
+ def nonlinearity_2d_handler(input_ind0, input_ind1, op_func, explainer, op, *grads):
613
+ if not (input_ind0 == 0 and input_ind1 == 1):
614
+ emsg = "TODO: Can't yet handle double inputs that are not first!"
615
+ raise Exception(emsg)
616
+ xout,rout = tf.split(op.outputs[0], 2)
617
+ in0 = op.inputs[input_ind0]
618
+ in1 = op.inputs[input_ind1]
619
+ xin0,rin0 = tf.split(in0, 2)
620
+ xin1,rin1 = tf.split(in1, 2)
621
+ delta_in0 = xin0 - rin0
622
+ delta_in1 = xin1 - rin1
623
+ dup0 = [2] + [1 for i in delta_in0.shape[1:]]
624
+ out10 = op_func(xin0, rin1)
625
+ out01 = op_func(rin0, xin1)
626
+ out11,out00 = xout,rout
627
+ out0 = 0.5 * (out11 - out01 + out10 - out00)
628
+ out0 = grads[0] * tf.tile(out0 / delta_in0, dup0)
629
+ out1 = 0.5 * (out11 - out10 + out01 - out00)
630
+ out1 = grads[0] * tf.tile(out1 / delta_in1, dup0)
631
+
632
+ # Avoid divide by zero nans
633
+ out0 = tf.where(tf.abs(tf.tile(delta_in0, dup0)) < 1e-7, tf.zeros_like(out0), out0)
634
+ out1 = tf.where(tf.abs(tf.tile(delta_in1, dup0)) < 1e-7, tf.zeros_like(out1), out1)
635
+
636
+ # see if due to broadcasting our gradient shapes don't match our input shapes
637
+ if (np.any(np.array(out1.shape) != np.array(in1.shape))):
638
+ broadcast_index = np.where(np.array(out1.shape) != np.array(in1.shape))[0][0]
639
+ out1 = tf.reduce_sum(out1, axis=broadcast_index, keepdims=True)
640
+ elif (np.any(np.array(out0.shape) != np.array(in0.shape))):
641
+ broadcast_index = np.where(np.array(out0.shape) != np.array(in0.shape))[0][0]
642
+ out0 = tf.reduce_sum(out0, axis=broadcast_index, keepdims=True)
643
+
644
+ return [out0, out1]
645
+
646
+ def linearity_1d(input_ind):
647
+ def handler(explainer, op, *grads):
648
+ return linearity_1d_handler(input_ind, explainer, op, *grads)
649
+ return handler
650
+
651
+ def linearity_1d_handler(input_ind, explainer, op, *grads):
652
+ # make sure only the given input varies (negative means only that input cannot vary, and is measured from the end of the list)
653
+ for i in range(len(op.inputs)):
654
+ if i != input_ind:
655
+ assert not explainer._variable_inputs(op)[i], str(i) + "th input to " + op.name + " cannot vary!"
656
+ if op.type.startswith("shap_"):
657
+ op.type = op.type[5:]
658
+ return explainer.orig_grads[op.type](op, *grads)
659
+
660
+ def linearity_with_excluded(input_inds):
661
+ def handler(explainer, op, *grads):
662
+ return linearity_with_excluded_handler(input_inds, explainer, op, *grads)
663
+ return handler
664
+
665
+ def linearity_with_excluded_handler(input_inds, explainer, op, *grads):
666
+ # make sure the given inputs don't vary (negative is measured from the end of the list)
667
+ for i in range(len(op.inputs)):
668
+ if i in input_inds or i - len(op.inputs) in input_inds:
669
+ assert not explainer._variable_inputs(op)[i], str(i) + "th input to " + op.name + " cannot vary!"
670
+ if op.type.startswith("shap_"):
671
+ op.type = op.type[5:]
672
+ return explainer.orig_grads[op.type](op, *grads)
673
+
674
+ def passthrough(explainer, op, *grads):
675
+ if op.type.startswith("shap_"):
676
+ op.type = op.type[5:]
677
+ return explainer.orig_grads[op.type](op, *grads)
678
+
679
+ def break_dependence(explainer, op, *grads):
680
+ """ This function name is used to break attribution dependence in the graph traversal.
681
+
682
+ These operation types may be connected above input data values in the graph but their outputs
683
+ don't depend on the input values (for example they just depend on the shape).
684
+ """
685
+ return [None for _ in op.inputs]
686
+
687
+
688
+ op_handlers = {}
689
+
690
+ # ops that are always linear
691
+ op_handlers["Identity"] = passthrough
692
+ op_handlers["StridedSlice"] = passthrough
693
+ op_handlers["Squeeze"] = passthrough
694
+ op_handlers["ExpandDims"] = passthrough
695
+ op_handlers["Pack"] = passthrough
696
+ op_handlers["BiasAdd"] = passthrough
697
+ op_handlers["Unpack"] = passthrough
698
+ op_handlers["Add"] = passthrough
699
+ op_handlers["Sub"] = passthrough
700
+ op_handlers["Merge"] = passthrough
701
+ op_handlers["Sum"] = passthrough
702
+ op_handlers["Mean"] = passthrough
703
+ op_handlers["Cast"] = passthrough
704
+ op_handlers["Transpose"] = passthrough
705
+ op_handlers["Enter"] = passthrough
706
+ op_handlers["Exit"] = passthrough
707
+ op_handlers["NextIteration"] = passthrough
708
+ op_handlers["Tile"] = passthrough
709
+ op_handlers["TensorArrayScatterV3"] = passthrough
710
+ op_handlers["TensorArrayReadV3"] = passthrough
711
+ op_handlers["TensorArrayWriteV3"] = passthrough
712
+
713
+
714
+ # ops that don't pass any attributions to their inputs
715
+ op_handlers["Shape"] = break_dependence
716
+ op_handlers["RandomUniform"] = break_dependence
717
+ op_handlers["ZerosLike"] = break_dependence
718
+ #op_handlers["StopGradient"] = break_dependence # this allows us to stop attributions when we want to (like softmax re-centering)
719
+
720
+ # ops that are linear and only allow a single input to vary
721
+ op_handlers["Reshape"] = linearity_1d(0)
722
+ op_handlers["Pad"] = linearity_1d(0)
723
+ op_handlers["ReverseV2"] = linearity_1d(0)
724
+ op_handlers["ConcatV2"] = linearity_with_excluded([-1])
725
+ op_handlers["Conv2D"] = linearity_1d(0)
726
+ op_handlers["Switch"] = linearity_1d(0)
727
+ op_handlers["AvgPool"] = linearity_1d(0)
728
+ op_handlers["FusedBatchNorm"] = linearity_1d(0)
729
+
730
+ # ops that are nonlinear and only allow a single input to vary
731
+ op_handlers["Relu"] = nonlinearity_1d(0)
732
+ op_handlers["Elu"] = nonlinearity_1d(0)
733
+ op_handlers["Sigmoid"] = nonlinearity_1d(0)
734
+ op_handlers["Tanh"] = nonlinearity_1d(0)
735
+ op_handlers["Softplus"] = nonlinearity_1d(0)
736
+ op_handlers["Exp"] = nonlinearity_1d(0)
737
+ op_handlers["ClipByValue"] = nonlinearity_1d(0)
738
+ op_handlers["Rsqrt"] = nonlinearity_1d(0)
739
+ op_handlers["Square"] = nonlinearity_1d(0)
740
+ op_handlers["Max"] = nonlinearity_1d(0)
741
+
742
+ # ops that are nonlinear and allow two inputs to vary
743
+ op_handlers["SquaredDifference"] = nonlinearity_1d_nonlinearity_2d(0, 1, lambda x, y: (x - y) * (x - y))
744
+ op_handlers["Minimum"] = nonlinearity_1d_nonlinearity_2d(0, 1, lambda x, y: tf.minimum(x, y))
745
+ op_handlers["Maximum"] = nonlinearity_1d_nonlinearity_2d(0, 1, lambda x, y: tf.maximum(x, y))
746
+
747
+ # ops that allow up to two inputs to vary are are linear when only one input varies
748
+ op_handlers["Mul"] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: x * y)
749
+ op_handlers["RealDiv"] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: x / y)
750
+ op_handlers["MatMul"] = linearity_1d_nonlinearity_2d(0, 1, lambda x, y: tf.matmul(x, y))
751
+
752
+ # ops that need their own custom attribution functions
753
+ op_handlers["GatherV2"] = gather
754
+ op_handlers["ResourceGather"] = gather
755
+ op_handlers["MaxPool"] = maxpool
756
+ op_handlers["Softmax"] = softmax
757
+
758
+
759
+ # TODO items
760
+ # TensorArrayGatherV3
761
+ # Max
762
+ # TensorArraySizeV3
763
+ # Range
lib/shap/explainers/_deep/deep_utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
+ def _check_additivity(explainer, model_output_values, output_phis):
5
+ TOLERANCE = 1e-2
6
+
7
+ assert len(explainer.expected_value) == model_output_values.shape[1], "Length of expected values and model outputs does not match."
8
+
9
+ for t in range(len(explainer.expected_value)):
10
+ if not explainer.multi_input:
11
+ diffs = model_output_values[:, t] - explainer.expected_value[t] - output_phis[t].sum(axis=tuple(range(1, output_phis[t].ndim)))
12
+ else:
13
+ diffs = model_output_values[:, t] - explainer.expected_value[t]
14
+
15
+ for i in range(len(output_phis[t])):
16
+ diffs -= output_phis[t][i].sum(axis=tuple(range(1, output_phis[t][i].ndim)))
17
+
18
+ maxdiff = np.abs(diffs).max()
19
+
20
+ assert maxdiff < TOLERANCE, "The SHAP explanations do not sum up to the model's output! This is either because of a " \
21
+ "rounding error or because an operator in your computation graph was not fully supported. If " \
22
+ "the sum difference of %f is significant compared to the scale of your model outputs, please post " \
23
+ f"as a github issue, with a reproducible example so we can debug it. Used framework: {explainer.framework} - Max. diff: {maxdiff} - Tolerance: {TOLERANCE}"
lib/shap/explainers/_exact.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ import numpy as np
4
+ from numba import njit
5
+
6
+ from .. import links
7
+ from ..models import Model
8
+ from ..utils import (
9
+ MaskedModel,
10
+ delta_minimization_order,
11
+ make_masks,
12
+ shapley_coefficients,
13
+ )
14
+ from ._explainer import Explainer
15
+
16
+ log = logging.getLogger('shap')
17
+
18
+
19
+ class ExactExplainer(Explainer):
20
+ """ Computes SHAP values via an optimized exact enumeration.
21
+
22
+ This works well for standard Shapley value maskers for models with less than ~15 features that vary
23
+ from the background per sample. It also works well for Owen values from hclustering structured
24
+ maskers when there are less than ~100 features that vary from the background per sample. This
25
+ explainer minimizes the number of function evaluations needed by ordering the masking sets to
26
+ minimize sequential differences. This is done using gray codes for standard Shapley values
27
+ and a greedy sorting method for hclustering structured maskers.
28
+ """
29
+
30
+ def __init__(self, model, masker, link=links.identity, linearize_link=True, feature_names=None):
31
+ """ Build an explainers.Exact object for the given model using the given masker object.
32
+
33
+ Parameters
34
+ ----------
35
+ model : function
36
+ A callable python object that executes the model given a set of input data samples.
37
+
38
+ masker : function or numpy.array or pandas.DataFrame
39
+ A callable python object used to "mask" out hidden features of the form `masker(mask, *fargs)`.
40
+ It takes a single a binary mask and an input sample and returns a matrix of masked samples. These
41
+ masked samples are evaluated using the model function and the outputs are then averaged.
42
+ As a shortcut for the standard masking used by SHAP you can pass a background data matrix
43
+ instead of a function and that matrix will be used for masking. To use a clustering
44
+ game structure you can pass a shap.maskers.TabularPartitions(data) object.
45
+
46
+ link : function
47
+ The link function used to map between the output units of the model and the SHAP value units. By
48
+ default it is shap.links.identity, but shap.links.logit can be useful so that expectations are
49
+ computed in probability units while explanations remain in the (more naturally additive) log-odds
50
+ units. For more details on how link functions work see any overview of link functions for generalized
51
+ linear models.
52
+
53
+ linearize_link : bool
54
+ If we use a non-linear link function to take expectations then models that are additive with respect to that
55
+ link function for a single background sample will no longer be additive when using a background masker with
56
+ many samples. This for example means that a linear logistic regression model would have interaction effects
57
+ that arise from the non-linear changes in expectation averaging. To retain the additively of the model with
58
+ still respecting the link function we linearize the link function by default.
59
+ """ # TODO link to the link linearization paper when done
60
+ super().__init__(model, masker, link=link, linearize_link=linearize_link, feature_names=feature_names)
61
+
62
+ self.model = Model(model)
63
+
64
+ if getattr(masker, "clustering", None) is not None:
65
+ self._partition_masks, self._partition_masks_inds = partition_masks(masker.clustering)
66
+ self._partition_delta_indexes = partition_delta_indexes(masker.clustering, self._partition_masks)
67
+
68
+ self._gray_code_cache = {} # used to avoid regenerating the same gray code patterns
69
+
70
+ def __call__(self, *args, max_evals=100000, main_effects=False, error_bounds=False, batch_size="auto", interactions=1, silent=False):
71
+ """ Explains the output of model(*args), where args represents one or more parallel iterators.
72
+ """
73
+
74
+ # we entirely rely on the general call implementation, we override just to remove **kwargs
75
+ # from the function signature
76
+ return super().__call__(
77
+ *args, max_evals=max_evals, main_effects=main_effects, error_bounds=error_bounds,
78
+ batch_size=batch_size, interactions=interactions, silent=silent
79
+ )
80
+
81
+ def _cached_gray_codes(self, n):
82
+ if n not in self._gray_code_cache:
83
+ self._gray_code_cache[n] = gray_code_indexes(n)
84
+ return self._gray_code_cache[n]
85
+
86
    def explain_row(self, *row_args, max_evals, main_effects, error_bounds, batch_size, outputs, interactions, silent):
        """ Explains a single row and returns the tuple (row_values, row_expected_values, row_mask_shapes).

        Despite the summary above, the method actually returns a dict with keys
        "values", "expected_values", "mask_shapes", "main_effects", and "clustering".
        Two regimes are used: exhaustive gray-code enumeration when the masker has no
        clustering, and a partition-tree constrained enumeration otherwise.
        """

        # build a masked version of the model for the current input sample
        fm = MaskedModel(self.model, self.masker, self.link, self.linearize_link, *row_args)

        # do the standard Shapley values
        inds = None
        if getattr(self.masker, "clustering", None) is None:

            # see which elements we actually need to perturb
            inds = fm.varying_inputs()

            # make sure we have enough evals
            if max_evals is not None and max_evals != "auto" and max_evals < 2**len(inds):
                raise ValueError(
                    f"It takes {2**len(inds)} masked evaluations to run the Exact explainer on this instance, but max_evals={max_evals}!"
                )

            # generate the masks in gray code order (so that we change the inputs as little
            # as possible while we iterate to minimize the need to re-eval when the inputs
            # don't vary from the background)
            delta_indexes = self._cached_gray_codes(len(inds))

            # map to a larger mask that includes the invariant entries
            # (delta_indexes is expressed over the varying inputs only; translate each
            # flip position back to the full feature index space, keeping noop codes as-is)
            extended_delta_indexes = np.zeros(2**len(inds), dtype=int)
            for i in range(2**len(inds)):
                if delta_indexes[i] == MaskedModel.delta_mask_noop_value:
                    extended_delta_indexes[i] = delta_indexes[i]
                else:
                    extended_delta_indexes[i] = inds[delta_indexes[i]]

            # run the model
            # note this rebinds the `outputs` parameter with the per-mask model outputs
            outputs = fm(extended_delta_indexes, zero_index=0, batch_size=batch_size)

            # Shapley values
            # Care: Need to distinguish between `True` and `1`
            # NOTE(review): interactions values other than False/True/1/2/>2 (e.g. 0)
            # fall through every branch and leave row_values undefined -- confirm callers
            # can only pass the documented values.
            if interactions is False or (interactions == 1 and interactions is not True):

                # loop over all the outputs to update the rows
                coeff = shapley_coefficients(len(inds))
                row_values = np.zeros((len(fm),) + outputs.shape[1:])
                mask = np.zeros(len(fm), dtype=bool)
                _compute_grey_code_row_values(row_values, mask, inds, outputs, coeff, extended_delta_indexes, MaskedModel.delta_mask_noop_value)

            # Shapley-Taylor interaction values
            elif interactions is True or interactions == 2:

                # loop over all the outputs to update the rows
                coeff = shapley_coefficients(len(inds))
                row_values = np.zeros((len(fm), len(fm)) + outputs.shape[1:])
                mask = np.zeros(len(fm), dtype=bool)
                _compute_grey_code_row_values_st(row_values, mask, inds, outputs, coeff, extended_delta_indexes, MaskedModel.delta_mask_noop_value)

            elif interactions > 2:
                raise NotImplementedError("Currently the Exact explainer does not support interactions higher than order 2!")

        # do a partition tree constrained version of Shapley values
        else:

            # make sure we have enough evals
            if max_evals is not None and max_evals != "auto" and max_evals < len(fm)**2:
                raise ValueError(
                    f"It takes {len(fm)**2} masked evaluations to run the Exact explainer on this instance, but max_evals={max_evals}!"
                )

            # generate the masks in a hclust order (so that we change the inputs as little
            # as possible while we iterate to minimize the need to re-eval when the inputs
            # don't vary from the background)
            delta_indexes = self._partition_delta_indexes

            # run the model
            outputs = fm(delta_indexes, batch_size=batch_size)

            # loop over each output feature
            # each feature's value is the mean on/off output difference over the
            # coalitions recorded for it in _partition_masks_inds
            row_values = np.zeros((len(fm),) + outputs.shape[1:])
            for i in range(len(fm)):
                on_outputs = outputs[self._partition_masks_inds[i][1]]
                off_outputs = outputs[self._partition_masks_inds[i][0]]
                row_values[i] = (on_outputs - off_outputs).mean(0)

        # compute the main effects if we need to
        # (always needed for order-2 interactions, where they fill the diagonal)
        main_effect_values = None
        if main_effects or interactions is True or interactions == 2:
            if inds is None:
                inds = np.arange(len(fm))
            main_effect_values = fm.main_effects(inds)
            if interactions is True or interactions == 2:
                for i in range(len(fm)):
                    row_values[i, i] = main_effect_values[i]

        return {
            "values": row_values,
            # outputs[0] corresponds to the first (all-off) mask, i.e. the base value
            "expected_values": outputs[0],
            "mask_shapes": fm.mask_shapes,
            "main_effects": main_effect_values if main_effects else None,
            "clustering": getattr(self.masker, "clustering", None)
        }
185
+
186
@njit
def _compute_grey_code_row_values(row_values, mask, inds, outputs, shapley_coeff, extended_delta_indexes, noop_code):
    """ Accumulate Shapley values in-place from outputs evaluated in gray code mask order.

    Because the 2**M coalitions are visited in gray code order, at most one bit
    (extended_delta_indexes[i]) changes per step, so the current coalition `mask`
    and its size `set_size` are maintained incrementally. Compiled with numba @njit,
    hence the loop-based style.
    """
    set_size = 0
    M = len(inds)
    for i in range(2**M):

        # update the mask
        delta_ind = extended_delta_indexes[i]
        if delta_ind != noop_code:
            mask[delta_ind] = ~mask[delta_ind]
            if mask[delta_ind]:
                set_size += 1
            else:
                set_size -= 1

        # update the output row values
        # when set_size == 0 this wraps to shapley_coeff[-1], but the "on" branch
        # below is never taken in that case, so the value is unused
        on_coeff = shapley_coeff[set_size-1]
        if set_size < M:
            # when set_size == M every feature is on and off_coeff is never read,
            # so the stale value from the previous iteration is harmless
            off_coeff = shapley_coeff[set_size]
        out = outputs[i]
        for j in inds:
            if mask[j]:
                row_values[j] += out * on_coeff
            else:
                row_values[j] -= out * off_coeff
211
+
212
@njit
def _compute_grey_code_row_values_st(row_values, mask, inds, outputs, shapley_coeff, extended_delta_indexes, noop_code):
    """ Accumulate Shapley-Taylor (order 2) interaction values in-place, gray code order.

    Same incremental mask/set-size bookkeeping as _compute_grey_code_row_values, but
    each coalition's output is distributed over every pair (j, k), symmetrically into
    row_values[j, k] and row_values[k, j]. The diagonal is filled later from main
    effects by the caller. Compiled with numba @njit.
    """
    set_size = 0
    M = len(inds)
    for i in range(2**M):

        # update the mask
        delta_ind = extended_delta_indexes[i]
        if delta_ind != noop_code:
            mask[delta_ind] = ~mask[delta_ind]
            if mask[delta_ind]:
                set_size += 1
            else:
                set_size -= 1

        # distribute the effect of this mask set over all the terms it impacts
        out = outputs[i]
        for j in range(M):
            for k in range(j+1, M):
                # the coefficient index depends on how many of (j, k) are in the coalition
                if not mask[j] and not mask[k]:
                    delta = out * shapley_coeff[set_size] # * 2
                elif (not mask[j] and mask[k]) or (mask[j] and not mask[k]):
                    delta = -out * shapley_coeff[set_size - 1] # * 2
                else: # both true
                    delta = out * shapley_coeff[set_size - 2] # * 2
                row_values[j,k] += delta
                row_values[k,j] += delta
239
+
240
def partition_delta_indexes(partition_tree, all_masks):
    """ Return a delta index encoded array of all the masks possible while following the given partition tree.

    Each consecutive pair of masks is encoded as the indexes that flip between them:
    all but the last flip are stored as negative "more flips coming" markers (-j - 1),
    the final flip as the plain index, and an unchanged mask as the noop sentinel.
    """
    current = np.zeros(all_masks.shape[1], dtype=bool)
    encoded = []
    for row in all_masks:
        flipped = np.where(current ^ row)[0]
        if len(flipped) == 0:
            # nothing changed relative to the previous mask
            encoded.append(MaskedModel.delta_mask_noop_value)
        else:
            # negative markers signal that more flips follow for this mask
            encoded.extend(-j - 1 for j in flipped[:-1])
            encoded.append(flipped[-1])
        current = row

    return np.array(encoded)
259
+
260
def partition_masks(partition_tree):
    """ Return an array of all the masks possible while following the given partition tree.

    Returns
    -------
    partition_masks : bool array, shape (num_masks, M)
        All on/off coalitions allowed by the clustering, reordered so that
        consecutive masks differ in as few features as possible.
    partition_masks_inds : list of [off_inds, on_inds] array pairs, one per feature
        For each feature, the (reordered) mask row indexes used as its off/on
        evaluation sets. Pairs have different lengths, so this is a ragged structure.
    """

    M = partition_tree.shape[0] + 1
    mask_matrix = make_masks(partition_tree)
    all_masks = []
    m00 = np.zeros(M, dtype=bool)
    # seed with the all-off and all-on coalitions, then let the recursion fill the rest
    all_masks.append(m00)
    all_masks.append(~m00)
    #inds_stack = [0,1]
    inds_lists = [[[], []] for i in range(M)]
    _partition_masks_recurse(len(partition_tree)-1, m00, 0, 1, inds_lists, mask_matrix, partition_tree, M, all_masks)

    all_masks = np.array(all_masks)

    # we resort the clustering matrix to minimize the sequential difference between the masks
    # this minimizes the number of model evaluations we need to run when the background sometimes
    # matches the foreground. We seem to average about 1.5 feature changes per mask with this
    # approach. This is not as clean as the grey code ordering, but a perfect 1 feature change
    # ordering is not possible with a clustering tree
    order = delta_minimization_order(all_masks)
    inverse_order = np.arange(len(order))[np.argsort(order)]

    # remap every recorded mask index into the reordered mask numbering
    for inds_list0,inds_list1 in inds_lists:
        for i in range(len(inds_list0)):
            inds_list0[i] = inverse_order[inds_list0[i]]
            inds_list1[i] = inverse_order[inds_list1[i]]

    # Care: inds_lists have different lengths, so partition_masks_inds is a "ragged" array. See GH #3063
    partition_masks = all_masks[order]
    partition_masks_inds = [[np.array(on), np.array(off)] for on, off in inds_lists]
    return partition_masks, partition_masks_inds
293
+
294
# TODO: this should be a jit function... which would require preallocating the inds_lists (sizes are 2**depth of that ind)
# TODO: we could also probable avoid making the masks at all and just record the deltas if we want...
def _partition_masks_recurse(index, m00, ind00, ind11, inds_lists, mask_matrix, partition_tree, M, all_masks):
    """ Recursively enumerate the coalitions allowed by the partition tree.

    `index` is a partition-tree node (negative values are leaves, offset by M);
    `m00` is the mask with this node's whole subtree off, and `ind00`/`ind11` are the
    positions in `all_masks` of that mask and of its all-on counterpart. At each leaf
    the (off, on) mask index pair is recorded into inds_lists. New masks created on
    the way down are appended to `all_masks`.
    """
    # leaf node: record which mask pairs evaluate this feature off vs. on
    if index < 0:
        inds_lists[index + M][0].append(ind00)
        inds_lists[index + M][1].append(ind11)
        return

    # get our children indexes
    left_index = int(partition_tree[index,0] - M)
    right_index = int(partition_tree[index,1] - M)

    # build more refined masks
    m10 = m00.copy() # we separate the copy from the add so as to not get converted to a matrix
    m10[:] += mask_matrix[left_index+M, :]
    m01 = m00.copy()
    m01[:] += mask_matrix[right_index+M, :]

    # record the new masks we made
    ind01 = len(all_masks)
    all_masks.append(m01)
    ind10 = len(all_masks)
    all_masks.append(m10)

    # inds_stack.append(len(all_masks) - 2)
    # inds_stack.append(len(all_masks) - 1)

    # recurse left and right with both 1 (True) and 0 (False) contexts
    _partition_masks_recurse(left_index, m00, ind00, ind10, inds_lists, mask_matrix, partition_tree, M, all_masks)
    _partition_masks_recurse(right_index, m10, ind10, ind11, inds_lists, mask_matrix, partition_tree, M, all_masks)
    _partition_masks_recurse(left_index, m01, ind01, ind11, inds_lists, mask_matrix, partition_tree, M, all_masks)
    _partition_masks_recurse(right_index, m00, ind00, ind01, inds_lists, mask_matrix, partition_tree, M, all_masks)
326
+
327
+
328
def gray_code_masks(nbits):
    """ Produces an array of all binary patterns of size nbits in gray code order.

    Row k holds the standard reflected binary gray code of k (g = k ^ (k >> 1)),
    most significant bit in column 0, so consecutive rows differ in exactly one bit.

    This is based on code from: http://code.activestate.com/recipes/576592-gray-code-generatoriterator/
    """
    codes = np.arange(2**nbits)
    gray = codes ^ (codes >> 1)
    # unpack each gray code into its nbits binary digits, MSB first
    shifts = np.arange(nbits - 1, -1, -1)
    return ((gray[:, None] >> shifts) & 1).astype(bool)
347
+
348
def gray_code_indexes(nbits):
    """ Produces an array of which bits flip at which position.

    We assume the masks start at all zero and the noop sentinel means don't do a flip.
    This is a more efficient representation of the gray_code_masks version.
    """
    flips = np.full(2**nbits, MaskedModel.delta_mask_noop_value, dtype=int)
    state = np.zeros(nbits, dtype=bool)
    for step in range((1 << nbits) - 1):
        if step % 2 == 0:
            # even steps always toggle the last (least significant) bit
            state[-1] ^= True
            flips[step + 1] = nbits - 1
        else:
            # odd steps toggle the bit just left of the rightmost set bit
            for i in range(-1, -nbits, -1):
                if state[i]:
                    state[i - 1] ^= True
                    flips[step + 1] = nbits + (i - 1)
                    break
    return flips
lib/shap/explainers/_explainer.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import time
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import scipy.sparse
7
+
8
+ from .. import explainers, links, maskers, models
9
+ from .._explanation import Explanation
10
+ from .._serializable import Deserializer, Serializable, Serializer
11
+ from ..maskers import Masker
12
+ from ..models import Model
13
+ from ..utils import safe_isinstance, show_progress
14
+ from ..utils._exceptions import InvalidAlgorithmError
15
+ from ..utils.transformers import is_transformers_lm
16
+
17
+
18
class Explainer(Serializable):
    """ Uses Shapley values to explain any machine learning model or python function.

    This is the primary explainer interface for the SHAP library. It takes any combination
    of a model and masker and returns a callable subclass object that implements
    the particular estimation algorithm that was chosen.
    """

    def __init__(self, model, masker=None, link=links.identity, algorithm="auto", output_names=None, feature_names=None, linearize_link=True,
                 seed=None, **kwargs):
        """ Build a new explainer for the passed model.

        Parameters
        ----------
        model : object or function
            User supplied function or model object that takes a dataset of samples and
            computes the output of the model for those samples.

        masker : function, numpy.array, pandas.DataFrame, tokenizer, None, or a list of these for each model input
            The function used to "mask" out hidden features of the form `masked_args = masker(*model_args, mask=mask)`.
            It takes input in the same form as the model, but for just a single sample with a binary
            mask, then returns an iterable of masked samples. These
            masked samples will then be evaluated using the model function and the outputs averaged.
            As a shortcut for the standard masking using by SHAP you can pass a background data matrix
            instead of a function and that matrix will be used for masking. Domain specific masking
            functions are available in shap such as shap.ImageMasker for images and shap.TokenMasker
            for text. In addition to determining how to replace hidden features, the masker can also
            constrain the rules of the cooperative game used to explain the model. For example
            shap.TabularMasker(data, hclustering="correlation") will enforce a hierarchical clustering
            of coalitions for the game (in this special case the attributions are known as the Owen values).

        link : function
            The link function used to map between the output units of the model and the SHAP value units. By
            default it is shap.links.identity, but shap.links.logit can be useful so that expectations are
            computed in probability units while explanations remain in the (more naturally additive) log-odds
            units. For more details on how link functions work see any overview of link functions for generalized
            linear models.

        algorithm : "auto", "permutation", "partition", "tree", or "linear"
            The algorithm used to estimate the Shapley values. There are many different algorithms that
            can be used to estimate the Shapley values (and the related value for constrained games), each
            of these algorithms have various tradeoffs and are preferable in different situations. By
            default the "auto" options attempts to make the best choice given the passed model and masker,
            but this choice can always be overridden by passing the name of a specific algorithm. The type of
            algorithm used will determine what type of subclass object is returned by this constructor, and
            you can also build those subclasses directly if you prefer or need more fine grained control over
            their options.

        output_names : None or list of strings
            The names of the model outputs. For example if the model is an image classifier, then output_names would
            be the names of all the output classes. This parameter is optional. When output_names is None then
            the Explanation objects produced by this explainer will not have any output_names, which could effect
            downstream plots.

        seed: None or int
            seed for reproducibility

        """

        self.model = model
        self.output_names = output_names
        self.feature_names = feature_names

        # wrap the incoming masker object as a shap.Masker object
        if (
            isinstance(masker, pd.DataFrame)
            or ((isinstance(masker, np.ndarray) or scipy.sparse.issparse(masker)) and len(masker.shape) == 2)
        ):
            # raw 2D background data: pick the masker matching the requested algorithm
            if algorithm == "partition":
                self.masker = maskers.Partition(masker)
            else:
                self.masker = maskers.Independent(masker)
        elif safe_isinstance(masker, ["transformers.PreTrainedTokenizer", "transformers.tokenization_utils_base.PreTrainedTokenizerBase"]):
            if is_transformers_lm(self.model):
                # auto assign text infilling if model is a transformer model with lm head
                self.masker = maskers.Text(masker, mask_token="...", collapse_mask_token=True)
            else:
                self.masker = maskers.Text(masker)
        # NOTE(review): `masker is list` / `masker is tuple` / `masker is dict` compare against
        # the *type objects* themselves and are always False for actual instances; upstream shap
        # tests `issubclass(type(masker), (list, tuple))` here -- confirm whether these branches
        # are reachable in this vendored copy.
        elif (masker is list or masker is tuple) and masker[0] is not str:
            self.masker = maskers.Composite(*masker)
        elif (masker is dict) and ("mean" in masker):
            self.masker = maskers.Independent(masker)
        elif masker is None and isinstance(self.model, models.TransformersPipeline):
            # re-enter __init__ using the pipeline's tokenizer as the masker
            return self.__init__(
                self.model, self.model.inner_model.tokenizer,
                link=link, algorithm=algorithm, output_names=output_names, feature_names=feature_names, linearize_link=linearize_link, **kwargs
            )
        else:
            self.masker = masker

        # Check for transformer pipeline objects and wrap them
        if safe_isinstance(self.model, "transformers.pipelines.Pipeline"):
            if is_transformers_lm(self.model.model):
                return self.__init__(
                    self.model.model, self.model.tokenizer if self.masker is None else self.masker,
                    link=link, algorithm=algorithm, output_names=output_names, feature_names=feature_names, linearize_link=linearize_link, **kwargs
                )
            else:
                return self.__init__(
                    models.TransformersPipeline(self.model), self.masker,
                    link=link, algorithm=algorithm, output_names=output_names, feature_names=feature_names, linearize_link=linearize_link, **kwargs
                )

        # wrap self.masker and self.model for output text explanation algorithm
        if is_transformers_lm(self.model):
            self.model = models.TeacherForcing(self.model, self.masker.tokenizer)
            self.masker = maskers.OutputComposite(self.masker, self.model.text_generate)
        elif safe_isinstance(self.model, "shap.models.TeacherForcing") and safe_isinstance(self.masker, ["shap.maskers.Text", "shap.maskers.Image"]):
            self.masker = maskers.OutputComposite(self.masker, self.model.text_generate)
        elif safe_isinstance(self.model, "shap.models.TopKLM") and safe_isinstance(self.masker, "shap.maskers.Text"):
            self.masker = maskers.FixedComposite(self.masker)

        #self._brute_force_fallback = explainers.BruteForce(self.model, self.masker)

        # validate and save the link function
        if callable(link):
            self.link = link
        else:
            raise TypeError("The passed link function needs to be callable!")
        self.linearize_link = linearize_link

        # if we are called directly (as opposed to through super()) then we convert ourselves to the subclass
        # that implements the specific algorithm that was chosen
        if self.__class__ is Explainer:

            # do automatic algorithm selection
            #from .. import explainers
            if algorithm == "auto":

                # use implementation-aware methods if possible
                if explainers.LinearExplainer.supports_model_with_masker(model, self.masker):
                    algorithm = "linear"
                elif explainers.TreeExplainer.supports_model_with_masker(model, self.masker): # TODO: check for Partition?
                    algorithm = "tree"
                elif explainers.AdditiveExplainer.supports_model_with_masker(model, self.masker):
                    algorithm = "additive"

                # otherwise use a model agnostic method
                elif callable(self.model):
                    # small feature spaces get the exact explainer, larger ones the sampling-based one
                    if issubclass(type(self.masker), maskers.Independent):
                        if self.masker.shape[1] <= 10:
                            algorithm = "exact"
                        else:
                            algorithm = "permutation"
                    elif issubclass(type(self.masker), maskers.Partition):
                        if self.masker.shape[1] <= 32:
                            algorithm = "exact"
                        else:
                            algorithm = "permutation"
                    elif (getattr(self.masker, "text_data", False) or getattr(self.masker, "image_data", False)) and hasattr(self.masker, "clustering"):
                        algorithm = "partition"
                    else:
                        algorithm = "permutation"

                # if we get here then we don't know how to handle what was given to us
                else:
                    raise TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: " + str(model))

            # build the right subclass
            # (rebinding __class__ converts this generic Explainer instance into the
            # chosen algorithm-specific subclass in place, then runs its __init__)
            if algorithm == "exact":
                self.__class__ = explainers.ExactExplainer
                explainers.ExactExplainer.__init__(self, self.model, self.masker, link=self.link, feature_names=self.feature_names, linearize_link=linearize_link, **kwargs)
            elif algorithm == "permutation":
                self.__class__ = explainers.PermutationExplainer
                explainers.PermutationExplainer.__init__(self, self.model, self.masker, link=self.link, feature_names=self.feature_names, linearize_link=linearize_link, seed=seed, **kwargs)
            elif algorithm == "partition":
                self.__class__ = explainers.PartitionExplainer
                explainers.PartitionExplainer.__init__(self, self.model, self.masker, link=self.link, feature_names=self.feature_names, linearize_link=linearize_link, output_names=self.output_names, **kwargs)
            elif algorithm == "tree":
                self.__class__ = explainers.TreeExplainer
                explainers.TreeExplainer.__init__(self, self.model, self.masker, link=self.link, feature_names=self.feature_names, linearize_link=linearize_link, **kwargs)
            elif algorithm == "additive":
                self.__class__ = explainers.AdditiveExplainer
                explainers.AdditiveExplainer.__init__(self, self.model, self.masker, link=self.link, feature_names=self.feature_names, linearize_link=linearize_link, **kwargs)
            elif algorithm == "linear":
                self.__class__ = explainers.LinearExplainer
                explainers.LinearExplainer.__init__(self, self.model, self.masker, link=self.link, feature_names=self.feature_names, linearize_link=linearize_link, **kwargs)
            elif algorithm == "deep":
                self.__class__ = explainers.DeepExplainer
                explainers.DeepExplainer.__init__(self, self.model, self.masker, link=self.link, feature_names=self.feature_names, linearize_link=linearize_link, **kwargs)
            else:
                raise InvalidAlgorithmError("Unknown algorithm type passed: %s!" % algorithm)


    def __call__(self, *args, max_evals="auto", main_effects=False, error_bounds=False, batch_size="auto",
                 outputs=None, silent=False, **kwargs):
        """ Explains the output of model(*args), where args is a list of parallel iterable datasets.

        Note this default version could be an abstract method that is implemented by each algorithm-specific
        subclass of Explainer. Descriptions of each subclasses' __call__ arguments
        are available in their respective doc-strings.
        """

        # if max_evals == "auto":
        #     self._brute_force_fallback

        start_time = time.time()

        # when explaining (input, target) pairs the second arg carries the target sentences
        if issubclass(type(self.masker), maskers.OutputComposite) and len(args)==2:
            self.masker.model = models.TextGeneration(target_sentences=args[1])
            args = args[:1]
        # parse our incoming arguments
        num_rows = None
        args = list(args)
        if self.feature_names is None:
            feature_names = [None for _ in range(len(args))]
        elif issubclass(type(self.feature_names[0]), (list, tuple)):
            feature_names = copy.deepcopy(self.feature_names)
        else:
            feature_names = [copy.deepcopy(self.feature_names)]
        for i in range(len(args)):

            # try and see if we can get a length from any of the for our progress bar
            if num_rows is None:
                try:
                    num_rows = len(args[i])
                except Exception:
                    pass

            # convert DataFrames to numpy arrays
            if isinstance(args[i], pd.DataFrame):
                feature_names[i] = list(args[i].columns)
                args[i] = args[i].to_numpy()

            # convert nlp Dataset objects to lists
            if safe_isinstance(args[i], "nlp.arrow_dataset.Dataset"):
                args[i] = args[i]["text"]
            elif issubclass(type(args[i]), dict) and "text" in args[i]:
                args[i] = args[i]["text"]

        if batch_size == "auto":
            if hasattr(self.masker, "default_batch_size"):
                batch_size = self.masker.default_batch_size
            else:
                batch_size = 10

        # loop over each sample, filling in the values array
        values = []
        output_indices = []
        expected_values = []
        mask_shapes = []
        # NOTE(review): this list rebinds the boolean `main_effects` parameter, so the value
        # forwarded to explain_row below is the growing results list rather than the flag the
        # caller passed -- confirm this matches the intended upstream behavior.
        main_effects = []
        hierarchical_values = []
        clustering = []
        output_names = []
        error_std = []
        if callable(getattr(self.masker, "feature_names", None)):
            feature_names = [[] for _ in range(len(args))]
        for row_args in show_progress(zip(*args), num_rows, self.__class__.__name__+" explainer", silent):
            row_result = self.explain_row(
                *row_args, max_evals=max_evals, main_effects=main_effects, error_bounds=error_bounds,
                batch_size=batch_size, outputs=outputs, silent=silent, **kwargs
            )
            values.append(row_result.get("values", None))
            output_indices.append(row_result.get("output_indices", None))
            expected_values.append(row_result.get("expected_values", None))
            mask_shapes.append(row_result["mask_shapes"])
            main_effects.append(row_result.get("main_effects", None))
            clustering.append(row_result.get("clustering", None))
            hierarchical_values.append(row_result.get("hierarchical_values", None))
            # output_names may be returned as a callable taking the row args
            tmp = row_result.get("output_names", None)
            output_names.append(tmp(*row_args) if callable(tmp) else tmp)
            error_std.append(row_result.get("error_std", None))
            if callable(getattr(self.masker, "feature_names", None)):
                row_feature_names = self.masker.feature_names(*row_args)
                for i in range(len(row_args)):
                    feature_names[i].append(row_feature_names[i])

        # split the values up according to each input
        # (row values are flattened across inputs; mask_shapes tells us where to cut)
        arg_values = [[] for a in args]
        for i, v in enumerate(values):
            pos = 0
            for j in range(len(args)):
                mask_length = np.prod(mask_shapes[i][j])
                arg_values[j].append(values[i][pos:pos+mask_length])
                pos += mask_length

        # collapse the arrays as possible
        expected_values = pack_values(expected_values)
        main_effects = pack_values(main_effects)
        output_indices = pack_values(output_indices)
        main_effects = pack_values(main_effects)  # NOTE(review): duplicate of the call two lines above
        hierarchical_values = pack_values(hierarchical_values)
        error_std = pack_values(error_std)
        clustering = pack_values(clustering)

        # getting output labels
        ragged_outputs = False
        if output_indices is not None:
            ragged_outputs = not all(len(x) == len(output_indices[0]) for x in output_indices)
        if self.output_names is None:
            if None not in output_names:
                if not ragged_outputs:
                    sliced_labels = np.array(output_names)
                else:
                    sliced_labels = [np.array(output_names[i])[index_list] for i,index_list in enumerate(output_indices)]
            else:
                sliced_labels = None
        else:
            assert output_indices is not None, "You have passed a list for output_names but the model seems to not have multiple outputs!"
            labels = np.array(self.output_names)
            sliced_labels = [labels[index_list] for index_list in output_indices]
            if not ragged_outputs:
                sliced_labels = np.array(sliced_labels)

        # if every row has the same labels, collapse them to a single label vector
        if isinstance(sliced_labels, np.ndarray) and len(sliced_labels.shape) == 2:
            if np.all(sliced_labels[0,:] == sliced_labels):
                sliced_labels = sliced_labels[0]

        # allow the masker to transform the input data to better match the masking pattern
        # (such as breaking text into token segments)
        if hasattr(self.masker, "data_transform"):
            new_args = []
            for row_args in zip(*args):
                new_args.append([pack_values(v) for v in self.masker.data_transform(*row_args)])
            args = list(zip(*new_args))

        # build the explanation objects
        out = []
        for j, data in enumerate(args):

            # reshape the attribution values using the mask_shapes
            tmp = []
            for i, v in enumerate(arg_values[j]):
                if np.prod(mask_shapes[i][j]) != np.prod(v.shape): # see if we have multiple outputs
                    tmp.append(v.reshape(*mask_shapes[i][j], -1))
                else:
                    tmp.append(v.reshape(*mask_shapes[i][j]))
            arg_values[j] = pack_values(tmp)

            if feature_names[j] is None:
                feature_names[j] = ["Feature " + str(i) for i in range(data.shape[1])]


            # build an explanation object for this input argument
            out.append(Explanation(
                arg_values[j], expected_values, data,
                feature_names=feature_names[j], main_effects=main_effects,
                clustering=clustering,
                hierarchical_values=hierarchical_values,
                output_names=sliced_labels, # self.output_names
                error_std=error_std,
                compute_time=time.time() - start_time
                # output_shape=output_shape,
                #lower_bounds=v_min, upper_bounds=v_max
            ))
        # single-input models get a bare Explanation rather than a list
        return out[0] if len(out) == 1 else out

    def explain_row(self, *row_args, max_evals, main_effects, error_bounds, outputs, silent, **kwargs):
        """ Explains a single row and returns the tuple (row_values, row_expected_values, row_mask_shapes, main_effects).

        This is an abstract method meant to be implemented by each subclass.

        Returns
        -------
        tuple
            A tuple of (row_values, row_expected_values, row_mask_shapes), where row_values is an array of the
            attribution values for each sample, row_expected_values is an array (or single value) representing
            the expected value of the model for each sample (which is the same for all samples unless there
            are fixed inputs present, like labels when explaining the loss), and row_mask_shapes is a list
            of all the input shapes (since the row_values is always flattened),
        """

        # abstract-by-convention: the base implementation returns an empty result
        return {}

    @staticmethod
    def supports_model_with_masker(model, masker):
        """ Determines if this explainer can handle the given model.

        This is an abstract static method meant to be implemented by each subclass.
        """
        return False

    @staticmethod
    def _compute_main_effects(fm, expected_value, inds):
        """ A utility method to compute the main effects from a MaskedModel.
        """

        # mask each input on in isolation
        # (delta-encoded: turn one input on, then off again before turning on the next)
        masks = np.zeros(2*len(inds)-1, dtype=int)
        last_ind = -1
        for i in range(len(inds)):
            if i > 0:
                masks[2*i - 1] = -last_ind - 1 # turn off the last input
            masks[2*i] = inds[i] # turn on this input
            last_ind = inds[i]

        # compute the main effects for the given indexes
        main_effects = fm(masks) - expected_value

        # expand the vector to the full input size
        expanded_main_effects = np.zeros(len(fm))
        for i, ind in enumerate(inds):
            expanded_main_effects[ind] = main_effects[i]

        return expanded_main_effects

    def save(self, out_file, model_saver=".save", masker_saver=".save"):
        """ Write the explainer to the given file stream.
        """
        super().save(out_file)
        with Serializer(out_file, "shap.Explainer", version=0) as s:
            s.save("model", self.model, model_saver)
            s.save("masker", self.masker, masker_saver)
            s.save("link", self.link)

    @classmethod
    def load(cls, in_file, model_loader=Model.load, masker_loader=Masker.load, instantiate=True):
        """ Load an Explainer from the given file stream.

        Parameters
        ----------
        in_file : The file stream to load objects from.
        """
        if instantiate:
            return cls._instantiated_load(in_file, model_loader=model_loader, masker_loader=masker_loader)

        kwargs = super().load(in_file, instantiate=False)
        with Deserializer(in_file, "shap.Explainer", min_version=0, max_version=0) as s:
            kwargs["model"] = s.load("model", model_loader)
            kwargs["masker"] = s.load("masker", masker_loader)
            kwargs["link"] = s.load("link")
        return kwargs
441
+
442
def pack_values(values):
    """ Used the clean up arrays before putting them into an Explanation object.

    Returns the input unchanged when it has no length (None, scalars), None when the
    per-row results were not computed (first element is None), a regular numpy array
    when the rows are uniform, and an object-dtype numpy array when they are ragged.
    """

    # pass through anything without a length (None, scalars, etc.)
    if not hasattr(values, "__len__"):
        return values

    # fix: an empty collection used to raise IndexError on the values[0] probe below
    if len(values) == 0:
        return np.array(values)

    # collapse the values if we didn't compute them
    if values[0] is None:
        return None

    # convert to a single numpy matrix when the array is not ragged
    elif np.issubdtype(type(values[0]), np.number) or len(np.unique([len(v) for v in values])) == 1:
        return np.array(values)
    else:
        return np.array(values, dtype=object)
lib/shap/explainers/_gpu_tree.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GPU accelerated tree explanations"""
2
+ import numpy as np
3
+
4
+ from ..utils import assert_import, record_import_error
5
+ from ._tree import TreeExplainer, feature_perturbation_codes, output_transform_codes
6
+
7
+ try:
8
+ from .. import _cext_gpu
9
+ except ImportError as e:
10
+ record_import_error("cext_gpu", "cuda extension was not built during install!", e)
11
+
12
+
13
+ class GPUTreeExplainer(TreeExplainer):
14
+ """
15
+ Experimental GPU accelerated version of TreeExplainer. Currently requires source build with
16
+ cuda available and 'CUDA_PATH' environment variable defined.
17
+
18
+ Parameters
19
+ ----------
20
+ model : model object
21
+ The tree based machine learning model that we want to explain. XGBoost, LightGBM,
22
+ CatBoost, Pyspark and most tree-based scikit-learn models are supported.
23
+
24
+ data : numpy.array or pandas.DataFrame
25
+ The background dataset to use for integrating out features. This argument is optional when
26
+ feature_perturbation="tree_path_dependent", since in that case we can use the number of
27
+ training samples that went down each tree path as our background dataset (this is recorded
28
+ in the model object).
29
+
30
+ feature_perturbation : "interventional" (default) or "tree_path_dependent" (default when data=None)
31
+ Since SHAP values rely on conditional expectations we need to decide how to handle correlated
32
+ (or otherwise dependent) input features. The "interventional" approach breaks the dependencies
33
+ between features according to the rules dictated by casual inference (Janzing et al. 2019). Note
34
+ that the "interventional" option requires a background dataset and its runtime scales linearly
35
+ with the size of the background dataset you use. Anywhere from 100 to 1000 random background samples
36
+ are good sizes to use. The "tree_path_dependent" approach is to just follow the trees and use the
37
+ number of training examples that went down each leaf to represent the background distribution.
38
+ This approach does not require a background dataset and so is used by default when no background
39
+ dataset is provided.
40
+
41
+ model_output : "raw", "probability", "log_loss", or model method name
42
+ What output of the model should be explained. If "raw" then we explain the raw output of the
43
+ trees, which varies by model. For regression models "raw" is the standard output, for binary
44
+ classification in XGBoost this is the log odds ratio. If model_output is the name of a
45
+ supported prediction method on the model object then we explain the output of that model
46
+ method name. For example model_output="predict_proba" explains the result of calling
47
+ model.predict_proba. If "probability" then we explain the output of the model transformed into
48
+ probability space (note that this means the SHAP values now sum to the probability output of the
49
+ model). If "logloss" then we explain the log base e of the model loss function, so that the SHAP
50
+ values sum up to the log loss of the model for each sample. This is helpful for breaking
51
+ down model performance by feature. Currently the probability and logloss options are only
52
+ supported when
53
+ feature_dependence="independent".
54
+
55
+ Examples
56
+ --------
57
+ See `GPUTree explainer examples <https://shap.readthedocs.io/en/latest/api_examples/explainers/GPUTreeExplainer.html>`_
58
+ """
59
+
60
+ def shap_values(self, X, y=None, tree_limit=None, approximate=False, check_additivity=True,
61
+ from_call=False):
62
+ """ Estimate the SHAP values for a set of samples.
63
+
64
+ Parameters
65
+ ----------
66
+ X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)
67
+ A matrix of samples (# samples x # features) on which to explain the model's output.
68
+
69
+ y : numpy.array
70
+ An array of label values for each sample. Used when explaining loss functions.
71
+
72
+ tree_limit : None (default) or int
73
+ Limit the number of trees used by the model. By default None means no use the limit
74
+ of the
75
+ original model, and -1 means no limit.
76
+
77
+ approximate : bool
78
+ Not supported.
79
+
80
+ check_additivity : bool
81
+ Run a validation check that the sum of the SHAP values equals the output of the
82
+ model. This
83
+ check takes only a small amount of time, and will catch potential unforeseen errors.
84
+ Note that this check only runs right now when explaining the margin of the model.
85
+
86
+ Returns
87
+ -------
88
+ array or list
89
+ For models with a single output this returns a matrix of SHAP values
90
+ (# samples x # features). Each row sums to the difference between the model output
91
+ for that
92
+ sample and the expected value of the model output (which is stored in the expected_value
93
+ attribute of the explainer when it is constant). For models with vector outputs this
94
+ returns
95
+ a list of such matrices, one for each output.
96
+ """
97
+ assert not approximate, "approximate not supported"
98
+
99
+ X, y, X_missing, flat_output, tree_limit, check_additivity = \
100
+ self._validate_inputs(X, y,
101
+ tree_limit,
102
+ check_additivity)
103
+ transform = self.model.get_transform()
104
+
105
+ # run the core algorithm using the C extension
106
+ assert_import("cext_gpu")
107
+ phi = np.zeros((X.shape[0], X.shape[1] + 1, self.model.num_outputs))
108
+ _cext_gpu.dense_tree_shap(
109
+ self.model.children_left, self.model.children_right, self.model.children_default,
110
+ self.model.features, self.model.thresholds, self.model.values,
111
+ self.model.node_sample_weight,
112
+ self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit,
113
+ self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation],
114
+ output_transform_codes[transform], False
115
+ )
116
+
117
+ out = self._get_shap_output(phi, flat_output)
118
+ if check_additivity and self.model.model_output == "raw":
119
+ self.assert_additivity(out, self.model.predict(X))
120
+
121
+ return out
122
+
123
+ def shap_interaction_values(self, X, y=None, tree_limit=None):
124
+ """ Estimate the SHAP interaction values for a set of samples.
125
+
126
+ Parameters
127
+ ----------
128
+ X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)
129
+ A matrix of samples (# samples x # features) on which to explain the model's output.
130
+
131
+ y : numpy.array
132
+ An array of label values for each sample. Used when explaining loss functions (not
133
+ yet supported).
134
+
135
+ tree_limit : None (default) or int
136
+ Limit the number of trees used by the model. By default None means no use the limit
137
+ of the
138
+ original model, and -1 means no limit.
139
+
140
+ Returns
141
+ -------
142
+ array or list
143
+ For models with a single output this returns a tensor of SHAP values
144
+ (# samples x # features x # features). The matrix (# features x # features) for each
145
+ sample sums
146
+ to the difference between the model output for that sample and the expected value of
147
+ the model output
148
+ (which is stored in the expected_value attribute of the explainer). Each row of this
149
+ matrix sums to the
150
+ SHAP value for that feature for that sample. The diagonal entries of the matrix
151
+ represent the
152
+ "main effect" of that feature on the prediction and the symmetric off-diagonal
153
+ entries represent the
154
+ interaction effects between all pairs of features for that sample. For models with
155
+ vector outputs
156
+ this returns a list of tensors, one for each output.
157
+ """
158
+
159
+ assert self.model.model_output == "raw", "Only model_output = \"raw\" is supported for " \
160
+ "SHAP interaction values right now!"
161
+ assert self.feature_perturbation != "interventional", 'feature_perturbation="interventional" is not yet supported for ' + \
162
+ 'interaction values. Use feature_perturbation="tree_path_dependent" instead.'
163
+ transform = "identity"
164
+
165
+ X, y, X_missing, flat_output, tree_limit, _ = self._validate_inputs(X, y, tree_limit,
166
+ False)
167
+ # run the core algorithm using the C extension
168
+ assert_import("cext_gpu")
169
+ phi = np.zeros((X.shape[0], X.shape[1] + 1, X.shape[1] + 1, self.model.num_outputs))
170
+ _cext_gpu.dense_tree_shap(
171
+ self.model.children_left, self.model.children_right, self.model.children_default,
172
+ self.model.features, self.model.thresholds, self.model.values,
173
+ self.model.node_sample_weight,
174
+ self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit,
175
+ self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation],
176
+ output_transform_codes[transform], True
177
+ )
178
+
179
+ return self._get_shap_interactions_output(phi, flat_output)
lib/shap/explainers/_gradient.py ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from packaging import version
6
+
7
+ from .._explanation import Explanation
8
+ from ..explainers._explainer import Explainer
9
+ from ..explainers.tf_utils import (
10
+ _get_graph,
11
+ _get_model_inputs,
12
+ _get_model_output,
13
+ _get_session,
14
+ )
15
+
16
+ keras = None
17
+ tf = None
18
+ torch = None
19
+
20
+
21
+ class GradientExplainer(Explainer):
22
+ """ Explains a model using expected gradients (an extension of integrated gradients).
23
+
24
+ Expected gradients an extension of the integrated gradients method (Sundararajan et al. 2017), a
25
+ feature attribution method designed for differentiable models based on an extension of Shapley
26
+ values to infinite player games (Aumann-Shapley values). Integrated gradients values are a bit
27
+ different from SHAP values, and require a single reference value to integrate from. As an adaptation
28
+ to make them approximate SHAP values, expected gradients reformulates the integral as an expectation
29
+ and combines that expectation with sampling reference values from the background dataset. This leads
30
+ to a single combined expectation of gradients that converges to attributions that sum to the
31
+ difference between the expected model output and the current output.
32
+
33
+ Examples
34
+ --------
35
+ See :ref:`Gradient Explainer Examples <gradient_explainer_examples>`
36
+ """
37
+
38
+ def __init__(self, model, data, session=None, batch_size=50, local_smoothing=0):
39
+ """ An explainer object for a differentiable model using a given background dataset.
40
+
41
+ Parameters
42
+ ----------
43
+ model : tf.keras.Model, (input : [tf.Tensor], output : tf.Tensor), torch.nn.Module, or a tuple
44
+ (model, layer), where both are torch.nn.Module objects
45
+
46
+ For TensorFlow this can be a model object, or a pair of TensorFlow tensors (or a list and
47
+ a tensor) that specifies the input and output of the model to be explained. Note that for
48
+ TensowFlow 2 you must pass a tensorflow function, not a tuple of input/output tensors).
49
+
50
+ For PyTorch this can be a nn.Module object (model), or a tuple (model, layer), where both
51
+ are nn.Module objects. The model is an nn.Module object which takes as input a tensor
52
+ (or list of tensors) of shape data, and returns a single dimensional output. If the input
53
+ is a tuple, the returned shap values will be for the input of the layer argument. layer must
54
+ be a layer in the model, i.e. model.conv2.
55
+
56
+ data : [numpy.array] or [pandas.DataFrame] or [torch.tensor]
57
+ The background dataset to use for integrating out features. Gradient explainer integrates
58
+ over these samples. The data passed here must match the input tensors given in the
59
+ first argument. Single element lists can be passed unwrapped.
60
+ """
61
+
62
+ # first, we need to find the framework
63
+ if type(model) is tuple:
64
+ a, b = model
65
+ try:
66
+ a.named_parameters()
67
+ framework = 'pytorch'
68
+ except Exception:
69
+ framework = 'tensorflow'
70
+ else:
71
+ try:
72
+ model.named_parameters()
73
+ framework = 'pytorch'
74
+ except Exception:
75
+ framework = 'tensorflow'
76
+
77
+ if isinstance(data, pd.DataFrame):
78
+ self.features = data.columns.values
79
+ else:
80
+ self.features = None
81
+
82
+ if framework == 'tensorflow':
83
+ self.explainer = _TFGradient(model, data, session, batch_size, local_smoothing)
84
+ elif framework == 'pytorch':
85
+ self.explainer = _PyTorchGradient(model, data, batch_size, local_smoothing)
86
+
87
+ def __call__(self, X, nsamples=200):
88
+ """ Return an explanation object for the model applied to X.
89
+
90
+ Parameters
91
+ ----------
92
+ X : list,
93
+ if framework == 'tensorflow': numpy.array, or pandas.DataFrame
94
+ if framework == 'pytorch': torch.tensor
95
+ A tensor (or list of tensors) of samples (where X.shape[0] == # samples) on which to
96
+ explain the model's output.
97
+ nsamples : int
98
+ number of background samples
99
+ Returns
100
+ -------
101
+ shap.Explanation:
102
+ """
103
+ shap_values = self.shap_values(X, nsamples)
104
+ return Explanation(values=shap_values, data=X, feature_names=self.features)
105
+
106
+ def shap_values(self, X, nsamples=200, ranked_outputs=None, output_rank_order="max", rseed=None, return_variances=False):
107
+ """ Return the values for the model applied to X.
108
+
109
+ Parameters
110
+ ----------
111
+ X : list,
112
+ if framework == 'tensorflow': numpy.array, or pandas.DataFrame
113
+ if framework == 'pytorch': torch.tensor
114
+ A tensor (or list of tensors) of samples (where X.shape[0] == # samples) on which to
115
+ explain the model's output.
116
+
117
+ ranked_outputs : None or int
118
+ If ranked_outputs is None then we explain all the outputs in a multi-output model. If
119
+ ranked_outputs is a positive integer then we only explain that many of the top model
120
+ outputs (where "top" is determined by output_rank_order). Note that this causes a pair
121
+ of values to be returned (shap_values, indexes), where shap_values is a list of numpy arrays
122
+ for each of the output ranks, and indexes is a matrix that tells for each sample which output
123
+ indexes were chosen as "top".
124
+
125
+ output_rank_order : "max", "min", "max_abs", or "custom"
126
+ How to order the model outputs when using ranked_outputs, either by maximum, minimum, or
127
+ maximum absolute value. If "custom" Then "ranked_outputs" contains a list of output nodes.
128
+
129
+ rseed : None or int
130
+ Seeding the randomness in shap value computation (background example choice,
131
+ interpolation between current and background example, smoothing).
132
+
133
+ Returns
134
+ -------
135
+ array or list
136
+ For a models with a single output this returns a tensor of SHAP values with the same shape
137
+ as X. For a model with multiple outputs this returns a list of SHAP value tensors, each of
138
+ which are the same shape as X. If ranked_outputs is None then this list of tensors matches
139
+ the number of model outputs. If ranked_outputs is a positive integer a pair is returned
140
+ (shap_values, indexes), where shap_values is a list of tensors with a length of
141
+ ranked_outputs, and indexes is a matrix that tells for each sample which output indexes
142
+ were chosen as "top".
143
+ """
144
+ return self.explainer.shap_values(X, nsamples, ranked_outputs, output_rank_order, rseed, return_variances)
145
+
146
+
147
+ class _TFGradient(Explainer):
148
+
149
+ def __init__(self, model, data, session=None, batch_size=50, local_smoothing=0):
150
+
151
+ # try and import keras and tensorflow
152
+ global tf, keras
153
+ if tf is None:
154
+ import tensorflow as tf
155
+ if version.parse(tf.__version__) < version.parse("1.4.0"):
156
+ warnings.warn("Your TensorFlow version is older than 1.4.0 and not supported.")
157
+ if keras is None:
158
+ try:
159
+ from tensorflow import keras
160
+ if version.parse(keras.__version__) < version.parse("2.1.0"):
161
+ warnings.warn("Your Keras version is older than 2.1.0 and not supported.")
162
+ except Exception:
163
+ pass
164
+
165
+ # determine the model inputs and outputs
166
+ self.model = model
167
+ self.model_inputs = _get_model_inputs(model)
168
+ self.model_output = _get_model_output(model)
169
+ assert not isinstance(self.model_output, list), "The model output to be explained must be a single tensor!"
170
+ assert len(self.model_output.shape) < 3, "The model output must be a vector or a single value!"
171
+ self.multi_output = True
172
+ if len(self.model_output.shape) == 1:
173
+ self.multi_output = False
174
+
175
+ # check if we have multiple inputs
176
+ self.multi_input = True
177
+ if not isinstance(self.model_inputs, list):
178
+ self.model_inputs = [self.model_inputs]
179
+ self.multi_input = len(self.model_inputs) > 1
180
+ if isinstance(data, pd.DataFrame):
181
+ data = [data.values]
182
+ if not isinstance(data, list):
183
+ data = [data]
184
+
185
+ self.data = data
186
+ self._num_vinputs = {}
187
+ self.batch_size = batch_size
188
+ self.local_smoothing = local_smoothing
189
+
190
+ if not tf.executing_eagerly():
191
+ self.session = _get_session(session)
192
+ self.graph = _get_graph(self)
193
+ # see if there is a keras operation we need to save
194
+ self.keras_phase_placeholder = None
195
+ for op in self.graph.get_operations():
196
+ if 'keras_learning_phase' in op.name:
197
+ self.keras_phase_placeholder = op.outputs[0]
198
+
199
+ # save the expected output of the model (commented out because self.data could be huge for GradientExpliner)
200
+ #self.expected_value = self.run(self.model_output, self.model_inputs, self.data).mean(0)
201
+
202
+ if not self.multi_output:
203
+ self.gradients = [None]
204
+ else:
205
+ self.gradients = [None for i in range(self.model_output.shape[1])]
206
+
207
+ def gradient(self, i):
208
+ global tf, keras
209
+
210
+ if self.gradients[i] is None:
211
+ if not tf.executing_eagerly():
212
+ out = self.model_output[:,i] if self.multi_output else self.model_output
213
+ self.gradients[i] = tf.gradients(out, self.model_inputs)
214
+ else:
215
+ @tf.function
216
+ def grad_graph(x):
217
+ phase = tf.keras.backend.learning_phase()
218
+ tf.keras.backend.set_learning_phase(0)
219
+
220
+ with tf.GradientTape(watch_accessed_variables=False) as tape:
221
+ tape.watch(x)
222
+ out = self.model(x)
223
+ if self.multi_output:
224
+ out = out[:,i]
225
+
226
+ x_grad = tape.gradient(out, x)
227
+
228
+ tf.keras.backend.set_learning_phase(phase)
229
+
230
+ return x_grad
231
+
232
+ self.gradients[i] = grad_graph
233
+
234
+ return self.gradients[i]
235
+
236
+ def shap_values(self, X, nsamples=200, ranked_outputs=None, output_rank_order="max", rseed=None, return_variances=False):
237
+ global tf, keras
238
+
239
+ import tensorflow as tf
240
+ import tensorflow.keras as keras
241
+
242
+ # check if we have multiple inputs
243
+ if not self.multi_input:
244
+ assert not isinstance(X, list), "Expected a single tensor model input!"
245
+ X = [X]
246
+ else:
247
+ assert isinstance(X, list), "Expected a list of model inputs!"
248
+ assert len(self.model_inputs) == len(X), "Number of model inputs does not match the number given!"
249
+
250
+ # rank and determine the model outputs that we will explain
251
+ if not tf.executing_eagerly():
252
+ model_output_values = self.run(self.model_output, self.model_inputs, X)
253
+ else:
254
+ model_output_values = self.run(self.model, self.model_inputs, X)
255
+ if ranked_outputs is not None and self.multi_output:
256
+ if output_rank_order == "max":
257
+ model_output_ranks = np.argsort(-model_output_values)
258
+ elif output_rank_order == "min":
259
+ model_output_ranks = np.argsort(model_output_values)
260
+ elif output_rank_order == "max_abs":
261
+ model_output_ranks = np.argsort(np.abs(model_output_values))
262
+ elif output_rank_order == "custom":
263
+ model_output_ranks = ranked_outputs
264
+ else:
265
+ emsg = "output_rank_order must be max, min, max_abs or custom!"
266
+ raise ValueError(emsg)
267
+
268
+ if output_rank_order in ["max", "min", "max_abs"]:
269
+ model_output_ranks = model_output_ranks[:,:ranked_outputs]
270
+ else:
271
+ model_output_ranks = np.tile(np.arange(len(self.gradients)), (X[0].shape[0], 1))
272
+
273
+ # compute the attributions
274
+ output_phis = []
275
+ output_phi_vars = []
276
+ samples_input = [np.zeros((nsamples,) + X[t].shape[1:], dtype=np.float32) for t in range(len(X))]
277
+ samples_delta = [np.zeros((nsamples,) + X[t].shape[1:], dtype=np.float32) for t in range(len(X))]
278
+ # use random seed if no argument given
279
+ if rseed is None:
280
+ rseed = np.random.randint(0, 1e6)
281
+
282
+ for i in range(model_output_ranks.shape[1]):
283
+ np.random.seed(rseed) # so we get the same noise patterns for each output class
284
+ phis = []
285
+ phi_vars = []
286
+ for k in range(len(X)):
287
+ phis.append(np.zeros(X[k].shape))
288
+ phi_vars.append(np.zeros(X[k].shape))
289
+ for j in range(X[0].shape[0]):
290
+
291
+ # fill in the samples arrays
292
+ for k in range(nsamples):
293
+ rind = np.random.choice(self.data[0].shape[0])
294
+ t = np.random.uniform()
295
+ for u in range(len(X)):
296
+ if self.local_smoothing > 0:
297
+ x = X[u][j] + np.random.randn(*X[u][j].shape) * self.local_smoothing
298
+ else:
299
+ x = X[u][j]
300
+ samples_input[u][k] = t * x + (1 - t) * self.data[u][rind]
301
+ samples_delta[u][k] = x - self.data[u][rind]
302
+
303
+ # compute the gradients at all the sample points
304
+ find = model_output_ranks[j,i]
305
+ grads = []
306
+ for b in range(0, nsamples, self.batch_size):
307
+ batch = [samples_input[a][b:min(b+self.batch_size,nsamples)] for a in range(len(X))]
308
+ grads.append(self.run(self.gradient(find), self.model_inputs, batch))
309
+ grad = [np.concatenate([g[a] for g in grads], 0) for a in range(len(X))]
310
+
311
+ # assign the attributions to the right part of the output arrays
312
+ for a in range(len(X)):
313
+ samples = grad[a] * samples_delta[a]
314
+ phis[a][j] = samples.mean(0)
315
+ phi_vars[a][j] = samples.var(0) / np.sqrt(samples.shape[0]) # estimate variance of means
316
+
317
+ # TODO: this could be avoided by integrating between endpoints if no local smoothing is used
318
+ # correct the sum of the values to equal the output of the model using a linear
319
+ # regression model with priors of the coefficients equal to the estimated variances for each
320
+ # value (note that 1e-6 is designed to increase the weight of the sample and so closely
321
+ # match the correct sum)
322
+ # if False and self.local_smoothing == 0: # disabled right now to make sure it doesn't mask problems
323
+ # phis_sum = np.sum([phis[l][j].sum() for l in range(len(X))])
324
+ # phi_vars_s = np.stack([phi_vars[l][j] for l in range(len(X))], 0).flatten()
325
+ # if self.multi_output:
326
+ # sum_error = model_output_values[j,find] - phis_sum - self.expected_value[find]
327
+ # else:
328
+ # sum_error = model_output_values[j] - phis_sum - self.expected_value
329
+
330
+ # # this is a ridge regression with one sample of all ones with sum_error as the label
331
+ # # and 1/v as the ridge penalties. This simplified (and stable) form comes from the
332
+ # # Sherman-Morrison formula
333
+ # v = (phi_vars_s / phi_vars_s.max()) * 1e6
334
+ # adj = sum_error * (v - (v * v.sum()) / (1 + v.sum()))
335
+
336
+ # # add the adjustment to the output so the sum matches
337
+ # offset = 0
338
+ # for l in range(len(X)):
339
+ # s = np.prod(phis[l][j].shape)
340
+ # phis[l][j] += adj[offset:offset+s].reshape(phis[l][j].shape)
341
+ # offset += s
342
+
343
+ output_phis.append(phis[0] if not self.multi_input else phis)
344
+ output_phi_vars.append(phi_vars[0] if not self.multi_input else phi_vars)
345
+ if not self.multi_output:
346
+ if return_variances:
347
+ return output_phis[0], output_phi_vars[0]
348
+ else:
349
+ return output_phis[0]
350
+ elif ranked_outputs is not None:
351
+ if return_variances:
352
+ return output_phis, output_phi_vars, model_output_ranks
353
+ else:
354
+ return output_phis, model_output_ranks
355
+ else:
356
+ if return_variances:
357
+ return output_phis, output_phi_vars
358
+ else:
359
+ return output_phis
360
+
361
+ def run(self, out, model_inputs, X):
362
+ global tf, keras
363
+
364
+ if not tf.executing_eagerly():
365
+ feed_dict = dict(zip(model_inputs, X))
366
+ if self.keras_phase_placeholder is not None:
367
+ feed_dict[self.keras_phase_placeholder] = 0
368
+ return self.session.run(out, feed_dict)
369
+ else:
370
+ # build inputs that are correctly shaped, typed, and tf-wrapped
371
+ inputs = []
372
+ for i in range(len(X)):
373
+ shape = list(self.model_inputs[i].shape)
374
+ shape[0] = -1
375
+ v = tf.constant(X[i].reshape(shape), dtype=self.model_inputs[i].dtype)
376
+ inputs.append(v)
377
+ return out(inputs)
378
+
379
+
380
+ class _PyTorchGradient(Explainer):
381
+
382
+ def __init__(self, model, data, batch_size=50, local_smoothing=0):
383
+
384
+ # try and import pytorch
385
+ global torch
386
+ if torch is None:
387
+ import torch
388
+ if version.parse(torch.__version__) < version.parse("0.4"):
389
+ warnings.warn("Your PyTorch version is older than 0.4 and not supported.")
390
+
391
+ # check if we have multiple inputs
392
+ self.multi_input = False
393
+ if isinstance(data, list):
394
+ self.multi_input = True
395
+ if not isinstance(data, list):
396
+ data = [data]
397
+
398
+ # for consistency, the method signature calls for data as the model input.
399
+ # However, within this class, self.model_inputs is the input (i.e. the data passed by the user)
400
+ # and self.data is the background data for the layer we want to assign importances to. If this layer is
401
+ # the input, then self.data = self.model_inputs
402
+ self.model_inputs = data
403
+ self.batch_size = batch_size
404
+ self.local_smoothing = local_smoothing
405
+
406
+ self.layer = None
407
+ self.input_handle = None
408
+ self.interim = False
409
+ if type(model) == tuple:
410
+ self.interim = True
411
+ model, layer = model
412
+ model = model.eval()
413
+ self.add_handles(layer)
414
+ self.layer = layer
415
+
416
+ # now, if we are taking an interim layer, the 'data' is going to be the input
417
+ # of the interim layer; we will capture this using a forward hook
418
+ with torch.no_grad():
419
+ _ = model(*data)
420
+ interim_inputs = self.layer.target_input
421
+ if type(interim_inputs) is tuple:
422
+ # this should always be true, but just to be safe
423
+ self.data = [i.clone().detach() for i in interim_inputs]
424
+ else:
425
+ self.data = [interim_inputs.clone().detach()]
426
+ else:
427
+ self.data = data
428
+ self.model = model.eval()
429
+
430
+ multi_output = False
431
+ outputs = self.model(*self.model_inputs)
432
+ if len(outputs.shape) > 1 and outputs.shape[1] > 1:
433
+ multi_output = True
434
+ self.multi_output = multi_output
435
+
436
+ if not self.multi_output:
437
+ self.gradients = [None]
438
+ else:
439
+ self.gradients = [None for i in range(outputs.shape[1])]
440
+
441
+ def gradient(self, idx, inputs):
442
+ self.model.zero_grad()
443
+ X = [x.requires_grad_() for x in inputs]
444
+ outputs = self.model(*X)
445
+ selected = [val for val in outputs[:, idx]]
446
+ if self.input_handle is not None:
447
+ interim_inputs = self.layer.target_input
448
+ grads = [torch.autograd.grad(selected, input,
449
+ retain_graph=True if idx + 1 < len(interim_inputs) else None)[0].cpu().numpy()
450
+ for idx, input in enumerate(interim_inputs)]
451
+ del self.layer.target_input
452
+ else:
453
+ grads = [torch.autograd.grad(selected, x,
454
+ retain_graph=True if idx + 1 < len(X) else None)[0].cpu().numpy()
455
+ for idx, x in enumerate(X)]
456
+ return grads
457
+
458
+ @staticmethod
459
+ def get_interim_input(self, input, output):
460
+ try:
461
+ del self.target_input
462
+ except AttributeError:
463
+ pass
464
+ setattr(self, 'target_input', input)
465
+
466
+ def add_handles(self, layer):
467
+ input_handle = layer.register_forward_hook(self.get_interim_input)
468
+ self.input_handle = input_handle
469
+
470
+ def shap_values(self, X, nsamples=200, ranked_outputs=None, output_rank_order="max", rseed=None, return_variances=False):
471
+
472
+ # X ~ self.model_input
473
+ # X_data ~ self.data
474
+
475
+ # check if we have multiple inputs
476
+ if not self.multi_input:
477
+ assert not isinstance(X, list), "Expected a single tensor model input!"
478
+ X = [X]
479
+ else:
480
+ assert isinstance(X, list), "Expected a list of model inputs!"
481
+
482
+ if ranked_outputs is not None and self.multi_output:
483
+ with torch.no_grad():
484
+ model_output_values = self.model(*X)
485
+ # rank and determine the model outputs that we will explain
486
+ if output_rank_order == "max":
487
+ _, model_output_ranks = torch.sort(model_output_values, descending=True)
488
+ elif output_rank_order == "min":
489
+ _, model_output_ranks = torch.sort(model_output_values, descending=False)
490
+ elif output_rank_order == "max_abs":
491
+ _, model_output_ranks = torch.sort(torch.abs(model_output_values), descending=True)
492
+ else:
493
+ emsg = "output_rank_order must be max, min, or max_abs!"
494
+ raise ValueError(emsg)
495
+ model_output_ranks = model_output_ranks[:, :ranked_outputs]
496
+ else:
497
+ model_output_ranks = (torch.ones((X[0].shape[0], len(self.gradients))).int() *
498
+ torch.arange(0, len(self.gradients)).int())
499
+
500
+ # if a cleanup happened, we need to add the handles back
501
+ # this allows shap_values to be called multiple times, but the model to be
502
+ # 'clean' at the end of each run for other uses
503
+ if self.input_handle is None and self.interim is True:
504
+ self.add_handles(self.layer)
505
+
506
+ # compute the attributions
507
+ X_batches = X[0].shape[0]
508
+ output_phis = []
509
+ output_phi_vars = []
510
+ # samples_input = input to the model
511
+ # samples_delta = (x - x') for the input being explained - may be an interim input
512
+ samples_input = [torch.zeros((nsamples,) + X[t].shape[1:], device=X[t].device) for t in range(len(X))]
513
+ samples_delta = [np.zeros((nsamples, ) + self.data[t].shape[1:]) for t in range(len(self.data))]
514
+
515
+ # use random seed if no argument given
516
+ if rseed is None:
517
+ rseed = np.random.randint(0, 1e6)
518
+
519
+ for i in range(model_output_ranks.shape[1]):
520
+ np.random.seed(rseed) # so we get the same noise patterns for each output class
521
+ phis = []
522
+ phi_vars = []
523
+ for k in range(len(self.data)):
524
+ # for each of the inputs being explained - may be an interim input
525
+ phis.append(np.zeros((X_batches,) + self.data[k].shape[1:]))
526
+ phi_vars.append(np.zeros((X_batches, ) + self.data[k].shape[1:]))
527
+ for j in range(X[0].shape[0]):
528
+ # fill in the samples arrays
529
+ for k in range(nsamples):
530
+ rind = np.random.choice(self.data[0].shape[0])
531
+ t = np.random.uniform()
532
+ for a in range(len(X)):
533
+ if self.local_smoothing > 0:
534
+ # local smoothing is added to the base input, unlike in the TF gradient explainer
535
+ x = X[a][j].clone().detach() + torch.empty(X[a][j].shape, device=X[a].device).normal_() \
536
+ * self.local_smoothing
537
+ else:
538
+ x = X[a][j].clone().detach()
539
+ samples_input[a][k] = (t * x + (1 - t) * (self.model_inputs[a][rind]).clone().detach()).\
540
+ clone().detach()
541
+ if self.input_handle is None:
542
+ samples_delta[a][k] = (x - (self.data[a][rind]).clone().detach()).cpu().numpy()
543
+
544
+ if self.interim is True:
545
+ with torch.no_grad():
546
+ _ = self.model(*[samples_input[a][k].unsqueeze(0) for a in range(len(X))])
547
+ interim_inputs = self.layer.target_input
548
+ del self.layer.target_input
549
+ if type(interim_inputs) is tuple:
550
+ if type(interim_inputs) is tuple:
551
+ # this should always be true, but just to be safe
552
+ for a in range(len(interim_inputs)):
553
+ samples_delta[a][k] = interim_inputs[a].cpu().numpy()
554
+ else:
555
+ samples_delta[0][k] = interim_inputs.cpu().numpy()
556
+
557
+ # compute the gradients at all the sample points
558
+ find = model_output_ranks[j, i]
559
+ grads = []
560
+ for b in range(0, nsamples, self.batch_size):
561
+ batch = [samples_input[c][b:min(b+self.batch_size,nsamples)].clone().detach() for c in range(len(X))]
562
+ grads.append(self.gradient(find, batch))
563
+ grad = [np.concatenate([g[z] for g in grads], 0) for z in range(len(self.data))]
564
+ # assign the attributions to the right part of the output arrays
565
+ for t in range(len(self.data)):
566
+ samples = grad[t] * samples_delta[t]
567
+ phis[t][j] = samples.mean(0)
568
+ phi_vars[t][j] = samples.var(0) / np.sqrt(samples.shape[0]) # estimate variance of means
569
+
570
+ output_phis.append(phis[0] if len(self.data) == 1 else phis)
571
+ output_phi_vars.append(phi_vars[0] if not self.multi_input else phi_vars)
572
+ # cleanup: remove the handles, if they were added
573
+ if self.input_handle is not None:
574
+ self.input_handle.remove()
575
+ self.input_handle = None
576
+ # note: the target input attribute is deleted in the loop
577
+
578
+ if not self.multi_output:
579
+ if return_variances:
580
+ return output_phis[0], output_phi_vars[0]
581
+ else:
582
+ return output_phis[0]
583
+ elif ranked_outputs is not None:
584
+ if return_variances:
585
+ return output_phis, output_phi_vars, model_output_ranks
586
+ else:
587
+ return output_phis, model_output_ranks
588
+ else:
589
+ if return_variances:
590
+ return output_phis, output_phi_vars
591
+ else:
592
+ return output_phis
lib/shap/explainers/_kernel.py ADDED
@@ -0,0 +1,696 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import gc
3
+ import itertools
4
+ import logging
5
+ import time
6
+ import warnings
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import scipy.sparse
11
+ import sklearn
12
+ from packaging import version
13
+ from scipy.special import binom
14
+ from sklearn.linear_model import Lasso, LassoLarsIC, lars_path
15
+ from sklearn.pipeline import make_pipeline
16
+ from sklearn.preprocessing import StandardScaler
17
+ from tqdm.auto import tqdm
18
+
19
+ from .._explanation import Explanation
20
+ from ..utils import safe_isinstance
21
+ from ..utils._exceptions import DimensionError
22
+ from ..utils._legacy import (
23
+ DenseData,
24
+ SparseData,
25
+ convert_to_data,
26
+ convert_to_instance,
27
+ convert_to_instance_with_index,
28
+ convert_to_link,
29
+ convert_to_model,
30
+ match_instance_to_data,
31
+ match_model_to_data,
32
+ )
33
+ from ._explainer import Explainer
34
+
35
+ log = logging.getLogger('shap')
36
+
37
+
38
+ class KernelExplainer(Explainer):
39
+ """Uses the Kernel SHAP method to explain the output of any function.
40
+
41
+ Kernel SHAP is a method that uses a special weighted linear regression
42
+ to compute the importance of each feature. The computed importance values
43
+ are Shapley values from game theory and also coefficients from a local linear
44
+ regression.
45
+
46
+ Parameters
47
+ ----------
48
+ model : function or iml.Model
49
+ User supplied function that takes a matrix of samples (# samples x # features) and
50
+ computes the output of the model for those samples. The output can be a vector
51
+ (# samples) or a matrix (# samples x # model outputs).
52
+
53
+ data : numpy.array or pandas.DataFrame or shap.common.DenseData or any scipy.sparse matrix
54
+ The background dataset to use for integrating out features. To determine the impact
55
+ of a feature, that feature is set to "missing" and the change in the model output
56
+ is observed. Since most models aren't designed to handle arbitrary missing data at test
57
+ time, we simulate "missing" by replacing the feature with the values it takes in the
58
+ background dataset. So if the background dataset is a simple sample of all zeros, then
59
+ we would approximate a feature being missing by setting it to zero. For small problems,
60
+ this background dataset can be the whole training set, but for larger problems consider
61
+ using a single reference value or using the ``kmeans`` function to summarize the dataset.
62
+ Note: for the sparse case, we accept any sparse matrix but convert to lil format for
63
+ performance.
64
+
65
+ feature_names : list
66
+ The names of the features in the background dataset. If the background dataset is
67
+ supplied as a pandas.DataFrame, then ``feature_names`` can be set to ``None`` (default),
68
+ and the feature names will be taken as the column names of the dataframe.
69
+
70
+ link : "identity" or "logit"
71
+ A generalized linear model link to connect the feature importance values to the model
72
+ output. Since the feature importance values, phi, sum up to the model output, it often makes
73
+ sense to connect them to the output with a link function where link(output) = sum(phi).
74
+ Default is "identity" (a no-op).
75
+ If the model output is a probability, then "logit" can be used to transform the SHAP values
76
+ into log-odds units.
77
+
78
+ Examples
79
+ --------
80
+ See :ref:`Kernel Explainer Examples <kernel_explainer_examples>`.
81
+ """
82
+
83
    def __init__(self, model, data, feature_names=None, link="identity", **kwargs):
        """Build a KernelExplainer for ``model`` using ``data`` as the background dataset.

        See the class docstring for parameter details. Supported ``**kwargs``:
        ``keep_index`` and ``keep_index_ordered`` (both default False) to carry a
        pandas index through to the model function.
        """
        # Resolve feature names: an explicit argument wins; otherwise fall back to
        # DataFrame column names. If neither applies the attribute is simply unset.
        if feature_names is not None:
            self.data_feature_names=feature_names
        elif isinstance(data, pd.DataFrame):
            self.data_feature_names = list(data.columns)

        # convert incoming inputs to standardized iml objects
        self.link = convert_to_link(link)
        self.keep_index = kwargs.get("keep_index", False)
        self.keep_index_ordered = kwargs.get("keep_index_ordered", False)
        self.model = convert_to_model(model, keep_index=self.keep_index)
        self.data = convert_to_data(data, keep_index=self.keep_index)
        # match_model_to_data evaluates the model on the background data; the result
        # becomes the baseline ("all features missing") output below.
        model_null = match_model_to_data(self.model, self.data)

        # enforce our current input type limitations
        if not isinstance(self.data, (DenseData, SparseData)):
            emsg = "Shap explainer only supports the DenseData and SparseData input currently."
            raise TypeError(emsg)
        if self.data.transposed:
            emsg = "Shap explainer does not support transposed DenseData or SparseData currently."
            raise DimensionError(emsg)

        # warn users about large background data sets
        if len(self.data.weights) > 100:
            log.warning("Using " + str(len(self.data.weights)) + " background data samples could cause " +
                        "slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to " +
                        "summarize the background as K samples.")

        # init our parameters
        self.N = self.data.data.shape[0]  # number of background samples
        self.P = self.data.data.shape[1]  # number of features
        self.linkfv = np.vectorize(self.link.f)  # elementwise link function
        self.nsamplesAdded = 0
        self.nsamplesRun = 0

        # find E_x[f(x)]: the background-weighted average model output
        if isinstance(model_null, (pd.DataFrame, pd.Series)):
            model_null = np.squeeze(model_null.values)
        if safe_isinstance(model_null, "tensorflow.python.framework.ops.EagerTensor"):
            model_null = model_null.numpy()
        self.fnull = np.sum((model_null.T * self.data.weights).T, 0)
        self.expected_value = self.linkfv(self.fnull)

        # see if we have a vector output
        self.vector_out = True
        if len(self.fnull.shape) == 0:
            # scalar output: normalize fnull to a length-1 array so downstream code
            # can always index it by output dimension
            self.vector_out = False
            self.fnull = np.array([self.fnull])
            self.D = 1
            self.expected_value = float(self.expected_value)
        else:
            self.D = self.fnull.shape[0]
+ def __call__(self, X):
138
+
139
+ start_time = time.time()
140
+
141
+ if isinstance(X, pd.DataFrame):
142
+ feature_names = list(X.columns)
143
+ else:
144
+ feature_names = getattr(self, "data_feature_names", None)
145
+
146
+ v = self.shap_values(X)
147
+ if isinstance(v, list):
148
+ v = np.stack(v, axis=-1) # put outputs at the end
149
+
150
+ # the explanation object expects an expected value for each row
151
+ if hasattr(self.expected_value, "__len__"):
152
+ ev_tiled = np.tile(self.expected_value, (v.shape[0],1))
153
+ else:
154
+ ev_tiled = np.tile(self.expected_value, v.shape[0])
155
+
156
+ return Explanation(
157
+ v,
158
+ base_values=ev_tiled,
159
+ data=X.to_numpy() if isinstance(X, pd.DataFrame) else X,
160
+ feature_names=feature_names,
161
+ compute_time=time.time() - start_time,
162
+ )
163
+
164
+ def shap_values(self, X, **kwargs):
165
+ """ Estimate the SHAP values for a set of samples.
166
+
167
+ Parameters
168
+ ----------
169
+ X : numpy.array or pandas.DataFrame or any scipy.sparse matrix
170
+ A matrix of samples (# samples x # features) on which to explain the model's output.
171
+
172
+ nsamples : "auto" or int
173
+ Number of times to re-evaluate the model when explaining each prediction. More samples
174
+ lead to lower variance estimates of the SHAP values. The "auto" setting uses
175
+ `nsamples = 2 * X.shape[1] + 2048`.
176
+
177
+ l1_reg : "num_features(int)", "auto" (default for now, but deprecated), "aic", "bic", or float
178
+ The l1 regularization to use for feature selection (the estimation procedure is based on
179
+ a debiased lasso). The auto option currently uses "aic" when less that 20% of the possible sample
180
+ space is enumerated, otherwise it uses no regularization. THE BEHAVIOR OF "auto" WILL CHANGE
181
+ in a future version to be based on num_features instead of AIC.
182
+ The "aic" and "bic" options use the AIC and BIC rules for regularization.
183
+ Using "num_features(int)" selects a fix number of top features. Passing a float directly sets the
184
+ "alpha" parameter of the sklearn.linear_model.Lasso model used for feature selection.
185
+
186
+ gc_collect : bool
187
+ Run garbage collection after each explanation round. Sometime needed for memory intensive explanations (default False).
188
+
189
+ Returns
190
+ -------
191
+ array or list
192
+ For models with a single output this returns a matrix of SHAP values
193
+ (# samples x # features). Each row sums to the difference between the model output for that
194
+ sample and the expected value of the model output (which is stored as expected_value
195
+ attribute of the explainer). For models with vector outputs this returns a list
196
+ of such matrices, one for each output.
197
+ """
198
+
199
+ # convert dataframes
200
+ if isinstance(X, pd.Series):
201
+ X = X.values
202
+ elif isinstance(X, pd.DataFrame):
203
+ if self.keep_index:
204
+ index_value = X.index.values
205
+ index_name = X.index.name
206
+ column_name = list(X.columns)
207
+ X = X.values
208
+
209
+ x_type = str(type(X))
210
+ arr_type = "'numpy.ndarray'>"
211
+ # if sparse, convert to lil for performance
212
+ if scipy.sparse.issparse(X) and not scipy.sparse.isspmatrix_lil(X):
213
+ X = X.tolil()
214
+ assert x_type.endswith(arr_type) or scipy.sparse.isspmatrix_lil(X), "Unknown instance type: " + x_type
215
+
216
+ # single instance
217
+ if len(X.shape) == 1:
218
+ data = X.reshape((1, X.shape[0]))
219
+ if self.keep_index:
220
+ data = convert_to_instance_with_index(data, column_name, index_name, index_value)
221
+ explanation = self.explain(data, **kwargs)
222
+
223
+ # vector-output
224
+ s = explanation.shape
225
+ if len(s) == 2:
226
+ outs = [np.zeros(s[0]) for j in range(s[1])]
227
+ for j in range(s[1]):
228
+ outs[j] = explanation[:, j]
229
+ return outs
230
+
231
+ # single-output
232
+ else:
233
+ out = np.zeros(s[0])
234
+ out[:] = explanation
235
+ return out
236
+
237
+ # explain the whole dataset
238
+ elif len(X.shape) == 2:
239
+ explanations = []
240
+ for i in tqdm(range(X.shape[0]), disable=kwargs.get("silent", False)):
241
+ data = X[i:i + 1, :]
242
+ if self.keep_index:
243
+ data = convert_to_instance_with_index(data, column_name, index_value[i:i + 1], index_name)
244
+ explanations.append(self.explain(data, **kwargs))
245
+ if kwargs.get("gc_collect", False):
246
+ gc.collect()
247
+
248
+ # vector-output
249
+ s = explanations[0].shape
250
+ if len(s) == 2:
251
+ outs = [np.zeros((X.shape[0], s[0])) for j in range(s[1])]
252
+ for i in range(X.shape[0]):
253
+ for j in range(s[1]):
254
+ outs[j][i] = explanations[i][:, j]
255
+ return outs
256
+
257
+ # single-output
258
+ else:
259
+ out = np.zeros((X.shape[0], s[0]))
260
+ for i in range(X.shape[0]):
261
+ out[i] = explanations[i]
262
+ return out
263
+
264
+ else:
265
+ emsg = "Instance must have 1 or 2 dimensions!"
266
+ raise DimensionError(emsg)
267
+
268
    def explain(self, incoming_instance, **kwargs):
        """Compute SHAP values for a single instance.

        Returns a (groups_size,) vector for scalar-output models, or a
        (groups_size, D) matrix for vector-output models.  Supported kwargs:
        ``l1_reg`` and ``nsamples`` (see :meth:`shap_values`).
        """
        # convert incoming input to a standardized iml object
        instance = convert_to_instance(incoming_instance)
        match_instance_to_data(instance, self.data)

        # find the feature groups we will test. If a feature does not change from its
        # current value then we know it doesn't impact the model
        self.varyingInds = self.varying_groups(instance.x)
        if self.data.groups is None:
            self.varyingFeatureGroups = np.array([i for i in self.varyingInds])
            self.M = self.varyingFeatureGroups.shape[0]
        else:
            self.varyingFeatureGroups = [self.data.groups[i] for i in self.varyingInds]
            self.M = len(self.varyingFeatureGroups)
            groups = self.data.groups
            # convert to numpy array as it is much faster if not jagged array (all groups of same length)
            if self.varyingFeatureGroups and all(len(groups[i]) == len(groups[0]) for i in self.varyingInds):
                self.varyingFeatureGroups = np.array(self.varyingFeatureGroups)
                # further performance optimization in case each group has a single value
                if self.varyingFeatureGroups.shape[1] == 1:
                    self.varyingFeatureGroups = self.varyingFeatureGroups.flatten()

        # find f(x): the model output for the instance being explained
        if self.keep_index:
            model_out = self.model.f(instance.convert_to_df())
        else:
            model_out = self.model.f(instance.x)
        if isinstance(model_out, (pd.DataFrame, pd.Series)):
            model_out = model_out.values
        self.fx = model_out[0]

        if not self.vector_out:
            self.fx = np.array([self.fx])

        # if no features vary then no feature has an effect
        if self.M == 0:
            phi = np.zeros((self.data.groups_size, self.D))
            phi_var = np.zeros((self.data.groups_size, self.D))

        # if only one feature varies then it has all the effect
        elif self.M == 1:
            phi = np.zeros((self.data.groups_size, self.D))
            phi_var = np.zeros((self.data.groups_size, self.D))
            diff = self.link.f(self.fx) - self.link.f(self.fnull)
            for d in range(self.D):
                phi[self.varyingInds[0],d] = diff[d]

        # if more than one feature varies then we have to do real work
        else:
            self.l1_reg = kwargs.get("l1_reg", "auto")

            # pick a reasonable number of samples if the user didn't specify how many they wanted
            self.nsamples = kwargs.get("nsamples", "auto")
            if self.nsamples == "auto":
                self.nsamples = 2 * self.M + 2**11

            # if we have enough samples to enumerate all subsets then ignore the unneeded samples
            self.max_samples = 2 ** 30
            if self.M <= 30:
                # 2^M - 2 excludes the empty and full coalitions, which carry no information
                self.max_samples = 2 ** self.M - 2
                if self.nsamples > self.max_samples:
                    self.nsamples = self.max_samples

            # reserve space for some of our computations
            self.allocate()

            # weight the different subset sizes by the Shapley kernel; paired sizes
            # (a size and its complement) are drawn together, hence the *2 factor
            num_subset_sizes = int(np.ceil((self.M - 1) / 2.0))
            num_paired_subset_sizes = int(np.floor((self.M - 1) / 2.0))
            weight_vector = np.array([(self.M - 1.0) / (i * (self.M - i)) for i in range(1, num_subset_sizes + 1)])
            weight_vector[:num_paired_subset_sizes] *= 2
            weight_vector /= np.sum(weight_vector)
            log.debug(f"{weight_vector = }")
            log.debug(f"{num_subset_sizes = }")
            log.debug(f"{num_paired_subset_sizes = }")
            log.debug(f"{self.M = }")

            # fill out all the subset sizes we can completely enumerate
            # given nsamples*remaining_weight_vector[subset_size]
            num_full_subsets = 0
            num_samples_left = self.nsamples
            group_inds = np.arange(self.M, dtype='int64')
            mask = np.zeros(self.M)
            remaining_weight_vector = copy.copy(weight_vector)
            for subset_size in range(1, num_subset_sizes + 1):

                # determine how many subsets (and their complements) are of the current size
                nsubsets = binom(self.M, subset_size)
                if subset_size <= num_paired_subset_sizes:
                    nsubsets *= 2
                log.debug(f"{subset_size = }")
                log.debug(f"{nsubsets = }")
                log.debug("self.nsamples*weight_vector[subset_size-1] = {}".format(
                    num_samples_left * remaining_weight_vector[subset_size - 1]))
                log.debug("self.nsamples*weight_vector[subset_size-1]/nsubsets = {}".format(
                    num_samples_left * remaining_weight_vector[subset_size - 1] / nsubsets))

                # see if we have enough samples to enumerate all subsets of this size
                if num_samples_left * remaining_weight_vector[subset_size - 1] / nsubsets >= 1.0 - 1e-8:
                    num_full_subsets += 1
                    num_samples_left -= nsubsets

                    # rescale what's left of the remaining weight vector to sum to 1
                    if remaining_weight_vector[subset_size - 1] < 1.0:
                        remaining_weight_vector /= (1 - remaining_weight_vector[subset_size - 1])

                    # add all the samples of the current subset size
                    w = weight_vector[subset_size - 1] / binom(self.M, subset_size)
                    if subset_size <= num_paired_subset_sizes:
                        w /= 2.0
                    for inds in itertools.combinations(group_inds, subset_size):
                        mask[:] = 0.0
                        mask[np.array(inds, dtype='int64')] = 1.0
                        self.addsample(instance.x, mask, w)
                        if subset_size <= num_paired_subset_sizes:
                            # also add the complement coalition (mask flipped)
                            mask[:] = np.abs(mask - 1)
                            self.addsample(instance.x, mask, w)
                else:
                    break
            log.info(f"{num_full_subsets = }")

            # add random samples from what is left of the subset space
            nfixed_samples = self.nsamplesAdded
            samples_left = self.nsamples - self.nsamplesAdded
            log.debug(f"{samples_left = }")
            if num_full_subsets != num_subset_sizes:
                remaining_weight_vector = copy.copy(weight_vector)
                remaining_weight_vector[:num_paired_subset_sizes] /= 2 # because we draw two samples each below
                remaining_weight_vector = remaining_weight_vector[num_full_subsets:]
                remaining_weight_vector /= np.sum(remaining_weight_vector)
                log.info(f"{remaining_weight_vector = }")
                log.info(f"{num_paired_subset_sizes = }")
                # 4x oversampling: duplicate masks below only bump a weight, so we may
                # need more draws than samples_left to fill the budget
                ind_set = np.random.choice(len(remaining_weight_vector), 4 * samples_left, p=remaining_weight_vector)
                ind_set_pos = 0
                used_masks = {}
                while samples_left > 0 and ind_set_pos < len(ind_set):
                    mask.fill(0.0)
                    ind = ind_set[ind_set_pos] # we call np.random.choice once to save time and then just read it here
                    ind_set_pos += 1
                    subset_size = ind + num_full_subsets + 1
                    mask[np.random.permutation(self.M)[:subset_size]] = 1.0

                    # only add the sample if we have not seen it before, otherwise just
                    # increment a previous sample's weight
                    mask_tuple = tuple(mask)
                    new_sample = False
                    if mask_tuple not in used_masks:
                        new_sample = True
                        used_masks[mask_tuple] = self.nsamplesAdded
                        samples_left -= 1
                        self.addsample(instance.x, mask, 1.0)
                    else:
                        self.kernelWeights[used_masks[mask_tuple]] += 1.0

                    # add the compliment sample
                    if samples_left > 0 and subset_size <= num_paired_subset_sizes:
                        mask[:] = np.abs(mask - 1)

                        # only add the sample if we have not seen it before, otherwise just
                        # increment a previous sample's weight
                        if new_sample:
                            samples_left -= 1
                            self.addsample(instance.x, mask, 1.0)
                        else:
                            # we know the compliment sample is the next one after the original sample, so + 1
                            self.kernelWeights[used_masks[mask_tuple] + 1] += 1.0

                # normalize the kernel weights for the random samples to equal the weight left after
                # the fixed enumerated samples have been already counted
                weight_left = np.sum(weight_vector[num_full_subsets:])
                log.info(f"{weight_left = }")
                self.kernelWeights[nfixed_samples:] *= weight_left / self.kernelWeights[nfixed_samples:].sum()

            # execute the model on the synthetic samples we have created
            self.run()

            # solve then expand the feature importance (Shapley value) vector to contain the non-varying features
            phi = np.zeros((self.data.groups_size, self.D))
            phi_var = np.zeros((self.data.groups_size, self.D))
            for d in range(self.D):
                vphi, vphi_var = self.solve(self.nsamples / self.max_samples, d)
                phi[self.varyingInds, d] = vphi
                phi_var[self.varyingInds, d] = vphi_var

        if not self.vector_out:
            phi = np.squeeze(phi, axis=1)
            phi_var = np.squeeze(phi_var, axis=1)

        return phi
+ @staticmethod
459
+ def not_equal(i, j):
460
+ number_types = (int, float, np.number)
461
+ if isinstance(i, number_types) and isinstance(j, number_types):
462
+ return 0 if np.isclose(i, j, equal_nan=True) else 1
463
+ else:
464
+ return 0 if i == j else 1
465
+
466
    def varying_groups(self, x):
        """Return the indices of feature groups whose values in ``x`` differ from
        the background data (only those groups can influence the model output).

        ``x`` is a single-row matrix (dense or sparse); the sparse branch avoids
        densifying the background data.
        """
        if not scipy.sparse.issparse(x):
            varying = np.zeros(self.data.groups_size)
            for i in range(0, self.data.groups_size):
                inds = self.data.groups[i]
                x_group = x[0, inds]
                if scipy.sparse.issparse(x_group):
                    # all-zero group in a sparse x matches an (assumed zero) background
                    if all(j not in x.nonzero()[1] for j in inds):
                        varying[i] = False
                        continue
                    x_group = x_group.todense()
                # count elementwise mismatches against every background row
                num_mismatches = np.sum(np.frompyfunc(self.not_equal, 2, 1)(x_group, self.data.data[:, inds]))
                varying[i] = num_mismatches > 0
            varying_indices = np.nonzero(varying)[0]
            return varying_indices
        else:
            varying_indices = []
            # go over all nonzero columns in background and evaluation data
            # if both background and evaluation are zero, the column does not vary
            varying_indices = np.unique(np.union1d(self.data.data.nonzero()[1], x.nonzero()[1]))
            remove_unvarying_indices = []
            for i in range(0, len(varying_indices)):
                varying_index = varying_indices[i]
                # now verify the nonzero values do vary
                data_rows = self.data.data[:, [varying_index]]
                nonzero_rows = data_rows.nonzero()[0]

                if nonzero_rows.size > 0:
                    background_data_rows = data_rows[nonzero_rows]
                    if scipy.sparse.issparse(background_data_rows):
                        background_data_rows = background_data_rows.toarray()
                    num_mismatches = np.sum(np.abs(background_data_rows - x[0, varying_index]) > 1e-7)
                    # Note: If feature column non-zero but some background zero, can't remove index
                    if num_mismatches == 0 and not \
                        (np.abs(x[0, [varying_index]][0, 0]) > 1e-7 and len(nonzero_rows) < data_rows.shape[0]):
                        remove_unvarying_indices.append(i)
            mask = np.ones(len(varying_indices), dtype=bool)
            mask[remove_unvarying_indices] = False
            varying_indices = varying_indices[mask]
            return varying_indices
    def allocate(self):
        """Allocate the buffers used while building and evaluating synthetic samples.

        Tiles the background data ``nsamples`` times into ``self.synth_data`` and
        zero-initializes the mask matrix, kernel weights, and model-output arrays.
        """
        if scipy.sparse.issparse(self.data.data):
            # We tile the sparse matrix in csr format but convert it to lil
            # for performance when adding samples
            shape = self.data.data.shape
            nnz = self.data.data.nnz
            data_rows, data_cols = shape
            rows = data_rows * self.nsamples
            shape = rows, data_cols
            if nnz == 0:
                self.synth_data = scipy.sparse.csr_matrix(shape, dtype=self.data.data.dtype).tolil()
            else:
                # manually tile the CSR arrays: data/indices repeat unchanged while
                # each copy's indptr is offset by the total nnz of the copies before it
                data = self.data.data.data
                indices = self.data.data.indices
                indptr = self.data.data.indptr
                last_indptr_idx = indptr[len(indptr) - 1]
                indptr_wo_last = indptr[:-1]
                new_indptrs = []
                for i in range(0, self.nsamples - 1):
                    new_indptrs.append(indptr_wo_last + (i * last_indptr_idx))
                # the final copy keeps its closing indptr entry
                new_indptrs.append(indptr + ((self.nsamples - 1) * last_indptr_idx))
                new_indptr = np.concatenate(new_indptrs)
                new_data = np.tile(data, self.nsamples)
                new_indices = np.tile(indices, self.nsamples)
                self.synth_data = scipy.sparse.csr_matrix((new_data, new_indices, new_indptr), shape=shape).tolil()
        else:
            self.synth_data = np.tile(self.data.data, (self.nsamples, 1))

        self.maskMatrix = np.zeros((self.nsamples, self.M))   # coalition masks, one row per sample
        self.kernelWeights = np.zeros(self.nsamples)          # Shapley-kernel weight per sample
        self.y = np.zeros((self.nsamples * self.N, self.D))   # raw model outputs
        self.ey = np.zeros((self.nsamples, self.D))           # background-averaged outputs
        self.lastMask = np.zeros(self.nsamples)
        self.nsamplesAdded = 0
        self.nsamplesRun = 0
        if self.keep_index:
            self.synth_data_index = np.tile(self.data.index_value, self.nsamples)
    def addsample(self, x, m, w):
        """Write one synthetic sample into the preallocated buffers.

        For each feature group selected by mask ``m`` (1.0 = "present"), overwrite
        the corresponding columns of the next N background rows in ``synth_data``
        with the instance's values from ``x``; record ``m`` and kernel weight ``w``.
        """
        offset = self.nsamplesAdded * self.N
        if isinstance(self.varyingFeatureGroups, (list,)):
            # jagged groups: fall back to a per-feature loop
            for j in range(self.M):
                for k in self.varyingFeatureGroups[j]:
                    if m[j] == 1.0:
                        self.synth_data[offset:offset+self.N, k] = x[0, k]
        else:
            # for non-jagged numpy array we can significantly boost performance
            mask = m == 1.0
            groups = self.varyingFeatureGroups[mask]
            if len(groups.shape) == 2:
                for group in groups:
                    self.synth_data[offset:offset+self.N, group] = x[0, group]
            else:
                # further performance optimization in case each group has a single feature
                evaluation_data = x[0, groups]
                # In edge case where background is all dense but evaluation data
                # is all sparse, make evaluation data dense
                if scipy.sparse.issparse(x) and not scipy.sparse.issparse(self.synth_data):
                    evaluation_data = evaluation_data.toarray()
                self.synth_data[offset:offset+self.N, groups] = evaluation_data
        self.maskMatrix[self.nsamplesAdded, :] = m
        self.kernelWeights[self.nsamplesAdded] = w
        self.nsamplesAdded += 1
+ def run(self):
572
+ num_to_run = self.nsamplesAdded * self.N - self.nsamplesRun * self.N
573
+ data = self.synth_data[self.nsamplesRun*self.N:self.nsamplesAdded*self.N,:]
574
+ if self.keep_index:
575
+ index = self.synth_data_index[self.nsamplesRun*self.N:self.nsamplesAdded*self.N]
576
+ index = pd.DataFrame(index, columns=[self.data.index_name])
577
+ data = pd.DataFrame(data, columns=self.data.group_names)
578
+ data = pd.concat([index, data], axis=1).set_index(self.data.index_name)
579
+ if self.keep_index_ordered:
580
+ data = data.sort_index()
581
+ modelOut = self.model.f(data)
582
+ if isinstance(modelOut, (pd.DataFrame, pd.Series)):
583
+ modelOut = modelOut.values
584
+ self.y[self.nsamplesRun * self.N:self.nsamplesAdded * self.N, :] = np.reshape(modelOut, (num_to_run, self.D))
585
+
586
+ # find the expected value of each output
587
+ for i in range(self.nsamplesRun, self.nsamplesAdded):
588
+ eyVal = np.zeros(self.D)
589
+ for j in range(0, self.N):
590
+ eyVal += self.y[i * self.N + j, :] * self.data.weights[j]
591
+
592
+ self.ey[i, :] = eyVal
593
+ self.nsamplesRun += 1
594
+
595
    def solve(self, fraction_evaluated, dim):
        """Solve the weighted linear regression for output dimension ``dim``.

        Returns ``(phi, phi_var)`` for the varying feature groups.  Optionally
        performs L1 feature selection first (per ``self.l1_reg``) when only a
        fraction of the coalition space was sampled.  Note: phi_var is currently
        a vector of ones, not a real variance estimate.
        """
        eyAdj = self.linkfv(self.ey[:, dim]) - self.link.f(self.fnull[dim])
        s = np.sum(self.maskMatrix, 1)  # coalition size of each sample

        # do feature selection if we have not well enumerated the space
        nonzero_inds = np.arange(self.M)
        log.debug(f"{fraction_evaluated = }")
        # if self.l1_reg == "auto":
        #     warnings.warn(
        #         "l1_reg=\"auto\" is deprecated and in the next version (v0.29) the behavior will change from a " \
        #         "conditional use of AIC to simply \"num_features(10)\"!"
        #     )
        if (self.l1_reg not in ["auto", False, 0]) or (fraction_evaluated < 0.2 and self.l1_reg == "auto"):
            # augment the data so the weighted problem includes each mask and its complement
            w_aug = np.hstack((self.kernelWeights * (self.M - s), self.kernelWeights * s))
            log.info(f"{np.sum(w_aug) = }")
            log.info(f"{np.sum(self.kernelWeights) = }")
            w_sqrt_aug = np.sqrt(w_aug)
            eyAdj_aug = np.hstack((eyAdj, eyAdj - (self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim]))))
            eyAdj_aug *= w_sqrt_aug
            mask_aug = np.transpose(w_sqrt_aug * np.transpose(np.vstack((self.maskMatrix, self.maskMatrix - 1))))
            #var_norms = np.array([np.linalg.norm(mask_aug[:, i]) for i in range(mask_aug.shape[1])])

            # select a fixed number of top features
            if isinstance(self.l1_reg, str) and self.l1_reg.startswith("num_features("):
                r = int(self.l1_reg[len("num_features("):-1])
                nonzero_inds = lars_path(mask_aug, eyAdj_aug, max_iter=r)[1]

            # use an adaptive regularization method
            elif self.l1_reg == "auto" or self.l1_reg == "bic" or self.l1_reg == "aic":
                c = "aic" if self.l1_reg == "auto" else self.l1_reg

                # "Normalize" parameter of LassoLarsIC was deprecated in sklearn version 1.2
                if version.parse(sklearn.__version__) < version.parse("1.2.0"):
                    kwg = dict(normalize=False)
                else:
                    kwg = {}
                # pipeline[1] is the LassoLarsIC step; its coef_ gives the selected features
                model = make_pipeline(StandardScaler(with_mean=False), LassoLarsIC(criterion=c, **kwg))
                nonzero_inds = np.nonzero(model.fit(mask_aug, eyAdj_aug)[1].coef_)[0]

            # use a fixed regularization coefficient
            else:
                nonzero_inds = np.nonzero(Lasso(alpha=self.l1_reg).fit(mask_aug, eyAdj_aug).coef_)[0]

        if len(nonzero_inds) == 0:
            return np.zeros(self.M), np.ones(self.M)

        # eliminate one variable with the constraint that all features sum to the output
        eyAdj2 = eyAdj - self.maskMatrix[:, nonzero_inds[-1]] * (
            self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim]))
        etmp = np.transpose(np.transpose(self.maskMatrix[:, nonzero_inds[:-1]]) - self.maskMatrix[:, nonzero_inds[-1]])
        log.debug(f"{etmp[:4, :] = }")

        # solve a weighted least squares equation to estimate phi
        # least squares:
        #     phi = min_w ||W^(1/2) (y - X w)||^2
        # the corresponding normal equation:
        #     (X' W X) phi = X' W y
        # with
        #     X = etmp
        #     W = np.diag(self.kernelWeights)
        #     y = eyAdj2
        #
        # We could just rely on sciki-learn
        #     from sklearn.linear_model import LinearRegression
        #     lm = LinearRegression(fit_intercept=False).fit(etmp, eyAdj2, sample_weight=self.kernelWeights)
        # Under the hood, as of scikit-learn version 1.3, LinearRegression still uses np.linalg.lstsq and
        # there are more performant options. See https://github.com/scikit-learn/scikit-learn/issues/22855.
        y = eyAdj2
        X = etmp
        WX = self.kernelWeights[:, None] * X
        try:
            w = np.linalg.solve(X.T @ WX, WX.T @ y)
        except np.linalg.LinAlgError:
            warnings.warn(
                "Linear regression equation is singular, a least squares solutions is used instead.\n"
                "To avoid this situation and get a regular matrix do one of the following:\n"
                "1) turn up the number of samples,\n"
                "2) turn up the L1 regularization with num_features(N) where N is less than the number of samples,\n"
                "3) group features together to reduce the number of inputs that need to be explained."
            )
            # XWX = np.linalg.pinv(X.T @ WX)
            # w = np.dot(XWX, np.dot(np.transpose(WX), y))
            sqrt_W = np.sqrt(self.kernelWeights)
            w = np.linalg.lstsq(sqrt_W[:, None] * X, sqrt_W * y, rcond=None)[0]
        log.debug(f"{np.sum(w) = }")
        log.debug("self.link(self.fx) - self.link(self.fnull) = {}".format(
            self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim])))
        log.debug(f"self.fx = {self.fx[dim]}")
        log.debug(f"self.link(self.fx) = {self.link.f(self.fx[dim])}")
        log.debug(f"self.fnull = {self.fnull[dim]}")
        log.debug(f"self.link(self.fnull) = {self.link.f(self.fnull[dim])}")
        phi = np.zeros(self.M)
        phi[nonzero_inds[:-1]] = w
        # the eliminated variable absorbs the remainder so that phi sums exactly
        # to link(fx) - link(fnull)
        phi[nonzero_inds[-1]] = (self.link.f(self.fx[dim]) - self.link.f(self.fnull[dim])) - sum(w)
        log.info(f"{phi = }")

        # clean up any rounding errors
        for i in range(self.M):
            if np.abs(phi[i]) < 1e-10:
                phi[i] = 0

        return phi, np.ones(len(phi))
lib/shap/explainers/_linear.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from scipy.sparse import issparse
6
+ from tqdm.auto import tqdm
7
+
8
+ from .. import links, maskers
9
+ from ..utils._exceptions import (
10
+ DimensionError,
11
+ InvalidFeaturePerturbationError,
12
+ InvalidModelError,
13
+ )
14
+ from ._explainer import Explainer
15
+
16
+
17
+ class LinearExplainer(Explainer):
18
+ """ Computes SHAP values for a linear model, optionally accounting for inter-feature correlations.
19
+
20
+ This computes the SHAP values for a linear model and can account for the correlations among
21
+ the input features. Assuming features are independent leads to interventional SHAP values which
22
+ for a linear model are coef[i] * (x[i] - X.mean(0)[i]) for the ith feature. If instead we account
23
+ for correlations then we prevent any problems arising from collinearity and share credit among
24
+ correlated features. Accounting for correlations can be computationally challenging, but
25
+ LinearExplainer uses sampling to estimate a transform that can then be applied to explain
26
+ any prediction of the model.
27
+
28
+ Parameters
29
+ ----------
30
+ model : (coef, intercept) or sklearn.linear_model.*
31
+ User supplied linear model either as either a parameter pair or sklearn object.
32
+
33
+ data : (mean, cov), numpy.array, pandas.DataFrame, iml.DenseData or scipy.csr_matrix
34
+ The background dataset to use for computing conditional expectations. Note that only the
35
+ mean and covariance of the dataset are used. This means passing a raw data matrix is just
36
+ a convenient alternative to passing the mean and covariance directly.
37
+ nsamples : int
38
+ Number of samples to use when estimating the transformation matrix used to account for
39
+ feature correlations.
40
+ feature_perturbation : "interventional" (default) or "correlation_dependent"
41
+ There are two ways we might want to compute SHAP values, either the full conditional SHAP
42
+ values or the interventional SHAP values. For interventional SHAP values we break any
43
+ dependence structure between features in the model and so uncover how the model would behave if we
44
+ intervened and changed some of the inputs. For the full conditional SHAP values we respect
45
+ the correlations among the input features, so if the model depends on one input but that
46
+ input is correlated with another input, then both get some credit for the model's behavior. The
47
+ interventional option stays "true to the model" meaning it will only give credit to features that are
48
+ actually used by the model, while the correlation option stays "true to the data" in the sense that
49
+ it only considers how the model would behave when respecting the correlations in the input data.
50
+ For sparse case only interventional option is supported.
51
+
52
+ Examples
53
+ --------
54
+ See `Linear explainer examples <https://shap.readthedocs.io/en/latest/api_examples/explainers/LinearExplainer.html>`_
55
+ """
56
+
57
+ def __init__(self, model, masker, link=links.identity, nsamples=1000, feature_perturbation=None, **kwargs):
58
+ if 'feature_dependence' in kwargs:
59
+ warnings.warn('The option feature_dependence has been renamed to feature_perturbation!')
60
+ feature_perturbation = kwargs["feature_dependence"]
61
+ if feature_perturbation == "independent":
62
+ warnings.warn('The option feature_perturbation="independent" is has been renamed to feature_perturbation="interventional"!')
63
+ feature_perturbation = "interventional"
64
+ elif feature_perturbation == "correlation":
65
+ warnings.warn('The option feature_perturbation="correlation" is has been renamed to feature_perturbation="correlation_dependent"!')
66
+ feature_perturbation = "correlation_dependent"
67
+ if feature_perturbation is not None:
68
+ warnings.warn("The feature_perturbation option is now deprecated in favor of using the appropriate masker (maskers.Independent, or maskers.Impute)")
69
+ else:
70
+ feature_perturbation = "interventional"
71
+ self.feature_perturbation = feature_perturbation
72
+
73
+ # wrap the incoming masker object as a shap.Masker object before calling
74
+ # parent class constructor, which does the same but without respecting
75
+ # the user-provided feature_perturbation choice
76
+ if isinstance(masker, pd.DataFrame) or ((isinstance(masker, np.ndarray) or issparse(masker)) and len(masker.shape) == 2):
77
+ if self.feature_perturbation == "correlation_dependent":
78
+ masker = maskers.Impute(masker)
79
+ else:
80
+ masker = maskers.Independent(masker)
81
+ elif issubclass(type(masker), tuple) and len(masker) == 2:
82
+ if self.feature_perturbation == "correlation_dependent":
83
+ masker = maskers.Impute({"mean": masker[0], "cov": masker[1]}, method="linear")
84
+ else:
85
+ masker = maskers.Independent({"mean": masker[0], "cov": masker[1]})
86
+
87
+ super().__init__(model, masker, link=link, **kwargs)
88
+
89
+ self.nsamples = nsamples
90
+
91
+
92
+ # extract what we need from the given model object
93
+ self.coef, self.intercept = LinearExplainer._parse_model(model)
94
+
95
+ # extract the data
96
+ if issubclass(type(self.masker), (maskers.Independent, maskers.Partition)):
97
+ self.feature_perturbation = "interventional"
98
+ elif issubclass(type(self.masker), maskers.Impute):
99
+ self.feature_perturbation = "correlation_dependent"
100
+ else:
101
+ raise NotImplementedError("The Linear explainer only supports the Independent, Partition, and Impute maskers right now!")
102
+ data = getattr(self.masker, "data", None)
103
+
104
+ # convert DataFrame's to numpy arrays
105
+ if isinstance(data, pd.DataFrame):
106
+ data = data.values
107
+
108
+ # get the mean and covariance of the model
109
+ if getattr(self.masker, "mean", None) is not None:
110
+ self.mean = self.masker.mean
111
+ self.cov = self.masker.cov
112
+ elif isinstance(data, dict) and len(data) == 2:
113
+ self.mean = data["mean"]
114
+ if isinstance(self.mean, pd.Series):
115
+ self.mean = self.mean.values
116
+
117
+ self.cov = data["cov"]
118
+ if isinstance(self.cov, pd.DataFrame):
119
+ self.cov = self.cov.values
120
+ elif isinstance(data, tuple) and len(data) == 2:
121
+ self.mean = data[0]
122
+ if isinstance(self.mean, pd.Series):
123
+ self.mean = self.mean.values
124
+
125
+ self.cov = data[1]
126
+ if isinstance(self.cov, pd.DataFrame):
127
+ self.cov = self.cov.values
128
+ elif data is None:
129
+ raise ValueError("A background data distribution must be provided!")
130
+ else:
131
+ if issparse(data):
132
+ self.mean = np.array(np.mean(data, 0))[0]
133
+ if self.feature_perturbation != "interventional":
134
+ raise NotImplementedError("Only feature_perturbation = 'interventional' is supported for sparse data")
135
+ else:
136
+ self.mean = np.array(np.mean(data, 0)).flatten() # assumes it is an array
137
+ if self.feature_perturbation == "correlation_dependent":
138
+ self.cov = np.cov(data, rowvar=False)
139
+ #print(self.coef, self.mean.flatten(), self.intercept)
140
+ # Note: mean can be numpy.matrixlib.defmatrix.matrix or numpy.matrix type depending on numpy version
141
+ if issparse(self.mean) or str(type(self.mean)).endswith("matrix'>"):
142
+ # accept both sparse and dense coef
143
+ # if not issparse(self.coef):
144
+ # self.coef = np.asmatrix(self.coef)
145
+ self.expected_value = np.dot(self.coef, self.mean) + self.intercept
146
+
147
+ # unwrap the matrix form
148
+ if len(self.expected_value) == 1:
149
+ self.expected_value = self.expected_value[0,0]
150
+ else:
151
+ self.expected_value = np.array(self.expected_value)[0]
152
+ else:
153
+ self.expected_value = np.dot(self.coef, self.mean) + self.intercept
154
+
155
+ self.M = len(self.mean)
156
+
157
+ # if needed, estimate the transform matrices
158
+ if self.feature_perturbation == "correlation_dependent":
159
+ self.valid_inds = np.where(np.diag(self.cov) > 1e-8)[0]
160
+ self.mean = self.mean[self.valid_inds]
161
+ self.cov = self.cov[:,self.valid_inds][self.valid_inds,:]
162
+ self.coef = self.coef[self.valid_inds]
163
+
164
+ # group perfectly redundant variables together
165
+ self.avg_proj,sum_proj = duplicate_components(self.cov)
166
+ self.cov = np.matmul(np.matmul(self.avg_proj, self.cov), self.avg_proj.T)
167
+ self.mean = np.matmul(self.avg_proj, self.mean)
168
+ self.coef = np.matmul(sum_proj, self.coef)
169
+
170
+ # if we still have some multi-collinearity present then we just add regularization...
171
+ e,_ = np.linalg.eig(self.cov)
172
+ if e.min() < 1e-7:
173
+ self.cov = self.cov + np.eye(self.cov.shape[0]) * 1e-6
174
+
175
+ mean_transform, x_transform = self._estimate_transforms(nsamples)
176
+ self.mean_transformed = np.matmul(mean_transform, self.mean)
177
+ self.x_transform = x_transform
178
+ elif self.feature_perturbation == "interventional":
179
+ if nsamples != 1000:
180
+ warnings.warn("Setting nsamples has no effect when feature_perturbation = 'interventional'!")
181
+ else:
182
+ raise InvalidFeaturePerturbationError("Unknown type of feature_perturbation provided: " + self.feature_perturbation)
183
+
184
    def _estimate_transforms(self, nsamples):
        """ Uses block matrix inversion identities to quickly estimate transforms.

        After a bit of matrix math we can isolate a transform matrix (# features x # features)
        that is independent of any sample we are explaining. It is the result of averaging over
        all feature permutations, but we just use a fixed number of samples to estimate the value.

        TODO: Do a brute force enumeration when # feature subsets is less than nsamples. This could
        happen through a recursive method that uses the same block matrix inversion as below.
        """
        M = len(self.coef)

        mean_transform = np.zeros((M, M))
        x_transform = np.zeros((M, M))
        inds = np.arange(M, dtype=int)
        # each outer iteration samples one random feature permutation and accumulates
        # its contribution into the two transform matrices
        for _ in tqdm(range(nsamples), "Estimating transforms"):
            np.random.shuffle(inds)
            cov_inv_SiSi = np.zeros((0, 0))
            cov_Si = np.zeros((M, 0))
            for j in range(M):
                i = inds[j]

                # use the last Si as the new S
                cov_S = cov_Si
                cov_inv_SS = cov_inv_SiSi

                # get the new cov_Si
                cov_Si = self.cov[:, inds[:j+1]]

                # compute the new cov_inv_SiSi from cov_inv_SS using the bordered block
                # matrix inverse identity, so we never re-invert from scratch inside the loop
                d = cov_Si[i, :-1].T
                t = np.matmul(cov_inv_SS, d)
                Z = self.cov[i, i]
                u = Z - np.matmul(t.T, d)  # Schur complement of the new diagonal entry
                cov_inv_SiSi = np.zeros((j+1, j+1))
                if j > 0:
                    cov_inv_SiSi[:-1, :-1] = cov_inv_SS + np.outer(t, t) / u
                    cov_inv_SiSi[:-1, -1] = cov_inv_SiSi[-1, :-1] = -t / u
                cov_inv_SiSi[-1, -1] = 1 / u

                # + coef @ (Q(bar(Sui)) - Q(bar(S)))
                mean_transform[i, i] += self.coef[i]

                # + coef @ R(Sui)
                coef_R_Si = np.matmul(self.coef[inds[j+1:]], np.matmul(cov_Si, cov_inv_SiSi)[inds[j+1:]])
                mean_transform[i, inds[:j+1]] += coef_R_Si

                # - coef @ R(S)
                coef_R_S = np.matmul(self.coef[inds[j:]], np.matmul(cov_S, cov_inv_SS)[inds[j:]])
                mean_transform[i, inds[:j]] -= coef_R_S

                # - coef @ (Q(Sui) - Q(S))
                x_transform[i, i] += self.coef[i]

                # + coef @ R(Sui)
                x_transform[i, inds[:j+1]] += coef_R_Si

                # - coef @ R(S)
                x_transform[i, inds[:j]] -= coef_R_S

        # average the per-permutation contributions into the final estimates
        mean_transform /= nsamples
        x_transform /= nsamples
        return mean_transform, x_transform
247
+
248
+ @staticmethod
249
+ def _parse_model(model):
250
+ """ Attempt to pull out the coefficients and intercept from the given model object.
251
+ """
252
+ # raw coefficients
253
+ if type(model) == tuple and len(model) == 2:
254
+ coef = model[0]
255
+ intercept = model[1]
256
+
257
+ # sklearn style model
258
+ elif hasattr(model, "coef_") and hasattr(model, "intercept_"):
259
+ # work around for multi-class with a single class
260
+ if len(model.coef_.shape) > 1 and model.coef_.shape[0] == 1:
261
+ coef = model.coef_[0]
262
+ try:
263
+ intercept = model.intercept_[0]
264
+ except TypeError:
265
+ intercept = model.intercept_
266
+ else:
267
+ coef = model.coef_
268
+ intercept = model.intercept_
269
+ else:
270
+ raise InvalidModelError("An unknown model type was passed: " + str(type(model)))
271
+
272
+ return coef,intercept
273
+
274
+ @staticmethod
275
+ def supports_model_with_masker(model, masker):
276
+ """ Determines if we can parse the given model.
277
+ """
278
+
279
+ if not isinstance(masker, (maskers.Independent, maskers.Partition, maskers.Impute)):
280
+ return False
281
+
282
+ try:
283
+ LinearExplainer._parse_model(model)
284
+ except Exception:
285
+ return False
286
+ return True
287
+
288
+ def explain_row(self, *row_args, max_evals, main_effects, error_bounds, batch_size, outputs, silent):
289
+ """ Explains a single row and returns the tuple (row_values, row_expected_values, row_mask_shapes).
290
+ """
291
+
292
+ assert len(row_args) == 1, "Only single-argument functions are supported by the Linear explainer!"
293
+
294
+ X = row_args[0]
295
+ if len(X.shape) == 1:
296
+ X = X.reshape(1, -1)
297
+
298
+ # convert dataframes
299
+ if isinstance(X, (pd.Series, pd.DataFrame)):
300
+ X = X.values
301
+
302
+ if len(X.shape) not in (1, 2):
303
+ raise DimensionError("Instance must have 1 or 2 dimensions! Not: %s" %len(X.shape))
304
+
305
+ if self.feature_perturbation == "correlation_dependent":
306
+ if issparse(X):
307
+ raise InvalidFeaturePerturbationError("Only feature_perturbation = 'interventional' is supported for sparse data")
308
+ phi = np.matmul(np.matmul(X[:,self.valid_inds], self.avg_proj.T), self.x_transform.T) - self.mean_transformed
309
+ phi = np.matmul(phi, self.avg_proj)
310
+
311
+ full_phi = np.zeros((phi.shape[0], self.M))
312
+ full_phi[:,self.valid_inds] = phi
313
+ phi = full_phi
314
+
315
+ elif self.feature_perturbation == "interventional":
316
+ if issparse(X):
317
+ phi = np.array(np.multiply(X - self.mean, self.coef))
318
+
319
+ # if len(self.coef.shape) == 1:
320
+ # return np.array(np.multiply(X - self.mean, self.coef))
321
+ # else:
322
+ # return [np.array(np.multiply(X - self.mean, self.coef[i])) for i in range(self.coef.shape[0])]
323
+ else:
324
+ phi = np.array(X - self.mean) * self.coef
325
+ # if len(self.coef.shape) == 1:
326
+ # phi = np.array(X - self.mean) * self.coef
327
+ # return np.array(X - self.mean) * self.coef
328
+ # else:
329
+ # return [np.array(X - self.mean) * self.coef[i] for i in range(self.coef.shape[0])]
330
+
331
+ return {
332
+ "values": phi.T,
333
+ "expected_values": self.expected_value,
334
+ "mask_shapes": (X.shape[1:],),
335
+ "main_effects": phi.T,
336
+ "clustering": None
337
+ }
338
+
339
+
340
+ def shap_values(self, X):
341
+ """ Estimate the SHAP values for a set of samples.
342
+
343
+ Parameters
344
+ ----------
345
+ X : numpy.array, pandas.DataFrame or scipy.csr_matrix
346
+ A matrix of samples (# samples x # features) on which to explain the model's output.
347
+
348
+ Returns
349
+ -------
350
+ array or list
351
+ For models with a single output this returns a matrix of SHAP values
352
+ (# samples x # features). Each row sums to the difference between the model output for that
353
+ sample and the expected value of the model output (which is stored as expected_value
354
+ attribute of the explainer).
355
+ """
356
+
357
+ # convert dataframes
358
+ if isinstance(X, (pd.Series, pd.DataFrame)):
359
+ X = X.values
360
+
361
+ # assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))
362
+ if len(X.shape) not in (1, 2):
363
+ raise DimensionError("Instance must have 1 or 2 dimensions! Not: %s" % len(X.shape))
364
+
365
+ if self.feature_perturbation == "correlation_dependent":
366
+ if issparse(X):
367
+ raise InvalidFeaturePerturbationError("Only feature_perturbation = 'interventional' is supported for sparse data")
368
+ phi = np.matmul(np.matmul(X[:,self.valid_inds], self.avg_proj.T), self.x_transform.T) - self.mean_transformed
369
+ phi = np.matmul(phi, self.avg_proj)
370
+
371
+ full_phi = np.zeros((phi.shape[0], self.M))
372
+ full_phi[:,self.valid_inds] = phi
373
+
374
+ return full_phi
375
+
376
+ elif self.feature_perturbation == "interventional":
377
+ if issparse(X):
378
+ if len(self.coef.shape) == 1:
379
+ return np.array(np.multiply(X - self.mean, self.coef))
380
+ else:
381
+ return [np.array(np.multiply(X - self.mean, self.coef[i])) for i in range(self.coef.shape[0])]
382
+ else:
383
+ if len(self.coef.shape) == 1:
384
+ return np.array(X - self.mean) * self.coef
385
+ else:
386
+ return [np.array(X - self.mean) * self.coef[i] for i in range(self.coef.shape[0])]
387
+
388
def duplicate_components(C):
    """Group perfectly correlated features of the covariance matrix ``C``.

    Returns a pair ``(avg_proj, sum_proj)`` of projection matrices with one row
    per group of duplicate features: ``avg_proj`` averages the members of each
    group and ``sum_proj`` sums them.
    """
    # rescale C to a correlation-like matrix so duplicates can be detected
    # by comparing covariances against variances directly
    inv_std = np.diag(1 / np.sqrt(np.diag(C)))
    corr = np.matmul(np.matmul(inv_std, C), inv_std)
    n = corr.shape[0]

    # assign each feature a component id; features i and j share a component
    # when 2*corr[i,j] == corr[i,i] + corr[j,j] (perfect correlation)
    components = -np.ones(n, dtype=int)
    num_groups = 0
    for i in range(n):
        started_group = False
        for j in range(n):
            if components[j] < 0 and np.abs(2 * corr[i, j] - corr[i, i] - corr[j, j]) < 1e-8:
                if not started_group:
                    num_groups += 1
                    started_group = True
                components[j] = num_groups - 1

    # build the (groups x features) indicator matrix, then derive the averaging form
    proj = np.zeros((len(np.unique(components)), n))
    proj[0, 0] = 1
    for i in range(1, n):
        proj[components[i], i] = 1
    return (proj.T / proj.sum(1)).T, proj
lib/shap/explainers/_partition.py ADDED
@@ -0,0 +1,681 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import queue
2
+ import time
3
+
4
+ import numpy as np
5
+ from numba import njit
6
+ from tqdm.auto import tqdm
7
+
8
+ from .. import Explanation, links
9
+ from ..models import Model
10
+ from ..utils import MaskedModel, OpChain, make_masks, safe_isinstance
11
+ from ._explainer import Explainer
12
+
13
+
14
+ class PartitionExplainer(Explainer):
15
+ """Uses the Partition SHAP method to explain the output of any function.
16
+
17
+ Partition SHAP computes Shapley values recursively through a hierarchy of features, this
18
+ hierarchy defines feature coalitions and results in the Owen values from game theory.
19
+
20
+ The PartitionExplainer has two particularly nice properties:
21
+
22
+ 1) PartitionExplainer is model-agnostic but when using a balanced partition tree only has
23
+ quadratic exact runtime (in term of the number of input features). This is in contrast to the
24
+ exponential exact runtime of KernelExplainer or SamplingExplainer.
25
+ 2) PartitionExplainer always assigns to groups of correlated features the credit that set of features
26
+ would have had if treated as a group. This means if the hierarchical clustering given to
27
+ PartitionExplainer groups correlated features together, then feature correlations are
28
+ "accounted for" in the sense that the total credit assigned to a group of tightly dependent features
29
+ does not depend on how they behave if their correlation structure was broken during the explanation's
30
+ perturbation process.
31
+ Note that for linear models the Owen values that PartitionExplainer returns are the same as the standard
32
+ non-hierarchical Shapley values.
33
+ """
34
+
35
    def __init__(self, model, masker, *, output_names=None, link=links.identity, linearize_link=True,
                 feature_names=None, **call_args):
        """Build a PartitionExplainer for the given model with the given masker.

        Parameters
        ----------
        model : function
            User supplied function that takes a matrix of samples (# samples x # features) and
            computes the output of the model for those samples.

        masker : function or numpy.array or pandas.DataFrame or tokenizer
            The function used to "mask" out hidden features of the form `masker(mask, x)`. It takes a
            single input sample and a binary mask and returns a matrix of masked samples. These
            masked samples will then be evaluated using the model function and the outputs averaged.
            As a shortcut for the standard masking used by SHAP you can pass a background data matrix
            instead of a function and that matrix will be used for masking. Domain specific masking
            functions are available in shap such as shap.maskers.Image for images and shap.maskers.Text
            for text.

        partition_tree : None or function or numpy.array
            A hierarchical clustering of the input features represented by a matrix that follows the format
            used by scipy.cluster.hierarchy (see the notebooks_html/partition_explainer directory an example).
            If this is a function then the function produces a clustering matrix when given a single input
            example. If you are using a standard SHAP masker object then you can pass masker.clustering
            to use that masker's built-in clustering of the features, or if partition_tree is None then
            masker.clustering will be used by default.

        Examples
        --------
        See `Partition explainer examples <https://shap.readthedocs.io/en/latest/api_examples/explainers/PartitionExplainer.html>`_
        """

        super().__init__(model, masker, link=link, linearize_link=linearize_link, algorithm="partition", \
                         output_names = output_names, feature_names=feature_names)

        # TODO: maybe? if we have a tabular masker then we build a PermutationExplainer that we
        # will use for sampling
        # masker.shape may be a callable on dynamic maskers, in which case we cannot
        # know the input shape up front
        self.input_shape = masker.shape[1:] if hasattr(masker, "shape") and not callable(masker.shape) else None
        # wrap plain callables so downstream code can rely on the shap Model interface
        if not safe_isinstance(self.model, "shap.models.Model"):
            self.model = Model(self.model)
        self.expected_value = None
        # cached base value; recomputed per row unless the masker has a fixed background
        self._curr_base_value = None
        if getattr(self.masker, "clustering", None) is None:
            raise ValueError("The passed masker must have a .clustering attribute defined! Try shap.maskers.Partition(data) for example.")

        # handle higher dimensional tensor inputs by flattening/reshaping around the model call
        if self.input_shape is not None and len(self.input_shape) > 1:
            self._reshaped_model = lambda x: self.model(x.reshape(x.shape[0], *self.input_shape))
        else:
            self._reshaped_model = self.model

        # if we don't have a dynamic clustering algorithm then we can precompute
        # the clustering and its mask matrix once here
        if not callable(self.masker.clustering):
            self._clustering = self.masker.clustering
            self._mask_matrix = make_masks(self._clustering)

        # if we have gotten default arguments for the call function we need to wrap ourselves in a new class that
        # has a call function with those new default arguments
        if len(call_args) > 0:
            # dynamically subclass so the overridden defaults live on the instance's
            # class without affecting other PartitionExplainer instances
            class PartitionExplainer(self.__class__):
                # this signature should match the __call__ signature of the class defined below
                def __call__(self, *args, max_evals=500, fixed_context=None, main_effects=False, error_bounds=False, batch_size="auto",
                             outputs=None, silent=False):
                    return super().__call__(
                        *args, max_evals=max_evals, fixed_context=fixed_context, main_effects=main_effects, error_bounds=error_bounds,
                        batch_size=batch_size, outputs=outputs, silent=silent
                    )
            PartitionExplainer.__call__.__doc__ = self.__class__.__call__.__doc__
            self.__class__ = PartitionExplainer
            # splice the user-supplied defaults into the new __call__'s keyword defaults
            for k, v in call_args.items():
                self.__call__.__kwdefaults__[k] = v
123
+
124
+ # note that changes to this function signature should be copied to the default call argument wrapper above
125
+ def __call__(self, *args, max_evals=500, fixed_context=None, main_effects=False, error_bounds=False, batch_size="auto",
126
+ outputs=None, silent=False):
127
+ """ Explain the output of the model on the given arguments.
128
+ """
129
+ return super().__call__(
130
+ *args, max_evals=max_evals, fixed_context=fixed_context, main_effects=main_effects, error_bounds=error_bounds, batch_size=batch_size,
131
+ outputs=outputs, silent=silent
132
+ )
133
+
134
    def explain_row(self, *row_args, max_evals, main_effects, error_bounds, batch_size, outputs, silent, fixed_context = "auto"):
        """ Explains a single row and returns the tuple (row_values, row_expected_values, row_mask_shapes).
        """

        if fixed_context == "auto":
            # if isinstance(self.masker, maskers.Text):
            #     fixed_context = 1 # we err on the side of speed for text models
            # else:
            fixed_context = None
        elif fixed_context not in [0, 1, None]:
            raise ValueError("Unknown fixed_context value passed (must be 0, 1 or None): %s" %fixed_context)

        # build a masked version of the model for the current input sample
        fm = MaskedModel(self.model, self.masker, self.link, self.linearize_link, *row_args)

        # make sure we have the base value and current value outputs
        M = len(fm)
        m00 = np.zeros(M, dtype=bool)  # the all-masked coalition
        # if not fixed background or no base value assigned then compute base value for a row
        if self._curr_base_value is None or not getattr(self.masker, "fixed_background", False):
            self._curr_base_value = fm(m00.reshape(1, -1), zero_index=0)[0] # the zero index param tells the masked model what the baseline is
        f11 = fm(~m00.reshape(1, -1))[0]  # model output with every feature unmasked

        # dynamic clustering: recompute the hierarchy and mask matrix for this row
        if callable(self.masker.clustering):
            self._clustering = self.masker.clustering(*row_args)
            self._mask_matrix = make_masks(self._clustering)

        # multi-output models produce a base value with a shape; resolve which
        # outputs to explain (all by default, or via an OpChain selector)
        if hasattr(self._curr_base_value, 'shape') and len(self._curr_base_value.shape) > 0:
            if outputs is None:
                outputs = np.arange(len(self._curr_base_value))
            elif isinstance(outputs, OpChain):
                outputs = outputs.apply(Explanation(f11)).values

            out_shape = (2*self._clustering.shape[0]+1, len(outputs))
        else:
            out_shape = (2*self._clustering.shape[0]+1,)

        if max_evals == "auto":
            max_evals = 500

        # per-node value buffers over the full hierarchy (leaves + internal nodes)
        self.values = np.zeros(out_shape)
        self.dvalues = np.zeros(out_shape)

        # recursively compute the Owen values (fills self.dvalues);
        # max_evals - 2 accounts for the two evaluations already spent above
        self.owen(fm, self._curr_base_value, f11, max_evals - 2, outputs, fixed_context, batch_size, silent)

        # drop the interaction terms down onto self.values
        self.values[:] = self.dvalues

        # push credit left on internal nodes down to the leaf features
        lower_credit(len(self.dvalues) - 1, 0, M, self.values, self._clustering)

        return {
            "values": self.values[:M].copy(),
            "expected_values": self._curr_base_value if outputs is None else self._curr_base_value[outputs],
            "mask_shapes": [s + out_shape[1:] for s in fm.mask_shapes],
            "main_effects": None,
            "hierarchical_values": self.dvalues.copy(),
            "clustering": self._clustering,
            "output_indices": outputs,
            "output_names": getattr(self.model, "output_names", None)
        }
200
+
201
+ def __str__(self):
202
+ return "shap.explainers.PartitionExplainer()"
203
+
204
+ def owen(self, fm, f00, f11, max_evals, output_indexes, fixed_context, batch_size, silent):
205
+ """ Compute a nested set of recursive Owen values based on an ordering recursion.
206
+ """
207
+
208
+ #f = self._reshaped_model
209
+ #r = self.masker
210
+ #masks = np.zeros(2*len(inds)+1, dtype=int)
211
+ M = len(fm)
212
+ m00 = np.zeros(M, dtype=bool)
213
+ #f00 = fm(m00.reshape(1,-1))[0]
214
+ base_value = f00
215
+ #f11 = fm(~m00.reshape(1,-1))[0]
216
+ #f11 = self._reshaped_model(r(~m00, x)).mean(0)
217
+ ind = len(self.dvalues)-1
218
+
219
+ # make sure output_indexes is a list of indexes
220
+ if output_indexes is not None:
221
+ # assert self.multi_output, "output_indexes is only valid for multi-output models!"
222
+ # inds = output_indexes.apply(f11, 0)
223
+ # out_len = output_indexes_len(output_indexes)
224
+ # if output_indexes.startswith("max("):
225
+ # output_indexes = np.argsort(-f11)[:out_len]
226
+ # elif output_indexes.startswith("min("):
227
+ # output_indexes = np.argsort(f11)[:out_len]
228
+ # elif output_indexes.startswith("max(abs("):
229
+ # output_indexes = np.argsort(np.abs(f11))[:out_len]
230
+
231
+ f00 = f00[output_indexes]
232
+ f11 = f11[output_indexes]
233
+
234
+ q = queue.PriorityQueue()
235
+ q.put((0, 0, (m00, f00, f11, ind, 1.0)))
236
+ eval_count = 0
237
+ total_evals = min(max_evals, (M-1)*M) # TODO: (M-1)*M is only right for balanced clusterings, but this is just for plotting progress...
238
+ pbar = None
239
+ start_time = time.time()
240
+ while not q.empty():
241
+
242
+ # if we passed our execution limit then leave everything else on the internal nodes
243
+ if eval_count >= max_evals:
244
+ while not q.empty():
245
+ m00, f00, f11, ind, weight = q.get()[2]
246
+ self.dvalues[ind] += (f11 - f00) * weight
247
+ break
248
+
249
+ # create a batch of work to do
250
+ batch_args = []
251
+ batch_masks = []
252
+ while not q.empty() and len(batch_masks) < batch_size and eval_count + len(batch_masks) < max_evals:
253
+
254
+ # get our next set of arguments
255
+ m00, f00, f11, ind, weight = q.get()[2]
256
+
257
+ # get the left and right children of this cluster
258
+ lind = int(self._clustering[ind-M, 0]) if ind >= M else -1
259
+ rind = int(self._clustering[ind-M, 1]) if ind >= M else -1
260
+
261
+ # get the distance of this cluster's children
262
+ if ind < M:
263
+ distance = -1
264
+ else:
265
+ if self._clustering.shape[1] >= 3:
266
+ distance = self._clustering[ind-M, 2]
267
+ else:
268
+ distance = 1
269
+
270
+ # check if we are a leaf node (or other negative distance cluster) and so should terminate our decent
271
+ if distance < 0:
272
+ self.dvalues[ind] += (f11 - f00) * weight
273
+ continue
274
+
275
+ # build the masks
276
+ m10 = m00.copy() # we separate the copy from the add so as to not get converted to a matrix
277
+ m10[:] += self._mask_matrix[lind, :]
278
+ m01 = m00.copy()
279
+ m01[:] += self._mask_matrix[rind, :]
280
+
281
+ batch_args.append((m00, m10, m01, f00, f11, ind, lind, rind, weight))
282
+ batch_masks.append(m10)
283
+ batch_masks.append(m01)
284
+
285
+ batch_masks = np.array(batch_masks)
286
+
287
+ # run the batch
288
+ if len(batch_args) > 0:
289
+ fout = fm(batch_masks)
290
+ if output_indexes is not None:
291
+ fout = fout[:,output_indexes]
292
+
293
+ eval_count += len(batch_masks)
294
+
295
+ if pbar is None and time.time() - start_time > 5:
296
+ pbar = tqdm(total=total_evals, disable=silent, leave=False)
297
+ pbar.update(eval_count)
298
+ if pbar is not None:
299
+ pbar.update(len(batch_masks))
300
+
301
+ # use the results of the batch to add new nodes
302
+ for i in range(len(batch_args)):
303
+
304
+ m00, m10, m01, f00, f11, ind, lind, rind, weight = batch_args[i]
305
+
306
+ # get the evaluated model output on the two new masked inputs
307
+ f10 = fout[2*i]
308
+ f01 = fout[2*i+1]
309
+
310
+ new_weight = weight
311
+ if fixed_context is None:
312
+ new_weight /= 2
313
+ elif fixed_context == 0:
314
+ self.dvalues[ind] += (f11 - f10 - f01 + f00) * weight # leave the interaction effect on the internal node
315
+ elif fixed_context == 1:
316
+ self.dvalues[ind] -= (f11 - f10 - f01 + f00) * weight # leave the interaction effect on the internal node
317
+
318
+ if fixed_context is None or fixed_context == 0:
319
+ # recurse on the left node with zero context
320
+ args = (m00, f00, f10, lind, new_weight)
321
+ q.put((-np.max(np.abs(f10 - f00)) * new_weight, np.random.randn(), args))
322
+
323
+ # recurse on the right node with zero context
324
+ args = (m00, f00, f01, rind, new_weight)
325
+ q.put((-np.max(np.abs(f01 - f00)) * new_weight, np.random.randn(), args))
326
+
327
+ if fixed_context is None or fixed_context == 1:
328
+ # recurse on the left node with one context
329
+ args = (m01, f01, f11, lind, new_weight)
330
+ q.put((-np.max(np.abs(f11 - f01)) * new_weight, np.random.randn(), args))
331
+
332
+ # recurse on the right node with one context
333
+ args = (m10, f10, f11, rind, new_weight)
334
+ q.put((-np.max(np.abs(f11 - f10)) * new_weight, np.random.randn(), args))
335
+
336
+ if pbar is not None:
337
+ pbar.close()
338
+
339
+ self.last_eval_count = eval_count
340
+
341
+ return output_indexes, base_value
342
+
343
    def owen3(self, fm, f00, f11, max_evals, output_indexes, fixed_context, batch_size, silent):
        """ Compute a nested set of recursive Owen values based on an ordering recursion.

        Credit for each node of the feature hierarchy is accumulated into
        ``self.dvalues``. A max-first priority queue (keyed on the largest remaining
        attribution gap) decides which cluster to expand next, so the evaluation
        budget ``max_evals`` is spent on the clusters that matter most.

        Parameters
        ----------
        fm : callable (MaskedModel-like)
            ``fm(masks)`` evaluates the model on a batch of boolean masks and
            ``len(fm)`` gives the number of maskable inputs.
        f00 : numpy.ndarray
            Model output with everything masked (all-False mask); used as the base value.
        f11 : numpy.ndarray
            Model output with nothing masked.
        max_evals : int
            Budget of masked-model evaluations.
        output_indexes : array-like or None
            Optional subset of model outputs to explain; ``f00``/``f11``/batch outputs
            are sliced down to these indexes.
        fixed_context : None, 0 or 1
            Starting context for the Owen recursion; with remaining budget, a node may
            temporarily ignore its fixed context and explore both sides (see below).
        batch_size : int
            Number of masks evaluated per model call.
        silent : bool
            Disable the progress bar.

        Returns
        -------
        tuple
            ``(output_indexes, base_value)``; the attributions themselves are left in
            ``self.dvalues``.
        """

        #f = self._reshaped_model
        #r = self.masker
        #masks = np.zeros(2*len(inds)+1, dtype=int)
        M = len(fm)
        m00 = np.zeros(M, dtype=bool)
        #f00 = fm(m00.reshape(1,-1))[0]
        base_value = f00
        #f11 = fm(~m00.reshape(1,-1))[0]
        #f11 = self._reshaped_model(r(~m00, x)).mean(0)
        # the root of the clustering tree is the last entry of dvalues
        ind = len(self.dvalues)-1

        # make sure output_indexes is a list of indexes
        if output_indexes is not None:
            # assert self.multi_output, "output_indexes is only valid for multi-output models!"
            # inds = output_indexes.apply(f11, 0)
            # out_len = output_indexes_len(output_indexes)
            # if output_indexes.startswith("max("):
            #     output_indexes = np.argsort(-f11)[:out_len]
            # elif output_indexes.startswith("min("):
            #     output_indexes = np.argsort(f11)[:out_len]
            # elif output_indexes.startswith("max(abs("):
            #     output_indexes = np.argsort(np.abs(f11))[:out_len]

            f00 = f00[output_indexes]
            f11 = f11[output_indexes]

        # our starting plan is to evaluate all the nodes with a fixed_context
        evals_planned = M

        q = queue.PriorityQueue()
        # payload tuple: (mask, f(mask), f(mask | subtree), tree_index, weight, context)
        q.put((0, 0, (m00, f00, f11, ind, 1.0, fixed_context)))
        eval_count = 0
        total_evals = min(max_evals, (M-1)*M) # TODO: (M-1)*M is only right for balanced clusterings, but this is just for plotting progress...
        pbar = None
        start_time = time.time()
        while not q.empty():

            # if we passed our execution limit then leave everything else on the internal nodes
            if eval_count >= max_evals:
                while not q.empty():
                    m00, f00, f11, ind, weight, _ = q.get()[2]
                    self.dvalues[ind] += (f11 - f00) * weight
                break

            # create a batch of work to do
            batch_args = []
            batch_masks = []
            while not q.empty() and len(batch_masks) < batch_size and eval_count < max_evals:

                # get our next set of arguments
                m00, f00, f11, ind, weight, context = q.get()[2]

                # get the left and right children of this cluster
                lind = int(self._clustering[ind-M, 0]) if ind >= M else -1
                rind = int(self._clustering[ind-M, 1]) if ind >= M else -1

                # get the distance of this cluster's children
                if ind < M:
                    distance = -1
                else:
                    distance = self._clustering[ind-M, 2]

                # check if we are a leaf node (or other negative distance cluster) and so should terminate our decent
                if distance < 0:
                    self.dvalues[ind] += (f11 - f00) * weight
                    continue

                # build the masks
                m10 = m00.copy() # we separate the copy from the add so as to not get converted to a matrix
                m10[:] += self._mask_matrix[lind, :]
                m01 = m00.copy()
                m01[:] += self._mask_matrix[rind, :]

                batch_args.append((m00, m10, m01, f00, f11, ind, lind, rind, weight, context))
                batch_masks.append(m10)
                batch_masks.append(m01)

            batch_masks = np.array(batch_masks)

            # run the batch
            if len(batch_args) > 0:
                fout = fm(batch_masks)
                if output_indexes is not None:
                    fout = fout[:,output_indexes]

            eval_count += len(batch_masks)

            # lazily create the progress bar only for long-running computations (>5s)
            if pbar is None and time.time() - start_time > 5:
                pbar = tqdm(total=total_evals, disable=silent, leave=False)
                pbar.update(eval_count)
            if pbar is not None:
                pbar.update(len(batch_masks))

            # use the results of the batch to add new nodes
            for i in range(len(batch_args)):

                m00, m10, m01, f00, f11, ind, lind, rind, weight, context = batch_args[i]

                # get the the number of leaves in this cluster
                if ind < M:
                    num_leaves = 0
                else:
                    num_leaves = self._clustering[ind-M, 3]

                # get the evaluated model output on the two new masked inputs
                f10 = fout[2*i]
                f01 = fout[2*i+1]

                # see if we have enough evaluations left to get both sides of a fixed context
                if max_evals - evals_planned > num_leaves:
                    evals_planned += num_leaves
                    ignore_context = True
                else:
                    ignore_context = False

                new_weight = weight
                # exploring both contexts halves the weight of each branch
                if context is None or ignore_context:
                    new_weight /= 2

                if context is None or context == 0 or ignore_context:
                    self.dvalues[ind] += (f11 - f10 - f01 + f00) * weight # leave the interaction effect on the internal node

                    # recurse on the left node with zero context, flip the context for all descendents if we are ignoring it
                    args = (m00, f00, f10, lind, new_weight, 0 if context == 1 else context)
                    # NOTE: priorities are negated max-abs gaps (max-first queue); the
                    # random second element breaks ties without comparing array payloads
                    q.put((-np.max(np.abs(f10 - f00)) * new_weight, np.random.randn(), args))

                    # recurse on the right node with zero context, flip the context for all descendents if we are ignoring it
                    args = (m00, f00, f01, rind, new_weight, 0 if context == 1 else context)
                    q.put((-np.max(np.abs(f01 - f00)) * new_weight, np.random.randn(), args))

                if context is None or context == 1 or ignore_context:
                    self.dvalues[ind] -= (f11 - f10 - f01 + f00) * weight # leave the interaction effect on the internal node

                    # recurse on the left node with one context, flip the context for all descendents if we are ignoring it
                    args = (m01, f01, f11, lind, new_weight, 1 if context == 0 else context)
                    q.put((-np.max(np.abs(f11 - f01)) * new_weight, np.random.randn(), args))

                    # recurse on the right node with one context, flip the context for all descendents if we are ignoring it
                    args = (m10, f10, f11, rind, new_weight, 1 if context == 0 else context)
                    q.put((-np.max(np.abs(f11 - f10)) * new_weight, np.random.randn(), args))

        if pbar is not None:
            pbar.close()

        self.last_eval_count = eval_count

        return output_indexes, base_value
494
+
495
+
496
+
497
+ # def owen2(self, fm, f00, f11, max_evals, output_indexes, fixed_context, batch_size, silent):
498
+ # """ Compute a nested set of recursive Owen values based on an ordering recursion.
499
+ # """
500
+
501
+ # #f = self._reshaped_model
502
+ # #r = self.masker
503
+ # #masks = np.zeros(2*len(inds)+1, dtype=int)
504
+ # M = len(fm)
505
+ # m00 = np.zeros(M, dtype=bool)
506
+ # #f00 = fm(m00.reshape(1,-1))[0]
507
+ # base_value = f00
508
+ # #f11 = fm(~m00.reshape(1,-1))[0]
509
+ # #f11 = self._reshaped_model(r(~m00, x)).mean(0)
510
+ # ind = len(self.dvalues)-1
511
+
512
+ # # make sure output_indexes is a list of indexes
513
+ # if output_indexes is not None:
514
+ # # assert self.multi_output, "output_indexes is only valid for multi-output models!"
515
+ # # inds = output_indexes.apply(f11, 0)
516
+ # # out_len = output_indexes_len(output_indexes)
517
+ # # if output_indexes.startswith("max("):
518
+ # # output_indexes = np.argsort(-f11)[:out_len]
519
+ # # elif output_indexes.startswith("min("):
520
+ # # output_indexes = np.argsort(f11)[:out_len]
521
+ # # elif output_indexes.startswith("max(abs("):
522
+ # # output_indexes = np.argsort(np.abs(f11))[:out_len]
523
+
524
+ # f00 = f00[output_indexes]
525
+ # f11 = f11[output_indexes]
526
+
527
+ # fc_owen(m00, m11, 1)
528
+ # fc_owen(m00, m11, 0)
529
+
530
+ # def fc_owen(m00, m11, context):
531
+
532
+ # # recurse on the left node with zero context
533
+ # args = (m00, f00, f10, lind, new_weight)
534
+ # q.put((-np.max(np.abs(f10 - f00)) * new_weight, np.random.randn(), args))
535
+
536
+ # # recurse on the right node with zero context
537
+ # args = (m00, f00, f01, rind, new_weight)
538
+ # q.put((-np.max(np.abs(f01 - f00)) * new_weight, np.random.randn(), args))
539
+ # fc_owen(m00, m11, 1)
540
+ # m00 m11
541
+ # owen(fc=1)
542
+ # owen(fc=0)
543
+
544
+ # q = queue.PriorityQueue()
545
+ # q.put((0, 0, (m00, f00, f11, ind, 1.0, 1)))
546
+ # eval_count = 0
547
+ # total_evals = min(max_evals, (M-1)*M) # TODO: (M-1)*M is only right for balanced clusterings, but this is just for plotting progress...
548
+ # pbar = None
549
+ # start_time = time.time()
550
+ # while not q.empty():
551
+
552
+ # # if we passed our execution limit then leave everything else on the internal nodes
553
+ # if eval_count >= max_evals:
554
+ # while not q.empty():
555
+ # m00, f00, f11, ind, weight, _ = q.get()[2]
556
+ # self.dvalues[ind] += (f11 - f00) * weight
557
+ # break
558
+
559
+ # # create a batch of work to do
560
+ # batch_args = []
561
+ # batch_masks = []
562
+ # while not q.empty() and len(batch_masks) < batch_size and eval_count < max_evals:
563
+
564
+ # # get our next set of arguments
565
+ # m00, f00, f11, ind, weight, context = q.get()[2]
566
+
567
+ # # get the left and right children of this cluster
568
+ # lind = int(self._clustering[ind-M, 0]) if ind >= M else -1
569
+ # rind = int(self._clustering[ind-M, 1]) if ind >= M else -1
570
+
571
+ # # get the distance of this cluster's children
572
+ # if ind < M:
573
+ # distance = -1
574
+ # else:
575
+ # if self._clustering.shape[1] >= 3:
576
+ # distance = self._clustering[ind-M, 2]
577
+ # else:
578
+ # distance = 1
579
+
580
+ # # check if we are a leaf node (or other negative distance cluster) and so should terminate our decent
581
+ # if distance < 0:
582
+ # self.dvalues[ind] += (f11 - f00) * weight
583
+ # continue
584
+
585
+ # # build the masks
586
+ # m10 = m00.copy() # we separate the copy from the add so as to not get converted to a matrix
587
+ # m10[:] += self._mask_matrix[lind, :]
588
+ # m01 = m00.copy()
589
+ # m01[:] += self._mask_matrix[rind, :]
590
+
591
+ # batch_args.append((m00, m10, m01, f00, f11, ind, lind, rind, weight, context))
592
+ # batch_masks.append(m10)
593
+ # batch_masks.append(m01)
594
+
595
+ # batch_masks = np.array(batch_masks)
596
+
597
+ # # run the batch
598
+ # if len(batch_args) > 0:
599
+ # fout = fm(batch_masks)
600
+ # if output_indexes is not None:
601
+ # fout = fout[:,output_indexes]
602
+
603
+ # eval_count += len(batch_masks)
604
+
605
+ # if pbar is None and time.time() - start_time > 5:
606
+ # pbar = tqdm(total=total_evals, disable=silent, leave=False)
607
+ # pbar.update(eval_count)
608
+ # if pbar is not None:
609
+ # pbar.update(len(batch_masks))
610
+
611
+ # # use the results of the batch to add new nodes
612
+ # for i in range(len(batch_args)):
613
+
614
+ # m00, m10, m01, f00, f11, ind, lind, rind, weight, context = batch_args[i]
615
+
616
+ # # get the evaluated model output on the two new masked inputs
617
+ # f10 = fout[2*i]
618
+ # f01 = fout[2*i+1]
619
+
620
+ # new_weight = weight
621
+ # if fixed_context is None:
622
+ # new_weight /= 2
623
+ # elif fixed_context == 0:
624
+ # self.dvalues[ind] += (f11 - f10 - f01 + f00) * weight # leave the interaction effect on the internal node
625
+ # elif fixed_context == 1:
626
+ # self.dvalues[ind] -= (f11 - f10 - f01 + f00) * weight # leave the interaction effect on the internal node
627
+
628
+ # if fixed_context is None or fixed_context == 0:
629
+ # self.dvalues[ind] += (f11 - f10 - f01 + f00) * weight # leave the interaction effect on the internal node
630
+
631
+
632
+ # # recurse on the left node with zero context
633
+ # args = (m00, f00, f10, lind, new_weight)
634
+ # q.put((-np.max(np.abs(f10 - f00)) * new_weight, np.random.randn(), args))
635
+
636
+ # # recurse on the right node with zero context
637
+ # args = (m00, f00, f01, rind, new_weight)
638
+ # q.put((-np.max(np.abs(f01 - f00)) * new_weight, np.random.randn(), args))
639
+
640
+ # if fixed_context is None or fixed_context == 1:
641
+ # self.dvalues[ind] -= (f11 - f10 - f01 + f00) * weight # leave the interaction effect on the internal node
642
+
643
+
644
+ # # recurse on the left node with one context
645
+ # args = (m01, f01, f11, lind, new_weight)
646
+ # q.put((-np.max(np.abs(f11 - f01)) * new_weight, np.random.randn(), args))
647
+
648
+ # # recurse on the right node with one context
649
+ # args = (m10, f10, f11, rind, new_weight)
650
+ # q.put((-np.max(np.abs(f11 - f10)) * new_weight, np.random.randn(), args))
651
+
652
+ # if pbar is not None:
653
+ # pbar.close()
654
+
655
+ # return output_indexes, base_value
656
+
657
+
658
def output_indexes_len(output_indexes):
    """ Return how many model outputs an ``output_indexes`` specification selects.

    ``output_indexes`` is either a string of the form ``"max(k)"``, ``"min(k)"``
    or ``"max(abs(k))"`` (select the top/bottom ``k`` outputs), or a concrete
    sequence of output indexes (in which case its length is returned).

    Returns ``None`` for an unrecognized string specification (matching the
    original implicit fall-through behavior).
    """
    # Non-strings are concrete index collections. This check must come first:
    # calling .startswith on a list/array would raise AttributeError.
    if not isinstance(output_indexes, str):
        return len(output_indexes)
    # "max(abs(" must be tested before "max(" — both prefixes match "max(abs(k))",
    # and the shorter one would try int("abs(k)") and raise ValueError.
    if output_indexes.startswith("max(abs("):
        return int(output_indexes[8:-2])
    if output_indexes.startswith("max("):
        return int(output_indexes[4:-1])
    if output_indexes.startswith("min("):
        return int(output_indexes[4:-1])
    return None
667
+
668
@njit
def lower_credit(i, value, M, values, clustering):
    """ Recursively push credit assigned to node ``i`` down to the leaf features.

    ``clustering`` is a scipy-linkage-style matrix: rows are internal nodes,
    columns 0/1 are the child indexes and column 3 is the subtree leaf count.
    Leaves are indexes ``< M``; internal node ``i`` maps to row ``i - M``.
    Credit is split between children proportionally to their leaf counts.
    Numba-jitted (@njit), hence the plain-loop/assert style.
    """
    # leaf: deposit the credit and stop
    if i < M:
        values[i] += value
        return
    li = int(clustering[i-M,0])
    ri = int(clustering[i-M,1])
    group_size = int(clustering[i-M,3])
    lsize = int(clustering[li-M,3]) if li >= M else 1
    rsize = int(clustering[ri-M,3]) if ri >= M else 1
    assert lsize+rsize == group_size
    values[i] += value
    # NOTE(review): the recursion distributes the node's full accumulated total
    # values[i] (not just the newly added `value`) — presumably intentional since
    # each node is visited once per top-level call, but verify against callers.
    lower_credit(li, values[i] * lsize / group_size, M, values, clustering)
    lower_credit(ri, values[i] * rsize / group_size, M, values, clustering)
lib/shap/explainers/_permutation.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ import numpy as np
4
+
5
+ from .. import links
6
+ from ..models import Model
7
+ from ..utils import MaskedModel, partition_tree_shuffle
8
+ from ._explainer import Explainer
9
+
10
+
11
class PermutationExplainer(Explainer):
    """ This method approximates the Shapley values by iterating through permutations of the inputs.

    This is a model agnostic explainer that guarantees local accuracy (additivity) by iterating completely
    through an entire permutation of the features in both forward and reverse directions (antithetic sampling).
    If we do this once, then we get the exact SHAP values for models with up to second order interaction effects.
    We can iterate this many times over many random permutations to get better SHAP value estimates for models
    with higher order interactions. This sequential ordering formulation also allows for easy reuse of
    model evaluations and the ability to efficiently avoid evaluating the model when the background values
    for a feature are the same as the current input value. We can also account for hierarchical data
    structures with partition trees, something not currently implemented for KernalExplainer or SamplingExplainer.
    """

    def __init__(self, model, masker, link=links.identity, feature_names=None, linearize_link=True, seed=None, **call_args):
        """ Build an explainers.Permutation object for the given model using the given masker object.

        Parameters
        ----------
        model : function
            A callable python object that executes the model given a set of input data samples.

        masker : function or numpy.array or pandas.DataFrame
            A callable python object used to "mask" out hidden features of the form `masker(binary_mask, x)`.
            It takes a single input sample and a binary mask and returns a matrix of masked samples. These
            masked samples are evaluated using the model function and the outputs are then averaged.
            As a shortcut for the standard masking using by SHAP you can pass a background data matrix
            instead of a function and that matrix will be used for masking. To use a clustering
            game structure you can pass a shap.maskers.Tabular(data, clustering=\"correlation\") object.

        seed: None or int
            Seed for reproducibility

        **call_args : valid argument to the __call__ method
            These arguments are saved and passed to the __call__ method as the new default values for these arguments.
        """

        # setting seed for random generation: if seed is not None, then shap values computation should be reproducible
        # NOTE(review): this seeds the *global* numpy RNG, affecting other numpy users in the process
        np.random.seed(seed)

        if masker is None:
            raise ValueError("masker cannot be None.")

        super().__init__(model, masker, link=link, linearize_link=linearize_link, feature_names=feature_names)

        if not isinstance(self.model, Model):
            self.model = Model(self.model)

        # if we have gotten default arguments for the call function we need to wrap ourselves in a new class that
        # has a call function with those new default arguments
        if len(call_args) > 0:
            # this signature should match the __call__ signature of the class defined below
            class PermutationExplainer(self.__class__):
                def __call__(self, *args, max_evals=500, main_effects=False, error_bounds=False, batch_size="auto",
                             outputs=None, silent=False):
                    return super().__call__(
                        *args, max_evals=max_evals, main_effects=main_effects, error_bounds=error_bounds,
                        batch_size=batch_size, outputs=outputs, silent=silent
                    )
            PermutationExplainer.__call__.__doc__ = self.__class__.__call__.__doc__
            # swap this instance onto the subclass, then bake the saved kwargs in as its new defaults
            self.__class__ = PermutationExplainer
            for k, v in call_args.items():
                self.__call__.__kwdefaults__[k] = v

    # note that changes to this function signature should be copied to the default call argument wrapper above
    def __call__(self, *args, max_evals=500, main_effects=False, error_bounds=False, batch_size="auto",
                 outputs=None, silent=False):
        """ Explain the output of the model on the given arguments.
        """
        return super().__call__(
            *args, max_evals=max_evals, main_effects=main_effects, error_bounds=error_bounds, batch_size=batch_size,
            outputs=outputs, silent=silent
        )

    def explain_row(self, *row_args, max_evals, main_effects, error_bounds, batch_size, outputs, silent):
        """ Explains a single row and returns the tuple (row_values, row_expected_values, row_mask_shapes).

        NOTE(review): the incoming ``outputs`` parameter is rebound to the masked-model
        outputs below and is never read beforehand — the argument is effectively unused here.
        """

        # build a masked version of the model for the current input sample
        fm = MaskedModel(self.model, self.masker, self.link, self.linearize_link, *row_args)

        # by default we run 10 permutations forward and backward
        if max_evals == "auto":
            max_evals = 10 * 2 * len(fm)

        # compute any custom clustering for this row
        row_clustering = None
        if getattr(self.masker, "clustering", None) is not None:
            if isinstance(self.masker.clustering, np.ndarray):
                row_clustering = self.masker.clustering
            elif callable(self.masker.clustering):
                row_clustering = self.masker.clustering(*row_args)
            else:
                raise NotImplementedError("The masker passed has a .clustering attribute that is not yet supported by the Permutation explainer!")

        # loop over many permutations
        inds = fm.varying_inputs()
        inds_mask = np.zeros(len(fm), dtype=bool)
        inds_mask[inds] = True
        # masks is a delta-encoded sequence: entry 0 is a no-op, then each varying
        # feature is toggled on (forward pass) and toggled off again (backward pass)
        masks = np.zeros(2*len(inds)+1, dtype=int)
        masks[0] = MaskedModel.delta_mask_noop_value
        npermutations = max_evals // (2*len(inds)+1)
        row_values = None
        row_values_history = None
        history_pos = 0
        main_effect_values = None
        if len(inds) > 0:
            for _ in range(npermutations):

                # shuffle the indexes so we get a random permutation ordering
                if row_clustering is not None:
                    # [TODO] This is shuffle does not work when inds is not a complete set of integers from 0 to M TODO: still true?
                    #assert len(inds) == len(fm), "Need to support partition shuffle when not all the inds vary!!"
                    partition_tree_shuffle(inds, inds_mask, row_clustering)
                else:
                    np.random.shuffle(inds)

                # create a large batch of masks to evaluate
                i = 1
                for ind in inds:
                    masks[i] = ind
                    i += 1
                for ind in inds:
                    masks[i] = ind
                    i += 1

                # evaluate the masked model
                outputs = fm(masks, zero_index=0, batch_size=batch_size)

                # allocate accumulators lazily once the output shape is known
                if row_values is None:
                    row_values = np.zeros((len(fm),) + outputs.shape[1:])

                    if error_bounds:
                        row_values_history = np.zeros((2 * npermutations, len(fm),) + outputs.shape[1:])

                # update our SHAP value estimates
                i = 0
                for ind in inds: # forward
                    row_values[ind] += outputs[i + 1] - outputs[i]
                    if error_bounds:
                        row_values_history[history_pos][ind] = outputs[i + 1] - outputs[i]
                    i += 1
                history_pos += 1
                for ind in inds: # backward
                    row_values[ind] += outputs[i] - outputs[i + 1]
                    if error_bounds:
                        row_values_history[history_pos][ind] = outputs[i] - outputs[i + 1]
                    i += 1
                history_pos += 1

            if npermutations == 0:
                raise ValueError(f"max_evals={max_evals} is too low for the Permutation explainer, it must be at least 2 * num_features + 1 = {2 * len(inds) + 1}!")

            expected_value = outputs[0]

            # compute the main effects if we need to
            if main_effects:
                main_effect_values = fm.main_effects(inds, batch_size=batch_size)
        else:
            # no varying inputs: a single no-op evaluation gives the expected value
            # and all attributions are zero
            masks = np.zeros(1, dtype=int)
            outputs = fm(masks, zero_index=0, batch_size=1)
            expected_value = outputs[0]
            row_values = np.zeros((len(fm),) + outputs.shape[1:])
            if error_bounds:
                row_values_history = np.zeros((2 * npermutations, len(fm),) + outputs.shape[1:])

        return {
            "values": row_values / (2 * npermutations),
            "expected_values": expected_value,
            "mask_shapes": fm.mask_shapes,
            "main_effects": main_effect_values,
            "clustering": row_clustering,
            "error_std": None if row_values_history is None else row_values_history.std(0),
            "output_names": self.model.output_names if hasattr(self.model, "output_names") else None
        }


    def shap_values(self, X, npermutations=10, main_effects=False, error_bounds=False, batch_evals=True, silent=False):
        """ Legacy interface to estimate the SHAP values for a set of samples.

        Parameters
        ----------
        X : numpy.array or pandas.DataFrame or any scipy.sparse matrix
            A matrix of samples (# samples x # features) on which to explain the model's output.

        npermutations : int
            Number of times to cycle through all the features, re-evaluating the model at each step.
            Each cycle evaluates the model function 2 * (# features + 1) times on a data matrix of
            (# background data samples) rows. An exception to this is when PermutationExplainer can
            avoid evaluating the model because a feature's value is the same in X and the background
            dataset (which is common for example with sparse features).

        Returns
        -------
        array or list
            For models with a single output this returns a matrix of SHAP values
            (# samples x # features). Each row sums to the difference between the model output for that
            sample and the expected value of the model output (which is stored as expected_value
            attribute of the explainer). For models with vector outputs this returns a list
            of such matrices, one for each output.
        """
        warnings.warn("shap_values() is deprecated; use __call__().", DeprecationWarning)

        explanation = self(X, max_evals=npermutations * X.shape[1], main_effects=main_effects)
        return explanation.values

    def __str__(self):
        return "shap.explainers.PermutationExplainer()"
lib/shap/explainers/_sampling.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ from .._explanation import Explanation
7
+ from ..utils._exceptions import ExplainerError
8
+ from ..utils._legacy import convert_to_instance, match_instance_to_data
9
+ from ._kernel import KernelExplainer
10
+
11
+ log = logging.getLogger('shap')
12
+
13
+
14
class SamplingExplainer(KernelExplainer):
    """Computes SHAP values using an extension of the Shapley sampling values explanation method
    (also known as IME).

    SamplingExplainer computes SHAP values under the assumption of feature independence and is an
    extension of the algorithm proposed in "An Efficient Explanation of Individual Classifications
    using Game Theory", Erik Strumbelj, Igor Kononenko, JMLR 2010. It is a good alternative to
    KernelExplainer when you want to use a large background set (as opposed to a single reference
    value for example).

    Parameters
    ----------
    model : function
        User supplied function that takes a matrix of samples (# samples x # features) and
        computes the output of the model for those samples. The output can be a vector
        (# samples) or a matrix (# samples x # model outputs).

    data : numpy.array or pandas.DataFrame
        The background dataset to use for integrating out features. To determine the impact
        of a feature, that feature is set to "missing" and the change in the model output
        is observed. Since most models aren't designed to handle arbitrary missing data at test
        time, we simulate "missing" by replacing the feature with the values it takes in the
        background dataset. So if the background dataset is a simple sample of all zeros, then
        we would approximate a feature being missing by setting it to zero. Unlike the
        KernelExplainer, this data can be the whole training set, even if that is a large set. This
        is because SamplingExplainer only samples from this background dataset.
    """

    def __init__(self, model, data, **kwargs):
        # silence warning about large datasets
        level = log.level
        log.setLevel(logging.ERROR)
        super().__init__(model, data, **kwargs)
        log.setLevel(level)

        if str(self.link) != "identity":
            emsg = f"SamplingExplainer only supports the identity link, not {self.link}"
            raise ValueError(emsg)

    def __call__(self, X, y=None, nsamples=2000):
        """ Explain the rows of ``X`` and return the result wrapped in an Explanation object.
        """

        if isinstance(X, pd.DataFrame):
            feature_names = list(X.columns)
            X = X.values
        else:
            feature_names = None # we can make self.feature_names from background data eventually if we have it

        v = self.shap_values(X, nsamples=nsamples)
        if isinstance(v, list):
            v = np.stack(v, axis=-1) # put outputs at the end
        e = Explanation(v, self.expected_value, X, feature_names=feature_names)
        return e

    def explain(self, incoming_instance, **kwargs):
        """ Estimate the SHAP values for a single instance via two rounds of sampling.

        Round 1 spreads ``min_samples_per_feature`` samples evenly across the varying
        features; round 2 re-allocates the remaining budget proportionally to each
        feature's observed variance.
        """
        # convert incoming input to a standardized iml object
        instance = convert_to_instance(incoming_instance)
        match_instance_to_data(instance, self.data)

        if len(self.data.groups) != self.P:
            emsg = "SamplingExplainer does not support feature groups!"
            raise ExplainerError(emsg)

        # find the feature groups we will test. If a feature does not change from its
        # current value then we know it doesn't impact the model
        self.varyingInds = self.varying_groups(instance.x)
        #self.varyingFeatureGroups = [self.data.groups[i] for i in self.varyingInds]
        self.M = len(self.varyingInds)

        # find f(x)
        if self.keep_index:
            model_out = self.model.f(instance.convert_to_df())
        else:
            model_out = self.model.f(instance.x)
        if isinstance(model_out, (pd.DataFrame, pd.Series)):
            model_out = model_out.values[0]
        self.fx = model_out[0]

        if not self.vector_out:
            self.fx = np.array([self.fx])

        # if no features vary then there no feature has an effect
        if self.M == 0:
            phi = np.zeros((len(self.data.groups), self.D))
            phi_var = np.zeros((len(self.data.groups), self.D))

        # if only one feature varies then it has all the effect
        elif self.M == 1:
            phi = np.zeros((len(self.data.groups), self.D))
            phi_var = np.zeros((len(self.data.groups), self.D))
            diff = self.fx - self.fnull
            for d in range(self.D):
                phi[self.varyingInds[0],d] = diff[d]

        # if more than one feature varies then we have to do real work
        else:

            # pick a reasonable number of samples if the user didn't specify how many they wanted
            self.nsamples = kwargs.get("nsamples", "auto")
            if self.nsamples == "auto":
                self.nsamples = 1000 * self.M

            min_samples_per_feature = kwargs.get("min_samples_per_feature", 100)
            round1_samples = self.nsamples
            round2_samples = 0
            if round1_samples > self.M * min_samples_per_feature:
                round2_samples = round1_samples - self.M * min_samples_per_feature
                round1_samples -= round2_samples

            # divide up the samples among the features for round 1 (keeping counts even
            # because sampling_estimate draws antithetic pairs)
            nsamples_each1 = np.ones(self.M, dtype=np.int64) * 2 * (round1_samples // (self.M * 2))
            for i in range((round1_samples % (self.M * 2)) // 2):
                nsamples_each1[i] += 2

            # explain every feature in round 1
            phi = np.zeros((self.P, self.D))
            phi_var = np.zeros((self.P, self.D))
            self.X_masked = np.zeros((nsamples_each1.max() * 2, self.data.data.shape[1]))
            for i,ind in enumerate(self.varyingInds):
                phi[ind,:],phi_var[ind,:] = self.sampling_estimate(ind, self.model.f, instance.x, self.data.data, nsamples=nsamples_each1[i])

            # optimally allocate samples according to the variance
            if phi_var.sum() == 0:
                phi_var += 1 # spread samples uniformally if we found no variability
            phi_var /= phi_var.sum(0)[np.newaxis, :]
            nsamples_each2 = (phi_var[self.varyingInds,:].mean(1) * round2_samples).astype(int)
            # round each allocation up to an even count, then nudge totals back toward the budget
            for i in range(len(nsamples_each2)):
                if nsamples_each2[i] % 2 == 1:
                    nsamples_each2[i] += 1
            for i in range(len(nsamples_each2)):
                if nsamples_each2.sum() > round2_samples:
                    nsamples_each2[i] -= 2
                elif nsamples_each2.sum() < round2_samples:
                    nsamples_each2[i] += 2
                else:
                    break

            self.X_masked = np.zeros((nsamples_each2.max() * 2, self.data.data.shape[1]))
            for i,ind in enumerate(self.varyingInds):
                if nsamples_each2[i] > 0:
                    val,var = self.sampling_estimate(ind, self.model.f, instance.x, self.data.data, nsamples=nsamples_each2[i])

                    # merge the two rounds as a sample-count-weighted average
                    total_samples = nsamples_each1[i] + nsamples_each2[i]
                    phi[ind,:] = (phi[ind,:] * nsamples_each1[i] + val * nsamples_each2[i]) / total_samples
                    phi_var[ind,:] = (phi_var[ind,:] * nsamples_each1[i] + var * nsamples_each2[i]) / total_samples

            # convert from the variance of the differences to the variance of the mean (phi)
            for i,ind in enumerate(self.varyingInds):
                phi_var[ind,:] /= np.sqrt(nsamples_each1[i] + nsamples_each2[i])

            # correct the sum of the SHAP values to equal the output of the model using a linear
            # regression model with priors of the coefficients equal to the estimated variances for each
            # SHAP value (note that 1e6 is designed to increase the weight of the sample and so closely
            # match the correct sum)
            sum_error = self.fx - phi.sum(0) - self.fnull
            for i in range(self.D):
                # this is a ridge regression with one sample of all ones with sum_error[i] as the label
                # and 1/v as the ridge penalties. This simplified (and stable) form comes from the
                # Sherman-Morrison formula
                v = (phi_var[:,i] / phi_var[:,i].max()) * 1e6
                adj = sum_error[i] * (v - (v * v.sum()) / (1 + v.sum()))
                phi[:,i] += adj

        if phi.shape[1] == 1:
            phi = phi[:,0]

        return phi

    def sampling_estimate(self, j, f, x, X, nsamples=10):
        """ Estimate feature ``j``'s contribution mean and variance with antithetic sampling.

        For each draw, a random permutation splits the features around ``j``; the "on"
        sample keeps ``j`` at its value in ``x`` while the paired "off" sample (stored
        from the end of the buffer) replaces ``j`` too, so their difference isolates
        ``j``'s effect against a random background row.
        """
        X_masked = self.X_masked[:nsamples * 2,:]
        inds = np.arange(X.shape[1])

        for i in range(0, nsamples):
            np.random.shuffle(inds)
            pos = np.where(inds == j)[0][0]
            rind = np.random.randint(X.shape[0])
            X_masked[i, :] = x
            X_masked[i, inds[pos+1:]] = X[rind, inds[pos+1:]]
            X_masked[-(i+1), :] = x
            X_masked[-(i+1), inds[pos:]] = X[rind, inds[pos:]]

        evals = f(X_masked)
        evals_on = evals[:nsamples]
        evals_off = evals[nsamples:][::-1]  # reverse so on/off pairs line up
        d = evals_on - evals_off

        return np.mean(d, 0), np.var(d, 0)
lib/shap/explainers/_tree.py ADDED
The diff for this file is too large to render. See raw diff
 
lib/shap/explainers/other/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ from ._coefficient import Coefficient
4
+ from ._lime import LimeTabular
5
+ from ._maple import Maple, TreeMaple
6
+ from ._random import Random
7
+ from ._treegain import TreeGain
8
+
9
# Public API of the shap.explainers.other subpackage.
__all__ = [
    "Coefficient",
    "LimeTabular",
    "Maple",
    "TreeMaple",
    "Random",
    "TreeGain",
]
17
+
18
+
19
# Deprecated class alias with incorrect spelling
def Coefficent(*args, **kwargs):  # noqa
    """Deprecated alias of :class:`Coefficient` (old misspelling).

    Emits a :class:`DeprecationWarning` and forwards all arguments to
    :class:`Coefficient`.
    """
    warnings.warn(
        "Coefficent has been renamed to Coefficient. "
        "The former is deprecated and will be removed in shap 0.45.",
        DeprecationWarning,
        # point the warning at the caller's line, not at this shim
        stacklevel=2,
    )
    return Coefficient(*args, **kwargs)
lib/shap/explainers/other/_coefficient.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ from .._explainer import Explainer
4
+
5
+
6
class Coefficient(Explainer):
    """ Simply returns the model coefficients as the feature attributions.

    This is only for benchmark comparisons and does not approximate SHAP values in a
    meaningful way.
    """
    def __init__(self, model):
        # Only linear-style models exposing coef_ are supported.
        assert hasattr(model, "coef_"), "The passed model does not have a coef_ attribute!"
        self.model = model

    def attributions(self, X):
        # Every row gets the same attribution vector: the raw coefficients.
        coefs = self.model.coef_
        n_rows = X.shape[0]
        return np.tile(coefs, (n_rows, 1))
lib/shap/explainers/other/_lime.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from .._explainer import Explainer
5
+
6
+ try:
7
+ import lime
8
+ import lime.lime_tabular
9
+ except ImportError:
10
+ pass
11
+
12
+ class LimeTabular(Explainer):
13
+ """ Simply wrap of lime.lime_tabular.LimeTabularExplainer into the common shap interface.
14
+
15
+ Parameters
16
+ ----------
17
+ model : function or iml.Model
18
+ User supplied function that takes a matrix of samples (# samples x # features) and
19
+ computes the output of the model for those samples. The output can be a vector
20
+ (# samples) or a matrix (# samples x # model outputs).
21
+
22
+ data : numpy.array
23
+ The background dataset.
24
+
25
+ mode : "classification" or "regression"
26
+ Control the mode of LIME tabular.
27
+ """
28
+
29
+ def __init__(self, model, data, mode="classification"):
30
+ self.model = model
31
+ if mode not in ["classification", "regression"]:
32
+ emsg = f"Invalid mode {mode!r}, must be one of 'classification' or 'regression'"
33
+ raise ValueError(emsg)
34
+ self.mode = mode
35
+
36
+ if isinstance(data, pd.DataFrame):
37
+ data = data.values
38
+ self.data = data
39
+ self.explainer = lime.lime_tabular.LimeTabularExplainer(data, mode=mode)
40
+
41
+ out = self.model(data[0:1])
42
+ if len(out.shape) == 1:
43
+ self.out_dim = 1
44
+ self.flat_out = True
45
+ if mode == "classification":
46
+ def pred(X): # assume that 1d outputs are probabilities
47
+ preds = self.model(X).reshape(-1, 1)
48
+ p0 = 1 - preds
49
+ return np.hstack((p0, preds))
50
+ self.model = pred
51
+ else:
52
+ self.out_dim = self.model(data[0:1]).shape[1]
53
+ self.flat_out = False
54
+
55
+ def attributions(self, X, nsamples=5000, num_features=None):
56
+ num_features = X.shape[1] if num_features is None else num_features
57
+
58
+ if isinstance(X, pd.DataFrame):
59
+ X = X.values
60
+
61
+ out = [np.zeros(X.shape) for j in range(self.out_dim)]
62
+ for i in range(X.shape[0]):
63
+ exp = self.explainer.explain_instance(X[i], self.model, labels=range(self.out_dim), num_features=num_features)
64
+ for j in range(self.out_dim):
65
+ for k,v in exp.local_exp[j]:
66
+ out[j][i,k] = v
67
+
68
+ # because it output two results even for only one model output, and they are negated from what we expect
69
+ if self.mode == "regression":
70
+ for i in range(len(out)):
71
+ out[i] = -out[i]
72
+
73
+ return out[0] if self.flat_out else out
lib/shap/explainers/other/_maple.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.model_selection import train_test_split
4
+
5
+ from .._explainer import Explainer
6
+
7
+
8
class Maple(Explainer):
    """ Simply wraps MAPLE into the common SHAP interface.

    A MAPLE explainer is fit on an 80/20 split of the background data against the
    model's outputs, and per-row local linear coefficients are reported as the
    attributions.

    Parameters
    ----------
    model : function
        User supplied function that takes a matrix of samples (# samples x # features) and
        computes the output of the model for those samples. The output can be a vector
        (# samples) or a matrix (# samples x # model outputs).

    data : numpy.array
        The background dataset.
    """

    def __init__(self, model, data):
        self.model = model

        if isinstance(data, pd.DataFrame):
            data = data.values
        self.data = data
        self.data_mean = self.data.mean(0)

        # Probe the model output shape on the full background set.
        preds = self.model(data)
        self.flat_out = len(preds.shape) == 1
        self.out_dim = 1 if self.flat_out else preds.shape[1]

        # Fit MAPLE on a train/validation split of the background data.
        X_train, X_valid, y_train, y_valid = train_test_split(data, preds, test_size=0.2, random_state=0)
        self.explainer = MAPLE(X_train, y_train, X_valid, y_valid)

    def attributions(self, X, multiply_by_input=False):
        """ Compute the MAPLE coef attributions.

        Parameters
        ----------
        multiply_by_input : bool
            If true, this multiplies the learned coefficients by the mean-centered input. This makes these
            values roughly comparable to SHAP values.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values

        out = [np.zeros(X.shape) for _ in range(self.out_dim)]
        for row in range(X.shape[0]):
            coefs = self.explainer.explain(X[row])["coefs"]
            # drop the intercept at index 0; only the first output slot is filled
            out[0][row, :] = coefs[1:]
            if multiply_by_input:
                out[0][row, :] = out[0][row, :] * (X[row] - self.data_mean)

        return out[0] if self.flat_out else out
61
+
62
+
63
class TreeMaple(Explainer):
    """ Wraps tree-based MAPLE into the common SHAP interface.

    Uses the supplied, already-fit tree ensemble directly as MAPLE's forest
    ensemble rather than fitting a new one.

    Parameters
    ----------
    model : sklearn ensemble regressor
        A fit GradientBoostingRegressor or RandomForestRegressor.

    data : numpy.array
        The background dataset.
    """

    def __init__(self, model, data):
        self.model = model

        # Map the sklearn model class to the forest-ensemble type string MAPLE expects.
        # BUGFIX: MAPLE only recognizes "rf" and "gbrt"; the previous value "gbdt" was
        # never matched, silently leaving all root-impurity feature scores at zero.
        # Both old (sklearn.ensemble.gradient_boosting/forest) and modern
        # (sklearn.ensemble._gb/_forest) module paths are accepted.
        model_type = str(type(model))
        if model_type.endswith("sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>") \
                or model_type.endswith("sklearn.ensemble._gb.GradientBoostingRegressor'>"):
            fe_type = "gbrt"
        # elif str(type(model)).endswith("sklearn.tree.tree.DecisionTreeClassifier'>"):
        #     pass
        elif model_type.endswith("sklearn.ensemble.forest.RandomForestRegressor'>") \
                or model_type.endswith("sklearn.ensemble._forest.RandomForestRegressor'>"):
            fe_type = "rf"
        # elif str(type(model)).endswith("sklearn.ensemble.forest.RandomForestClassifier'>"):
        #     pass
        # elif str(type(model)).endswith("xgboost.sklearn.XGBRegressor'>"):
        #     pass
        # elif str(type(model)).endswith("xgboost.sklearn.XGBClassifier'>"):
        #     pass
        else:
            raise NotImplementedError("The passed model is not yet supported by TreeMapleExplainer: " + str(type(model)))

        if isinstance(data, pd.DataFrame):
            data = data.values
        self.data = data
        self.data_mean = self.data.mean(0)

        # Probe the model output shape with a single background row.
        out = self.model.predict(data[0:1])
        if len(out.shape) == 1:
            self.out_dim = 1
            self.flat_out = True
        else:
            self.out_dim = self.model.predict(data[0:1]).shape[1]
            self.flat_out = False

        #_, X_valid, _, y_valid = train_test_split(data, self.model.predict(data), test_size=0.2, random_state=0)
        preds = self.model.predict(data)
        self.explainer = MAPLE(data, preds, data, preds, fe=self.model, fe_type=fe_type)

    def attributions(self, X, multiply_by_input=False):
        """ Compute the MAPLE coef attributions.

        Parameters
        ----------
        multiply_by_input : bool
            If true, this multiplies the learned coefficients by the mean-centered input. This makes these
            values roughly comparable to SHAP values.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values

        out = [np.zeros(X.shape) for j in range(self.out_dim)]
        for i in range(X.shape[0]):
            # coefs[0] is the intercept, so the per-feature weights start at index 1
            exp = self.explainer.explain(X[i])["coefs"]
            out[0][i,:] = exp[1:]
            if multiply_by_input:
                out[0][i,:] = out[0][i,:] * (X[i] - self.data_mean)

        return out[0] if self.flat_out else out
132
+
133
+
134
+ #################################################
135
+ # The code below was authored by Gregory Plumb and is
136
+ # from: https://github.com/GDPlumb/MAPLE/blob/master/Code/MAPLE.py
137
+ # It has by copied here to allow for benchmark comparisons. Please see
138
+ # the original repo for the latest version, supporting material, and citations.
139
+ #################################################
140
+
141
+ # Notes:
142
+ # - Assumes any required data normalization has already been done
143
+ # - Can pass Y (desired response) instead of MR (model fit to Y) to make fitting MAPLE to datasets easy
144
+
145
+ import numpy as np
146
+ from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
147
+ from sklearn.linear_model import Ridge
148
+ from sklearn.metrics import mean_squared_error
149
+
150
+
151
class MAPLE:
    """ Model Agnostic suPervised Local Explanations (MAPLE).

    Third-party code by Gregory Plumb (see header comment above for provenance).
    Fits (or reuses) a forest ensemble over the model response MR, scores features
    by their impurity reduction at each tree root, selects how many top features to
    keep via validation RMSE, and then explains/predicts with a locally-weighted
    ridge regression whose sample weights come from leaf co-membership with the
    query point.

    NOTE(review): when a pre-fit ``fe`` is passed, ``fe_type`` must still be
    "rf" or "gbrt" — otherwise the root-impurity scores stay all zero.
    """

    def __init__(self, X_train, MR_train, X_val, MR_val, fe_type = "rf", fe=None, n_estimators = 200, max_features = 0.5, min_samples_leaf = 10, regularization = 0.001):

        # Features and the target model response
        self.X_train = X_train
        self.MR_train = MR_train
        self.X_val = X_val
        self.MR_val = MR_val

        # Forest Ensemble Parameters
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf

        # Local Linear Model Parameters
        self.regularization = regularization

        # Data parameters
        num_features = X_train.shape[1]
        self.num_features = num_features
        num_train = X_train.shape[0]
        self.num_train = num_train
        num_val = X_val.shape[0]

        # Fit a Forest Ensemble to the model response
        if fe is None:
            if fe_type == "rf":
                fe = RandomForestRegressor(n_estimators = n_estimators, min_samples_leaf = min_samples_leaf, max_features = max_features)
            elif fe_type == "gbrt":
                fe = GradientBoostingRegressor(n_estimators = n_estimators, min_samples_leaf = min_samples_leaf, max_features = max_features, max_depth = None)
            else:
                print("Unknown FE type ", fe)
                import sys
                sys.exit(0)
            fe.fit(X_train, MR_train)
        else:
            # A pre-fit ensemble was supplied: adopt its tree count.
            self.n_estimators = n_estimators = len(fe.estimators_)
        self.fe = fe

        # Leaf ids per (sample, tree): used to compute co-membership weights later.
        train_leaf_ids = fe.apply(X_train)
        self.train_leaf_ids = train_leaf_ids

        val_leaf_ids_list = fe.apply(X_val)

        # Compute the feature importances: Non-normalized @ Root
        scores = np.zeros(num_features)
        if fe_type == "rf":
            for i in range(n_estimators):
                splits = fe[i].tree_.feature #-2 indicates leaf, index 0 is root
                if splits[0] != -2:
                    scores[splits[0]] += fe[i].tree_.impurity[0] #impurity reduction not normalized per tree
        elif fe_type == "gbrt":
            for i in range(n_estimators):
                # gradient-boosting estimators_ is 2-D: (stage, output)
                splits = fe[i, 0].tree_.feature #-2 indicates leaf, index 0 is root
                if splits[0] != -2:
                    scores[splits[0]] += fe[i, 0].tree_.impurity[0] #impurity reduction not normalized per tree
        self.feature_scores = scores
        mostImpFeats = np.argsort(-scores)

        # Find the number of features to use for MAPLE
        retain_best = 0
        rmse_best = np.inf
        for retain in range(1, num_features + 1):

            # Drop less important features for local regression
            X_train_p = np.delete(X_train, mostImpFeats[retain:], axis = 1)
            X_val_p = np.delete(X_val, mostImpFeats[retain:], axis = 1)

            lr_predictions = np.empty([num_val], dtype=float)

            for i in range(num_val):

                weights = self.training_point_weights(val_leaf_ids_list[i])

                # Local linear model
                lr_model = Ridge(alpha=regularization)
                lr_model.fit(X_train_p, MR_train, weights)
                lr_predictions[i] = lr_model.predict(X_val_p[i].reshape(1, -1))

            rmse_curr = np.sqrt(mean_squared_error(lr_predictions, MR_val))

            if rmse_curr < rmse_best:
                rmse_best = rmse_curr
                retain_best = retain

        # Keep the best feature count and the reduced training matrix.
        self.retain = retain_best
        self.X = np.delete(X_train, mostImpFeats[retain_best:], axis = 1)

    def training_point_weights(self, instance_leaf_ids):
        """ Weight each training point by how often it shares a leaf with the instance.

        Each tree contributes 1/|leaf| to every training point in the same leaf as
        the instance, so weights sum to (roughly) the number of trees.
        """
        weights = np.zeros(self.num_train)
        for i in range(self.n_estimators):
            # Get the PNNs for each tree (ones with the same leaf_id)
            PNNs_Leaf_Node = np.where(self.train_leaf_ids[:, i] == instance_leaf_ids[i])[0]
            if len(PNNs_Leaf_Node) > 0: # SML: added this to fix degenerate cases
                weights[PNNs_Leaf_Node] += 1.0 / len(PNNs_Leaf_Node)
        return weights

    def explain(self, x):
        """ Explain a single instance x.

        Returns a dict with the training point "weights", the local linear "coefs"
        (index 0 is the intercept, dropped features keep coefficient 0), and the
        local model's "pred" at x.
        """

        x = x.reshape(1, -1)

        mostImpFeats = np.argsort(-self.feature_scores)
        x_p = np.delete(x, mostImpFeats[self.retain:], axis = 1)

        curr_leaf_ids = self.fe.apply(x)[0]
        weights = self.training_point_weights(curr_leaf_ids)

        # Local linear model
        lr_model = Ridge(alpha = self.regularization)
        lr_model.fit(self.X, self.MR_train, weights)

        # Get the model coefficients
        coefs = np.zeros(self.num_features + 1)
        coefs[0] = lr_model.intercept_
        # Scatter the reduced-model coefficients back to their original feature slots.
        coefs[np.sort(mostImpFeats[0:self.retain]) + 1] = lr_model.coef_

        # Get the prediction at this point
        prediction = lr_model.predict(x_p.reshape(1, -1))

        out = {}
        out["weights"] = weights
        out["coefs"] = coefs
        out["pred"] = prediction

        return out

    def predict(self, X):
        """ Predict each row of X with its own local linear model (MAPLE prediction). """
        n = X.shape[0]
        pred = np.zeros(n)
        for i in range(n):
            exp = self.explain(X[i, :])
            pred[i] = exp["pred"][0]
        return pred

    # Make the predictions based on the forest ensemble (either random forest or gradient boosted regression tree) instead of MAPLE
    def predict_fe(self, X):
        return self.fe.predict(X)

    # Make the predictions based on SILO (no feature selection) instead of MAPLE
    def predict_silo(self, X):
        n = X.shape[0]
        pred = np.zeros(n)
        for i in range(n): #The contents of this inner loop are similar to explain(): doesn't use the features selected by MAPLE or return as much information
            x = X[i, :].reshape(1, -1)

            curr_leaf_ids = self.fe.apply(x)[0]
            weights = self.training_point_weights(curr_leaf_ids)

            # Local linear model
            lr_model = Ridge(alpha = self.regularization)
            lr_model.fit(self.X_train, self.MR_train, weights)

            pred[i] = lr_model.predict(x)[0]

        return pred
lib/shap/explainers/other/_random.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ from shap import links
4
+ from shap.models import Model
5
+ from shap.utils import MaskedModel
6
+
7
+ from .._explainer import Explainer
8
+
9
+
10
class Random(Explainer):
    """ Simply returns random (normally distributed) feature attributions.

    This is only for benchmark comparisons. It supports both fully random attributions and random
    attributions that are constant across all explanations.
    """
    def __init__(self, model, masker, link=links.identity, feature_names=None, linearize_link=True, constant=False, **call_args):
        """ Build a Random explainer.

        Parameters
        ----------
        model : callable or shap.models.Model
            The model being explained; wrapped in a Model if it is not one already.
        masker : object
            The masker defining background/masking behavior, as for other explainers.
        constant : bool
            Stored on the instance; `explain_row` itself does not read it (the
            constant-attribution code paths are commented out below).
        **call_args : dict
            Keyword defaults to bake into `__call__` at construction time.
        """
        super().__init__(model, masker, link=link, linearize_link=linearize_link, feature_names=feature_names)

        if not isinstance(model, Model):
            self.model = Model(model)

        # Overwrite __call__'s keyword defaults so callers can preset arguments
        # at construction time (same pattern as the Permutation explainer).
        for arg in call_args:
            self.__call__.__kwdefaults__[arg] = call_args[arg]

        self.constant = constant
        self.constant_attributions = None

    def explain_row(self, *row_args, max_evals, main_effects, error_bounds, batch_size, outputs, silent):
        """ Explains a single row.
        """

        # build a masked version of the model for the current input sample
        fm = MaskedModel(self.model, self.masker, self.link, self.linearize_link, *row_args)

        # compute any custom clustering for this row
        row_clustering = None
        if getattr(self.masker, "clustering", None) is not None:
            if isinstance(self.masker.clustering, np.ndarray):
                row_clustering = self.masker.clustering
            elif callable(self.masker.clustering):
                row_clustering = self.masker.clustering(*row_args)
            else:
                # NOTE(review): this message mentions the Permutation explainer —
                # it appears copied from there; confirm intended wording.
                raise NotImplementedError("The masker passed has a .clustering attribute that is not yet supported by the Permutation explainer!")

        # compute the correct expected value
        # (evaluate the model with everything masked off: the all-zeros mask)
        masks = np.zeros(1, dtype=int)
        outputs = fm(masks, zero_index=0, batch_size=1)
        expected_value = outputs[0]

        # generate random feature attributions
        # we produce small values so our explanation errors are similar to a constant function
        row_values = np.random.randn(*((len(fm),) + outputs.shape[1:])) * 0.001

        return {
            "values": row_values,
            "expected_values": expected_value,
            "mask_shapes": fm.mask_shapes,
            "main_effects": None,
            "clustering": row_clustering,
            "error_std": None,
            "output_names": self.model.output_names if hasattr(self.model, "output_names") else None
        }

    # def __call__(self, X):
    #     start_time = time.time()
    #     if self.constant:
    #         if self.constant_attributions is None:
    #             self.constant_attributions = np.random.randn(X.shape[1])
    #         return Explanation(np.tile(self.constant_attributions, (X.shape[0],1)), X, compute_time=time.time() - start_time)
    #     else:
    #         return Explanation(np.random.randn(*X.shape), X, compute_time=time.time() - start_time)

    # def attributions(self, X):
    #     if self.constant:
    #         if self.constant_attributions is None:
    #             self.constant_attributions = np.random.randn(X.shape[1])
    #         return np.tile(self.constant_attributions, (X.shape[0],1))
    #     else:
    #         return np.random.randn(*X.shape)
+ # return np.random.randn(*X.shape)