Note

Go to the end to download the full example code or to run this example in your browser via Binder

aac pdp

import os
from typing import Callable

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from ai4water.postprocessing.explain._partial_dependence import (compute_bounds,
    _add_dist_as_grid, process_axis)
from ai4water.postprocessing import PartialDependencePlot

from aac_utils import get_fitted_model, aac_data

model = get_fitted_model()

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (374, 6)
target shape:  (374, 1)

x, _, _, _ = aac_data()

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (374, 6)
target shape:  (374, 1)

class PartialDependencePlot1(PartialDependencePlot):

    def __init__(
            self,
            model: Callable,
            data,
            feature_names=None,
            num_points: int = 100,
            path=None,
            save: bool = True,
            show: bool = True,
            **kwargs
    ):
        """Initiates the class
        Parameters
        ----------
            model : Callable
                the trained/calibrated model which must be callable. It must take the
                `data` as input and sprout an array of predicted values. For example
                if you are using Keras/sklearn model, then you must pass model.predict
            data : np.ndarray, pd.DataFrame
                The inputs to the `model`. It can numpy array or pandas DataFrame.
            feature_names : list, optional
                Names of features. Used for labeling.
            num_points : int, optional
                determines the grid for evaluation of `model`
            path : str, optional
                path to save the plots. By default the results are saved in current directory
            show:
                whether to show the plot or not
            save:
                whether to save the plot or not
            **kwargs :
                any additional keyword arguments for `model`
        """

        self.model = model
        self.num_points = num_points
        self.xmin = "percentile(0)"
        self.xmax = "percentile(100)"
        self.kwargs = kwargs

        if isinstance(data, pd.DataFrame):
            if feature_names is None:
                feature_names = data.columns.tolist()
            data = data.values

        if not os.path.exists(path):
            os.makedirs(path)

        self.path = path
        self.data = data
        self.features = feature_names
        self.save = save
        self.show = show

    @property
    def data_is_2d(self):
        if isinstance(self.data, np.ndarray) and self.data.ndim == 2:
            return True
        elif isinstance(self.data, pd.DataFrame):
            return True
        else:
            return False

    @property
    def data_is_3d(self):
        if isinstance(self.data, np.ndarray) and self.data.ndim == 3:
            return True
        return False

    @property
    def single_source(self):
        if isinstance(self.data, list) and len(self.data) > 1:
            return False
        else:
            return True

    @property
    def features(self):
        return self._features

    @features.setter
    def features(self, features):
        if self.data_is_2d:
            if type(self.data) == pd.DataFrame:

                features = self.data.columns.to_list()
            elif features is None:
                features = [f"Feature {i}" for i in range(self.data.shape[-1])]
            else:
                assert isinstance(features, list) and len(features) == self.data.shape[-1], f"""
                    features must be given as list of length {self.data.shape[-1]}
                    but are of len {len(features)}
                    """

                features = features
        elif not self.single_source and features is None:
            features = []
            for data in self.data:
                if isinstance(data, pd.DataFrame):
                    _features = data.columns.to_list()
                else:
                    _features = [f"Feature {i}" for i in range(data.shape[-1])]

                features.append(_features)

        elif self.data_is_3d and features is None:
            features = [f"Feature {i}" for i in range(self.data.shape[-1])]

        self._features = features

    def plot_1d(
            self,
            feature,
            show_dist: bool = True,
            show_dist_as: str = "hist",
            ice: bool = True,
            feature_expected_value: bool = False,
            model_expected_value: bool = False,
            show_ci: bool = False,
            show_minima: bool = False,
            ice_only: bool = False,
            ice_color: str = "lightblue",
            ices_to_remove=None,
            ice_linewidth=None,
    ):
        """partial dependence plot in one dimension
        Parameters
        ----------
            feature :
                the feature name for which to plot the partial dependence
            show_dist :
                whether to show actual distribution of data or not
            show_dist_as :
                one of "hist" or "grid"
            ice :
                whether to show individual component elements on plot or not
            feature_expected_value :
                whether to show the average value of feature on the plot or not
            model_expected_value :
                whether to show average prediction on plot or not
            show_ci :
                whether to show confidence interval of pdp or not
            show_minima :
                whether to indicate the minima or not
            ice_only : bool, False
                whether to show only ice plots
            ice_color :
                color for ice lines. It can also be a valid maplotlib
                `colormap <https://matplotlib.org/3.5.1/tutorials/colors/colormaps.html>`_
        """
        if isinstance(feature, list) or isinstance(feature, tuple):
            raise NotImplementedError
        else:
            if self.single_source:
                if self.data_is_2d:
                    ax = self._plot_pdp_1dim(
                        *self._pdp_for_2d(self.data, feature),
                        self.data, feature,
                        show_dist=show_dist,
                        show_dist_as=show_dist_as,
                        ice=ice,
                        feature_expected_value=feature_expected_value,
                        show_ci=show_ci, show_minima=show_minima,
                        model_expected_value=model_expected_value,
                        show=self.show,
                        save=self.save,
                        ice_only=ice_only,
                        ice_color=ice_color,
                        ices_to_remove=ices_to_remove,
                        ice_linewidth=ice_linewidth,
                    )
                elif self.data_is_3d:
                    for lb in range(self.data.shape[1]):
                        ax = self._plot_pdp_1dim(
                            *self._pdp_for_2d(self.data, feature, lb),
                            data=self.data,
                            feature=feature,
                            lookback=lb,
                            show_ci=show_ci,
                            show_minima=show_minima,
                            show_dist=show_dist,
                            show_dist_as=show_dist_as,
                            ice=ice,
                            feature_expected_value=feature_expected_value,
                            model_expected_value=model_expected_value,
                            show=self.show,
                            save=self.save,
                            ice_only=ice_only,
                            ice_color=ice_color)
                else:
                    raise ValueError(f"invalid data shape {self.data.shape}")
            else:
                for data in self.data:
                    if self.data_is_2d:
                        ax = self._pdp_for_2d(data, feature)
                    else:
                        for lb in []:
                            ax = self._pdp_for_2d(data, feature, lb)
        return ax

    def _plot_pdp_1dim(
            self,
            pd_vals, ice_vals, data, feature,
            lookback=None,
            show_dist=True, show_dist_as="hist",
            ice=True, show_ci=False,
            show_minima=False,
            feature_expected_value=False,
            model_expected_value=False,
            show=True, save=False, ax=None,
            ice_color="lightblue",
            ice_only=False,
            ices_to_remove:list = None,
            ice_linewidth=None,
    ):

        xmin, xmax = compute_bounds(self.xmin,
                                    self.xmax,
                                    self.xv(data, feature, lookback))

        if ax is None:
            fig = plt.figure()
            ax = fig.add_axes((0.1, 0.3, 0.8, 0.6))

        xs = self.grid(data, feature, lookback)

        ylabel = "E[f(x) | " + feature + "]"
        if ice:
            n = ice_vals.shape[1]
            if ice_color in plt.colormaps():
                colors = plt.get_cmap(ice_color)(np.linspace(0, 0.8, n))
            elif hasattr(ice_color, '__len__') and not isinstance(ice_color, str):
                assert len(ice_color) == n, f"{len(ice_color)}"
                colors = ice_color
            else:
                colors = [ice_color for _ in range(n)]

            if ices_to_remove is None:
                ices_to_remove = []

            if ice_linewidth is None:
                ice_linewidth = min(1, 50 / n)  # pylint: disable=unsubscriptable-object
            for ice_idx in range(n):

                if ice_idx not in ices_to_remove:

                    ax.plot(xs, ice_vals[:, ice_idx], color=colors[ice_idx],
                        linewidth=ice_linewidth, alpha=1)
            ylabel = "f(x) | " + feature

        if show_ci:
            std = np.std(ice_vals, axis=1)
            upper = pd_vals + std
            lower = pd_vals - std
            color = '#66C2D7'
            if ice_color != "lightblue":
                if ice_color not in plt.colormaps():
                    color = ice_color

            ax.fill_between(xs, upper, lower, alpha=0.14, color=color)

        # the line plot
        if not ice_only:
            ax.plot(xs, pd_vals, color='blue', linewidth=2, alpha=1)

        title = None
        if lookback is not None:
            title = f"lookback: {lookback}"
        process_axis(ax,
                     ylabel=ylabel,
                     ylabel_kws=dict(fontsize=20),
                     right_spine=False,
                     top_spine=False,
                     tick_params=dict(labelsize=11),
                     xlabel=feature,
                     xlabel_kws=dict(fontsize=20),
                     title=title)
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')

        ax2 = ax.twinx()

        if show_dist:
            xv = self.xv(data, feature, lookback)
            if show_dist_as == "hist":

                ax2.hist(xv, 50, density=False, facecolor='black', alpha=0.1,
                         range=(xmin, xmax))
            else:
                _add_dist_as_grid(fig, xv, other_axes=ax, xlabel=feature,
                                  xlabel_kws=dict(fontsize=20))

        process_axis(ax2,
                     right_spine=False,
                     top_spine=False,
                     left_spine=False,
                     bottom_spine=False,
                     ylim=(0, data.shape[0]))
        ax2.xaxis.set_ticks_position('bottom')
        ax2.yaxis.set_ticks_position('left')
        ax2.yaxis.set_ticks([])

        if feature_expected_value:
            self._add_feature_exp_val(ax2, ax, xmin, xmax, data, feature, lookback)

        if model_expected_value:
            self._add_model_exp_val(ax2, ax, data)

        if show_minima:
            minina = self.model(data, **self.kwargs).min()
            ax.axvline(minina, linestyle="--", color="r", lw=1)

        if save:
            lookback = lookback or ''
            fname = os.path.join(self.path, f"pdp_{feature}_{lookback}")
            plt.savefig(fname, bbox_inches="tight", dpi=400)

        if show:
            plt.show()

        return ax

    def _pdp_for_2d(self, data, feature, lookback=None):
        ind = self._feature_to_ind(feature)

        xs = self.grid(data, feature, lookback)

        data_temp = data.copy()

        # instead of calling the model for each num_point, prepare the data
        # stack it in 'data_all' and call the model only once
        total_samples = len(data) * self.num_points
        data_all = np.full((total_samples, *data.shape[1:]), np.nan)

        pd_vals = np.full(self.num_points, np.nan)
        ice_vals = np.full((self.num_points, data.shape[0]), np.nan)

        st, en = 0, len(data)
        for i in range(self.num_points):

            if data.ndim == 3:
                data_temp[:, lookback, ind] = xs[i]
            else:
                data_temp[:, ind] = xs[i]
                data_all[st:en] = data_temp

            st = en
            en += len(data)

        predictions = self.model(data_all, **self.kwargs)

        st, en = 0, len(data)
        for i in range(self.num_points):
            pred = predictions[st:en]
            pd_vals[i] = pred.mean()
            ice_vals[i, :] = pred.reshape(-1, )
            st = en
            en += len(data)

        return pd_vals, ice_vals

pdp = PartialDependencePlot1(
    model=model.predict,
    data=pd.DataFrame(x, columns=model.input_features),
    feature_names=model.input_features,
    num_points=100,
    save=False,
    path=model.path
)

sal_psu

_ = pdp.plot_1d("sal_psu")

pdp.plot_1d("sal_psu", ice=False)

<Axes: xlabel='sal_psu', ylabel='E[f(x) | sal_psu]'>

pal = sns.color_palette("bright", n_colors=len(pdp.data))

pdp.plot_1d("sal_psu", ice_only=True, ice_color=pal, ice_linewidth=0.5)

<Axes: xlabel='sal_psu', ylabel='f(x) | sal_psu'>

pdp.plot_1d("sal_psu", ice_only=True, ice_color=pal,
            ices_to_remove=[275], ice_linewidth=0.5)

<Axes: xlabel='sal_psu', ylabel='f(x) | sal_psu'>

pcp_mm

_ = pdp.plot_1d("pcp_mm")

_ = pdp.plot_1d("pcp_mm", ice=False)

_ = pdp.plot_1d("pcp_mm", show_ci=False, ice_only=True, ice_color=pal,
                ice_linewidth=0.5)

_ = pdp.plot_1d("pcp_mm", show_ci=False, ice_only=True, ice_color=pal,
                ice_linewidth=0.5, ices_to_remove=[275])

tide_cm

_ = pdp.plot_1d("tide_cm")

_ = pdp.plot_1d("tide_cm", ice=False)

_ = pdp.plot_1d("tide_cm", show_ci=False, ice_only=True,
                ice_color=pal, ice_linewidth=0.5)

_ = pdp.plot_1d("tide_cm", show_ci=False, ice_only=True,
                ice_color=pal, ice_linewidth=0.5, ices_to_remove=[275])

wat_temp_c

_ = pdp.plot_1d("wat_temp_c")

_ = pdp.plot_1d("wat_temp_c", ice=False)

_ = pdp.plot_1d("wat_temp_c", show_ci=False, ice_only=True,
                ice_color=pal, ice_linewidth=0.5)

_ = pdp.plot_1d("wat_temp_c", show_ci=False, ice_only=True,
                ice_color=pal, ice_linewidth=0.5, ices_to_remove=[275])

air_p_hpa

_ = pdp.plot_1d("air_p_hpa")

_ = pdp.plot_1d("air_p_hpa", ice=False)

_ = pdp.plot_1d("air_p_hpa", show_ci=False, ice_only=True,
                ice_color=pal, ice_linewidth=0.5)

_ = pdp.plot_1d("air_p_hpa", show_ci=False, ice_only=True,
                ice_color=pal, ice_linewidth=0.5, ices_to_remove=[275])

wind_speed_mps

_ = pdp.plot_1d("wind_speed_mps")

_ = pdp.plot_1d("wind_speed_mps", ice=False)

_ = pdp.plot_1d("wind_speed_mps", show_ci=False, ice_only=True,
                ice_color=pal, ice_linewidth=0.5)

_ = pdp.plot_1d("wind_speed_mps", show_ci=False, ice_only=True,
                ice_color=pal, ice_linewidth=0.5, ices_to_remove=[275])

## interaction

# _ = pdp.plot_interaction(["tide_cm", "pcp_mm"], cmap="Blues")
#
# #%%
#
# pdp.plot_interaction(["tide_cm", "wat_temp_c"], cmap="Blues")
#
# #%%
#
# pdp.plot_interaction(["tide_cm", "sal_psu"], cmap="Blues")
#
# #%%
#
# pdp.plot_interaction(["pcp_mm", "wat_temp_c"], cmap="Blues")
#
# #%%
#
# pdp.plot_interaction(["pcp_mm", "sal_psu"], cmap="Blues")
#
# #%%
#
# pdp.plot_interaction(["wat_temp_c", "sal_psu"], cmap="Blues")
#
# #%%
#
# _ = pdp.nd_interactions()

Total running time of the script: ( 0 minutes 20.194 seconds)

Gallery generated by Sphinx-Gallery