# Source code for xgboost_distribution.model

"""XGBDistribution model
"""
import importlib
import json
import os
from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, no_type_check

import numpy as np
from sklearn.base import RegressorMixin
from sklearn.utils.validation import check_is_fitted
from xgboost._typing import ArrayLike
from xgboost.callback import TrainingCallback
from xgboost.config import config_context
from xgboost.core import Booster, DMatrix, _deprecate_positional_args
from xgboost.sklearn import XGBModel, _wrap_evaluation_matrices, xgboost_model_doc
from xgboost.training import train

from xgboost_distribution.distributions import get_distribution, get_distribution_doc


@xgboost_model_doc(
    "Implementation of XGBoost to estimate distributions (in scikit-learn API).",
    ["estimators", "model"],
    extra_parameters=get_distribution_doc()
    + """
    natural_gradient : bool, default=True
        Whether or not natural gradients should be used.""",
)
class XGBDistribution(XGBModel, RegressorMixin):
    # NOTE: no class docstring is written here on purpose: the header and the
    # extra parameter documentation are handed to `xgboost_model_doc` above.

    @_deprecate_positional_args
    def __init__(
        self,
        *,
        distribution: Optional[str] = None,
        natural_gradient: bool = True,
        objective: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Store the distribution settings and forward the rest to ``XGBModel``.

        Parameters
        ----------
        distribution :
            Name of the distribution to estimate. Falls back to ``"normal"``
            when not given (or otherwise falsy).
        natural_gradient :
            Whether or not natural gradients should be used.
        objective :
            Must be left as ``None``; the objective is derived from
            ``distribution``.
        **kwargs :
            Passed through unchanged to ``XGBModel.__init__``.

        Raises
        ------
        ValueError
            If ``objective`` is set to anything other than ``None``.
        """
        self.distribution = distribution or "normal"
        self.natural_gradient = natural_gradient

        if objective is not None:
            raise ValueError(
                "Please do not set objective directly! Use the `distribution` kwarg"
            )

        super().__init__(objective=None, **kwargs)

    @_deprecate_positional_args
    def fit(
        self,
        X: ArrayLike,
        y: ArrayLike,
        *,
        sample_weight: Optional[ArrayLike] = None,
        eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
        early_stopping_rounds: Optional[int] = None,
        verbose: Optional[Union[bool, int]] = True,
        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
        sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
        feature_weights: Optional[ArrayLike] = None,
        callbacks: Optional[Sequence[TrainingCallback]] = None,
    ) -> "XGBDistribution":
        """Fit gradient boosting distribution model.

        Note that calling ``fit()`` multiple times will cause the model object to be
        re-fit from scratch. To resume training from a previous checkpoint, explicitly
        pass ``xgb_model`` argument.

        Parameters
        ----------
        X :
            Feature matrix. See :ref:`py-data` for a list of supported types.

            When the ``tree_method`` is set to ``hist``, internally, the
            :py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
            for conserving memory. However, this has performance implications when the
            device of input data is not matched with algorithm. For instance, if the
            input is a numpy array on CPU but ``cuda`` is used for training, then the
            data is first processed on CPU then transferred to GPU.
        y :
            Labels. Must support ``len()`` and pass the distribution's target check.
        sample_weight :
            Instance weights.
        eval_set :
            A list of (X, y) tuple pairs to use as validation sets, for which
            metrics will be computed.
            Validation metrics will help us track the performance of the model.

        early_stopping_rounds : int

            .. deprecated:: 1.6.0

            Use `early_stopping_rounds` in :py:meth:`__init__` or :py:meth:`set_params`
            instead.
        verbose :
            If `verbose` is True and an evaluation set is used, the evaluation metric
            measured on the validation set is printed to stdout at each boosting stage.
            If `verbose` is an integer, the evaluation metric is printed at each
            `verbose` boosting stage. The last boosting stage / the boosting stage found
            by using `early_stopping_rounds` is also printed.
        xgb_model :
            File name of stored XGBoost model or 'Booster' instance XGBoost model to be
            loaded before training (allows training continuation).
        sample_weight_eval_set :
            A list of the form [L_1, L_2, ..., L_n], where each L_i is an array like
            object storing instance weights for the i-th validation set.
        feature_weights :
            Weight for each feature, defines the probability of each feature being
            selected when colsample is being used.  All values must be greater than 0,
            otherwise a `ValueError` is thrown.

        callbacks :
            .. deprecated:: 1.6.0
                Use `callbacks` in :py:meth:`__init__` or :py:meth:`set_params` instead.

        Returns
        -------
        self : XGBDistribution
            The fitted estimator.
        """
        with config_context(verbosity=self.verbosity):
            evals_result: TrainingCallback.EvalsLog = {}

            self._distribution = get_distribution(self.distribution)
            self._distribution.check_target(y)

            params = self.get_xgb_params()

            # Remove the wrapper-only (i.e. not xgb native) params before fitting.
            # A default is supplied so a missing key can never abort training.
            for param in ("distribution", "natural_gradient"):
                params.pop(param, None)

            # The objective and metric are supplied as Python callables to
            # `train` below, so the built-in ones are switched off here.
            params["objective"] = None
            params["disable_default_eval_metric"] = True
            # One booster output per distribution parameter.
            params["num_class"] = len(self._distribution.params)

            # we set `base_score` to zero and instead use base_margin in dmatrices
            # -> this allows different starting values for each distribution parameter
            params["base_score"] = 0.0
            self._starting_params = self._distribution.starting_params(y)

            base_margin = self._get_base_margin(len(y))
            if eval_set is not None:
                base_margin_eval_set: Optional[List[np.ndarray]] = [
                    self._get_base_margin(len(evals[1])) for evals in eval_set
                ]
            else:
                base_margin_eval_set = None

            model, _, params, early_stopping_rounds, callbacks = self._configure_fit(
                booster=xgb_model,
                eval_metric=None,
                params=params,
                early_stopping_rounds=early_stopping_rounds,
                callbacks=callbacks,
            )

            train_dmatrix, evals = _wrap_evaluation_matrices(
                missing=self.missing,
                X=X,
                y=y,
                group=None,
                qid=None,
                sample_weight=sample_weight,
                base_margin=base_margin,
                feature_weights=feature_weights,
                eval_set=eval_set,
                sample_weight_eval_set=sample_weight_eval_set,
                base_margin_eval_set=base_margin_eval_set,
                eval_group=None,
                eval_qid=None,
                create_dmatrix=self._create_dmatrix,
                enable_categorical=self.enable_categorical,
                feature_types=self.feature_types,
            )

            self._Booster = train(
                params,
                train_dmatrix,
                num_boost_round=self.get_num_boosting_rounds(),
                evals=evals,
                early_stopping_rounds=early_stopping_rounds,
                evals_result=evals_result,
                obj=self._objective_func(),
                custom_metric=self._evaluation_func(),
                verbose_eval=verbose,
                xgb_model=model,
                callbacks=callbacks,
            )

            self._set_evaluation_result(evals_result)
            # NOTE(review): this leaves `objective` non-None on a fitted model,
            # while `__init__` rejects any non-None `objective`. Confirm whether
            # re-creating the estimator from a fitted model's parameters (e.g.
            # via `sklearn.base.clone`) is meant to be supported.
            self.objective = f"distribution:{self.distribution}"

            # we set additional params needed for XGBDistribution on the Booster object,
            # in order to make use of the Booster's serialisation methods
            self._Booster.set_attr(distribution=self.distribution)
            self._Booster.set_attr(starting_params=json.dumps(self._starting_params))

            return self

    @no_type_check
    def predict(
        self,
        X: ArrayLike,
        validate_features: bool = True,
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> Tuple[np.ndarray, ...]:
        """Predict all params of distribution of each `X` example.

        Parameters
        ----------
        X : ArrayLike
            Feature matrix. The number of rows is read from ``X.shape[0]`` when
            ``X`` has a ``shape`` attribute and from ``len(X)`` otherwise.
        validate_features : bool
            When this is True, validate that the Booster's and data's feature_names are
            identical.  Otherwise, it is assumed that the feature_names are the same.
        iteration_range :
            Specifies which layer of trees are used in prediction.  For example, if a
            random forest is trained with 100 rounds.  Specifying `iteration_range=(10,
            20)`, then only the forests built during [10, 20) (half open set) rounds are
            used in this prediction.

        Returns
        -------
        predictions : namedtuple
            A namedtuple of the distribution parameters. Each parameter is a
            numpy array of shape (n_samples,).
        """
        with config_context(verbosity=self.verbosity):
            check_is_fitted(self, attributes=("_distribution", "_starting_params"))

            # Prefer `.shape` (where `len()` may be undefined), but fall back to
            # `len()` for inputs such as a plain list of rows.
            shape = getattr(X, "shape", None)
            n_samples = shape[0] if shape is not None else len(X)
            base_margin = self._get_base_margin(n_samples)

            # Raw margins are requested; the distribution object turns them
            # into its parameters.
            params = super().predict(
                X=X,
                output_margin=True,
                validate_features=validate_features,
                base_margin=base_margin,
                iteration_range=iteration_range,
            )
            return self._distribution.predict(params)

    def save_model(self, fname: Union[str, os.PathLike]) -> None:
        """Save the model to ``fname`` using the parent class' serialisation.

        Parameters
        ----------
        fname :
            Output file name or path, forwarded to ``super().save_model``.
        """
        # self._distribution class cannot be saved by `super().save_model`, as it
        # attempts to call `json.dumps({"_distribution": self._distribution})`
        # Hence we detach it for the duration of the save and re-attach it after
        # (this is safe as distributions are by definition stateless).
        # Popping with a default means an unfitted model is handled by the
        # parent's save logic rather than failing here on a missing attribute.
        distribution = self.__dict__.pop("_distribution", None)
        try:
            super().save_model(fname)
        finally:
            # Always restore, so a failed save does not leave a fitted model
            # without its distribution (which `predict` requires).
            if distribution is not None:
                self._distribution = distribution

    def load_model(self, fname: Union[str, bytearray, os.PathLike]) -> None:
        """Load a model and rebuild the distribution state from Booster attributes.

        Parameters
        ----------
        fname :
            Model file name, path or in-memory buffer, forwarded to
            ``super().load_model``.
        """
        super().load_model(fname)

        # The distribution object is not serialised (see `save_model`), so it is
        # re-created from the name that `fit` stored on the Booster.
        self.distribution = self._Booster.attr("distribution")
        self._distribution = get_distribution(self.distribution)

        # `fit` stored the starting params as a JSON list; rebuild them with the
        # `Params` type of the module that defines the distribution.
        distribution_module = importlib.import_module(self._distribution.__module__)
        self._starting_params = distribution_module.Params(
            *json.loads(self._Booster.attr("starting_params"))
        )

    def _objective_func(
        self,
    ) -> Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]:
        """Build the custom objective callable handed to ``train``.

        Returns
        -------
        obj : callable
            Maps ``(params, data)`` to the flattened gradient and hessian of the
            distribution, scaled by the sample weights when these are set.
        """

        def obj(params: np.ndarray, data: DMatrix) -> Tuple[np.ndarray, np.ndarray]:
            y = data.get_label()
            grad, hess = self._distribution.gradient_and_hessian(
                y=y, params=params, natural_gradient=self.natural_gradient
            )

            # An empty weight array means no sample weights were supplied.
            weights = data.get_weight()
            if weights.size != 0:
                # Column vector, so each sample's weight is broadcast across
                # the columns of that sample's row.
                weights = weights.reshape(-1, 1)
                grad *= weights
                hess *= weights

            return grad.flatten(), hess.flatten()

        return obj

    def _evaluation_func(self) -> Callable[[np.ndarray, DMatrix], Tuple[str, float]]:
        """Build the custom metric callable handed to ``train``.

        Returns
        -------
        feval : callable
            Maps ``(params, data)`` to the loss name and the (optionally
            sample-weighted) average of the distribution's loss.
        """

        def feval(params: np.ndarray, data: DMatrix) -> Tuple[str, float]:
            y = data.get_label()
            weights = data.get_weight()
            if weights.size == 0:
                # `np.average` takes the plain mean when `weights` is None.
                weights = None

            loss_name, loss = self._distribution.loss(y=y, params=params)
            return loss_name, np.average(loss, weights=weights)

        return feval

    def _get_base_margin(self, n_samples: int) -> np.ndarray:
        """Return the starting params repeated for each of ``n_samples`` rows.

        The result has one row per sample and one column per starting param.
        """
        return np.ones(shape=(n_samples, 1)) * np.array(self._starting_params)