Source code for verde.model_selection

"""
Functions for automating model selection through cross-validation.

Supports using a dask.distributed.Client object for parallelism. The
DummyClient is used as a serial version of the parallel client.
"""
import warnings

import numpy as np
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.base import clone

from .base import check_fit_input
from .utils import dispatch


# Otherwise, DeprecationWarning won't be shown, kind of defeating the purpose.
warnings.simplefilter("default")
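

# For reference, a minimal sketch of what the `dispatch` helper imported from
# .utils is assumed to do (an illustration only; the real implementation lives
# in verde/utils.py and may differ): return the function itself for serial
# execution, a lazy dask.delayed wrapper, or a submission to a
# dask.distributed Client.
def _dispatch_sketch(function, client=None, delayed=False):
    import functools

    import dask

    if delayed:
        # Lazy mode: calling the result builds a task graph to compute later.
        return dask.delayed(function)
    if client is not None:
        # Parallel mode: calling the result submits the work to the cluster.
        return functools.partial(client.submit, function)
    # Serial mode: call the function directly.
    return function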


def train_test_split(coordinates, data, weights=None, **kwargs):
    r"""
    Split a dataset into a training and a testing set for cross-validation.

    Similar to :func:`sklearn.model_selection.train_test_split` but is tuned
    to work on multi-component spatial data with optional weights.

    Extra keyword arguments will be passed to
    :class:`sklearn.model_selection.ShuffleSplit`, except for ``n_splits``
    which is always 1.

    Parameters
    ----------
    coordinates : tuple of arrays
        Arrays with the coordinates of each data point. Should be in the
        following order: (easting, northing, vertical, ...).
    data : array or tuple of arrays
        The data values of each data point. If the data has more than one
        component, *data* must be a tuple of arrays (one for each component).
    weights : None or array or tuple of arrays
        If not None, then the weights assigned to each data point. If more
        than one data component is provided, you must provide a weights array
        for each data component (if not None).

    Returns
    -------
    train, test : tuples
        Each is a tuple = (coordinates, data, weights) generated by
        separating the input values randomly.

    Examples
    --------

    >>> import numpy as np
    >>> # Split 2-component data with weights
    >>> data = (np.array([1, 3, 5, 7]), np.array([0, 2, 4, 6]))
    >>> coordinates = (np.arange(4), np.arange(-4, 0))
    >>> weights = (np.array([1, 1, 2, 1]), np.array([1, 2, 1, 1]))
    >>> train, test = train_test_split(coordinates, data, weights,
    ...                                random_state=0)
    >>> print("Coordinates:", train[0], test[0], sep='\n ')
    Coordinates:
     (array([3, 1, 0]), array([-1, -3, -4]))
     (array([2]), array([-2]))
    >>> print("Data:", train[1], test[1], sep='\n ')
    Data:
     (array([7, 3, 1]), array([6, 2, 0]))
     (array([5]), array([4]))
    >>> print("Weights:", train[2], test[2], sep='\n ')
    Weights:
     (array([1, 1, 1]), array([1, 2, 1]))
     (array([2]), array([1]))
    >>> # Split single component data without weights
    >>> train, test = train_test_split(coordinates, data[0], None,
    ...                                random_state=0)
    >>> print("Coordinates:", train[0], test[0], sep='\n ')
    Coordinates:
     (array([3, 1, 0]), array([-1, -3, -4]))
     (array([2]), array([-2]))
    >>> print("Data:", train[1], test[1], sep='\n ')
    Data:
     (array([7, 3, 1]),)
     (array([5]),)
    >>> print("Weights:", train[2], test[2], sep='\n ')
    Weights:
     (None,)
     (None,)

    """
    args = check_fit_input(coordinates, data, weights, unpack=False)
    ndata = args[1][0].size
    indices = np.arange(ndata)
    split = next(ShuffleSplit(n_splits=1, **kwargs).split(indices))
    train, test = (tuple(select(i, index) for i in args) for index in split)
    return train, test
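

# Usage sketch (hypothetical values): any ShuffleSplit keyword argument, such
# as test_size, can be forwarded through train_test_split to control the
# split proportions.
def _train_test_split_example():
    import numpy as np

    coordinates = (np.arange(10), np.arange(10))
    data = np.arange(10) ** 2
    # Reserve 30% of the points for testing; random_state makes it repeatable.
    return train_test_split(coordinates, data, test_size=0.3, random_state=0)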


def cross_val_score(
    estimator, coordinates, data, weights=None, cv=None, client=None, delayed=False
):
    """
    Score an estimator/gridder using cross-validation.

    Similar to :func:`sklearn.model_selection.cross_val_score` but modified
    to accept spatial multi-component data with weights.

    By default, will use :class:`sklearn.model_selection.KFold` with
    ``n_splits=5`` and ``random_state=0`` to split the dataset. Any other
    cross-validation class can be passed in through the *cv* argument.

    Can optionally run in parallel using :mod:`dask`. To do this, use
    ``delayed=True`` to dispatch computations with :func:`dask.delayed`
    instead of running them. The returned scores will be "lazy" objects
    instead of the actual scores. To trigger the computation (which Dask will
    run in parallel) call the ``.compute()`` method of each score or
    :func:`dask.compute` with the entire list of scores.

    .. warning::

        The ``client`` parameter is deprecated and will be removed in Verde
        v2.0.0. Use ``delayed`` instead.

    Parameters
    ----------
    estimator : verde gridder
        Any verde gridder class that has the ``fit`` and ``score`` methods.
    coordinates : tuple of arrays
        Arrays with the coordinates of each data point. Should be in the
        following order: (easting, northing, vertical, ...).
    data : array or tuple of arrays
        The data values of each data point. If the data has more than one
        component, *data* must be a tuple of arrays (one for each component).
    weights : None or array or tuple of arrays
        If not None, then the weights assigned to each data point. If more
        than one data component is provided, you must provide a weights array
        for each data component (if not None).
    cv : None or cross-validation generator
        Any scikit-learn cross-validation generator. Defaults to
        :class:`sklearn.model_selection.KFold`.
    client : None or dask.distributed.Client
        **DEPRECATED:** This option is deprecated and will be removed in
        Verde v2.0.0. If None, then computations are run serially. Otherwise,
        should be a dask ``Client`` object. It will be used to dispatch
        computations to the dask cluster.
    delayed : bool
        If True, will use :func:`dask.delayed` to dispatch computations
        without actually executing them. The returned scores will be a list
        of delayed objects. Call ``.compute()`` on each score or
        :func:`dask.compute` on the entire list to trigger the actual
        computations.

    Returns
    -------
    scores : array
        Array of scores for each split of the cross-validation generator. If
        *delayed*, will be a list of Dask delayed objects (see the *delayed*
        option). If *client* is not None, then the scores will be futures.

    Examples
    --------

    As an example, we can score :class:`verde.Trend` on data that actually
    follows a linear trend.

    >>> from verde import grid_coordinates, Trend
    >>> coords = grid_coordinates((0, 10, -10, -5), spacing=0.1)
    >>> data = 10 - coords[0] + 0.5*coords[1]
    >>> model = Trend(degree=1)

    In this case, the model should perfectly predict the data and R² scores
    should be equal to 1.

    >>> scores = cross_val_score(model, coords, data)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00, 1.00, 1.00

    There are 5 scores because the default cross-validator is
    :class:`sklearn.model_selection.KFold` with ``n_splits=5``.

    We can use different cross-validators by assigning them to the ``cv``
    argument:

    >>> from sklearn.model_selection import ShuffleSplit
    >>> # Set the random state to get reproducible results
    >>> cross_validator = ShuffleSplit(n_splits=3, random_state=0)
    >>> scores = cross_val_score(model, coords, data, cv=cross_validator)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00

    If using many splits, we can speed up computations by running them in
    parallel with Dask:

    >>> cross_validator = ShuffleSplit(n_splits=10, random_state=0)
    >>> scores_delayed = cross_val_score(
    ...     model, coords, data, cv=cross_validator, delayed=True
    ... )
    >>> # The scores are delayed objects.
    >>> # To actually run the computations, call dask.compute
    >>> import dask
    >>> scores = dask.compute(*scores_delayed)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00

    Note that you must have enough RAM to fit multiple models simultaneously,
    so this is best used when fitting several smaller models.

    """
    if client is not None:
        warnings.warn(
            "The 'client' parameter of 'verde.cross_val_score' is deprecated "
            "and will be removed in Verde 2.0.0. "
            "Use the 'delayed' parameter instead.",
            DeprecationWarning,
        )
    coordinates, data, weights = check_fit_input(
        coordinates, data, weights, unpack=False
    )
    if cv is None:
        cv = KFold(shuffle=True, random_state=0, n_splits=5)
    ndata = data[0].size
    fit_args = (coordinates, data, weights)
    scores = []
    for train_index, test_index in cv.split(np.arange(ndata)):
        train = tuple(select(i, train_index) for i in fit_args)
        test = tuple(select(i, test_index) for i in fit_args)
        # Clone the estimator to avoid fitting the same object simultaneously
        # when delayed=True.
        score = dispatch(fit_score, client=client, delayed=delayed)(
            clone(estimator), train, test
        )
        scores.append(score)
    if not delayed and client is None:
        scores = np.asarray(scores)
    return scores
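

# Usage sketch: since cross_val_score returns one score per split, summary
# statistics across splits are a one-liner. The model and region below are
# illustrative only.
def _cross_val_summary_example():
    import numpy as np

    from verde import Trend, grid_coordinates

    coords = grid_coordinates((0, 10, -10, -5), spacing=0.5)
    data = 10 - coords[0] + 0.5 * coords[1]
    scores = cross_val_score(Trend(degree=1), coords, data)
    # Report the mean and spread across the 5 default KFold splits.
    return np.mean(scores), np.std(scores)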


def fit_score(estimator, train_data, test_data):
    """
    Fit an estimator on the training data and then score it on the testing
    data.
    """
    return estimator.fit(*train_data).score(*test_data)


def select(arrays, index):
    """
    Index each array in a tuple of arrays.

    If the arrays tuple contains a ``None``, the entire tuple will be
    returned as is.

    Parameters
    ----------
    arrays : tuple of arrays
    index : array
        An array of indices to select from arrays.

    Returns
    -------
    indexed_arrays : tuple of arrays

    Examples
    --------

    >>> import numpy as np
    >>> select((np.arange(5), np.arange(-3, 2, 1)), [1, 3])
    (array([1, 3]), array([-2,  0]))
    >>> select((None, None, None, None), [1, 2])
    (None, None, None, None)

    """
    if arrays is None or any(i is None for i in arrays):
        return arrays
    return tuple(i.ravel()[index] for i in arrays)
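

# Usage sketch: fit_score and select are the building blocks of a single
# cross-validation round. The 50/50 split below is illustrative, not what
# cross_val_score actually uses.
def _single_split_example():
    import numpy as np

    from verde import Trend, grid_coordinates

    coords = grid_coordinates((0, 10, -10, -5), spacing=1)
    data = 10 - coords[0] + 0.5 * coords[1]
    args = check_fit_input(coords, data, None, unpack=False)
    indices = np.arange(args[1][0].size)
    first, second = indices[: indices.size // 2], indices[indices.size // 2 :]
    train = tuple(select(i, first) for i in args)
    test = tuple(select(i, second) for i in args)
    # Fit on the first half of the points and score on the second half.
    return fit_score(Trend(degree=1), train, test)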