Source code for verde.model_selection

"""
Functions for automating model selection through cross-validation.

Supports using a dask.distributed.Client object for parallelism. The
DummyClient is used as a serial version of the parallel client.
"""
import numpy as np
from sklearn.model_selection import KFold, ShuffleSplit

from .base import check_fit_input


class DummyClient:  # pylint: disable=no-self-use,too-few-public-methods
    """
    Dummy client to mimic a dask.distributed.Client for immediate local
    execution.

    >>> client = DummyClient()
    >>> client.submit(sum, (1, 2, 3))
    6

    """

    def submit(self, function, *args, **kwargs):
        "Execute function with the given arguments and return its output"
        return function(*args, **kwargs)

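
# A minimal sketch (illustrative; this helper is not part of the module) of
# the pattern DummyClient enables: because it mirrors the ``submit`` method
# of dask.distributed.Client, code that takes an optional *client* never
# needs to branch between serial and parallel execution.
def _submit_all_sketch(function, arguments, client=None):
    "Run *function* on each argument through *client* (serially if None)."
    if client is None:
        client = DummyClient()
    # With DummyClient, submit executes immediately and returns plain values;
    # with a dask Client, it returns Future objects instead.
    return [client.submit(function, arg) for arg in arguments]
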

def train_test_split(coordinates, data, weights=None, **kwargs):
    r"""
    Split a dataset into a training and a testing set for cross-validation.

    Similar to :func:`sklearn.model_selection.train_test_split` but is tuned
    to work on multi-component spatial data with optional weights.

    Extra keyword arguments will be passed to
    :class:`sklearn.model_selection.ShuffleSplit`, except for ``n_splits``
    which is always 1.

    Parameters
    ----------
    coordinates : tuple of arrays
        Arrays with the coordinates of each data point. Should be in the
        following order: (easting, northing, vertical, ...).
    data : array or tuple of arrays
        The data values of each data point. If the data has more than one
        component, *data* must be a tuple of arrays (one for each component).
    weights : None or array or tuple of arrays
        If not None, then the weights assigned to each data point. If more
        than one data component is provided, you must provide a weights array
        for each data component (if not None).

    Returns
    -------
    train, test : tuples
        Each is a tuple = (coordinates, data, weights) generated by
        separating the input values randomly.

    Examples
    --------

    >>> import numpy as np
    >>> # Split 2-component data with weights
    >>> data = (np.array([1, 3, 5, 7]), np.array([0, 2, 4, 6]))
    >>> coordinates = (np.arange(4), np.arange(-4, 0))
    >>> weights = (np.array([1, 1, 2, 1]), np.array([1, 2, 1, 1]))
    >>> train, test = train_test_split(coordinates, data, weights,
    ...                                random_state=0)
    >>> print("Coordinates:", train[0], test[0], sep='\n  ')
    Coordinates:
      (array([3, 1, 0]), array([-1, -3, -4]))
      (array([2]), array([-2]))
    >>> print("Data:", train[1], test[1], sep='\n  ')
    Data:
      (array([7, 3, 1]), array([6, 2, 0]))
      (array([5]), array([4]))
    >>> print("Weights:", train[2], test[2], sep='\n  ')
    Weights:
      (array([1, 1, 1]), array([1, 2, 1]))
      (array([2]), array([1]))
    >>> # Split single component data without weights
    >>> train, test = train_test_split(coordinates, data[0], None,
    ...                                random_state=0)
    >>> print("Coordinates:", train[0], test[0], sep='\n  ')
    Coordinates:
      (array([3, 1, 0]), array([-1, -3, -4]))
      (array([2]), array([-2]))
    >>> print("Data:", train[1], test[1], sep='\n  ')
    Data:
      (array([7, 3, 1]),)
      (array([5]),)
    >>> print("Weights:", train[2], test[2], sep='\n  ')
    Weights:
      (None,)
      (None,)

    """
    args = check_fit_input(coordinates, data, weights, unpack=False)
    ndata = args[1][0].size
    indices = np.arange(ndata)
    split = next(ShuffleSplit(n_splits=1, **kwargs).split(indices))
    train, test = (tuple(select(i, index) for i in args) for index in split)
    return train, test
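

# A short sketch of the keyword passthrough in train_test_split: everything
# except n_splits is forwarded to sklearn.model_selection.ShuffleSplit, so
# its test_size argument, for example, controls the fraction of points held
# out. The sizes below assume scikit-learn's standard ShuffleSplit behavior:
#
#     >>> import numpy as np
#     >>> coordinates = (np.arange(10), np.arange(10))
#     >>> train, test = train_test_split(coordinates, np.arange(10),
#     ...                                test_size=0.5, random_state=0)
#     >>> train[1][0].size, test[1][0].size
#     (5, 5)
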

def cross_val_score(estimator, coordinates, data, weights=None, cv=None,
                    client=None):
    """
    Score an estimator/gridder using cross-validation.

    Similar to :func:`sklearn.model_selection.cross_val_score` but modified
    to accept spatial multi-component data with weights.

    By default, will use :class:`sklearn.model_selection.KFold` to split the
    dataset. Another cross-validation class can be passed in through the *cv*
    argument.

    Can optionally run in parallel using `dask <https://dask.pydata.org/>`__.
    To do this, pass in a :class:`dask.distributed.Client` as the *client*
    argument. Tasks in this function will be submitted to the dask cluster,
    which can be local. In this case, the resulting scores won't be the
    actual values but :class:`dask.distributed.Future` objects. Call their
    ``.result()`` methods to get back the values or pass them along to other
    dask computations.

    Parameters
    ----------
    estimator : verde gridder
        Any verde gridder class that has the ``fit`` and ``score`` methods.
    coordinates : tuple of arrays
        Arrays with the coordinates of each data point. Should be in the
        following order: (easting, northing, vertical, ...).
    data : array or tuple of arrays
        The data values of each data point. If the data has more than one
        component, *data* must be a tuple of arrays (one for each component).
    weights : None or array or tuple of arrays
        If not None, then the weights assigned to each data point. If more
        than one data component is provided, you must provide a weights array
        for each data component (if not None).
    cv : None or cross-validation generator
        Any scikit-learn cross-validation generator. Defaults to
        :class:`sklearn.model_selection.KFold`.
    client : None or dask.distributed.Client
        If None, then computations are run serially. Otherwise, should be a
        dask ``Client`` object. It will be used to dispatch computations to
        the dask cluster.

    Returns
    -------
    scores : list
        List of scores for each split of the cross-validation generator. If
        *client* is not None, then the scores will be futures.

    Examples
    --------

    >>> from verde import grid_coordinates, Trend
    >>> coords = grid_coordinates((0, 10, -10, -5), spacing=0.1)
    >>> data = 10 - coords[0] + 0.5*coords[1]
    >>> # A linear trend should perfectly predict this data
    >>> scores = cross_val_score(Trend(degree=1), coords, data)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00, 1.00, 1.00
    >>> # To run parallel, we need to create a dask.distributed Client. It
    >>> # will create a local cluster if no arguments are given so we can
    >>> # run the scoring on a single machine.
    >>> from dask.distributed import Client
    >>> client = Client()
    >>> # The scoring will now only submit tasks to our local cluster
    >>> scores = cross_val_score(Trend(degree=1), coords, data, client=client)
    >>> # The scores are not the actual values but Futures
    >>> type(scores[0])
    <class 'distributed.client.Future'>
    >>> # We need to call .result() to get back the actual value
    >>> print('{:.2f}'.format(scores[0].result()))
    1.00

    """
    coordinates, data, weights = check_fit_input(
        coordinates, data, weights, unpack=False
    )
    if client is None:
        client = DummyClient()
    if cv is None:
        cv = KFold(shuffle=True, random_state=0, n_splits=5)
    ndata = data[0].size
    args = (coordinates, data, weights)
    scores = []
    for train, test in cv.split(np.arange(ndata)):
        train_data, test_data = (
            tuple(select(i, index) for i in args) for index in (train, test)
        )
        score = client.submit(fit_score, estimator, train_data, test_data)
        scores.append(score)
    return scores
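

# A brief sketch of swapping the cross-validator, as the docstring of
# cross_val_score describes: any scikit-learn splitter can be passed through
# the *cv* argument. This hedged example uses ShuffleSplit (already imported
# at the top of this module) and repeats the data setup from the docstring:
#
#     >>> from verde import grid_coordinates, Trend
#     >>> coords = grid_coordinates((0, 10, -10, -5), spacing=0.1)
#     >>> data = 10 - coords[0] + 0.5*coords[1]
#     >>> cv = ShuffleSplit(n_splits=3, test_size=0.1, random_state=0)
#     >>> scores = cross_val_score(Trend(degree=1), coords, data, cv=cv)
#     >>> len(scores)
#     3
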

def fit_score(estimator, train_data, test_data):
    """
    Fit an estimator on the training data and then score it on the testing
    data.
    """
    estimator.fit(*train_data)
    return estimator.score(*test_data)


def select(arrays, index):
    """
    Index each array in a tuple of arrays.

    If the arrays tuple contains a ``None``, the entire tuple will be
    returned as is.

    Parameters
    ----------
    arrays : tuple of arrays
    index : array
        An array of indices to select from arrays.

    Returns
    -------
    indexed_arrays : tuple of arrays

    Examples
    --------

    >>> import numpy as np
    >>> select((np.arange(5), np.arange(-3, 2, 1)), [1, 3])
    (array([1, 3]), array([-2,  0]))
    >>> select((None, None, None, None), [1, 2])
    (None, None, None, None)

    """
    if arrays is None or any(i is None for i in arrays):
        return arrays
    return tuple(i.ravel()[index] for i in arrays)
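

# A small sketch of why select calls ravel(): each array is flattened before
# indexing, so select also works on 2D arrays like the coordinate grids made
# by grid_coordinates, with indices referring to the flattened order:
#
#     >>> import numpy as np
#     >>> select((np.arange(6).reshape(2, 3),), [0, 4])
#     (array([0, 4]),)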