Source code for verde.model_selection

"""
Functions for automating model selection through cross-validation.

Supports using a dask.distributed.Client object for parallelism. The
DummyClient is used as a serial version of the parallel client.
"""
import warnings

import numpy as np
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.base import clone

from .base import check_fit_input
from .utils import dispatch


# Otherwise, DeprecationWarning won't be shown, kind of defeating the purpose.
warnings.simplefilter("default")
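

# For reference, a minimal sketch of what the `dispatch` helper imported from
# .utils is assumed to do (an illustration only; the real implementation lives
# in verde/utils.py and may differ): return the function itself for serial
# execution, a lazy dask.delayed wrapper, or a submission to a
# dask.distributed Client.
def _dispatch_sketch(function, client=None, delayed=False):
    import functools

    import dask

    if delayed:
        # Lazy mode: calling the result builds a task graph to compute later.
        return dask.delayed(function)
    if client is not None:
        # Parallel mode: calling the result submits the work to the cluster.
        return functools.partial(client.submit, function)
    # Serial mode: call the function directly.
    return function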


def train_test_split(coordinates, data, weights=None, **kwargs):
    r"""
    Split a dataset into a training and a testing set for cross-validation.

    Similar to :func:`sklearn.model_selection.train_test_split` but is tuned
    to work on multi-component spatial data with optional weights.

    Extra keyword arguments will be passed to
    :class:`sklearn.model_selection.ShuffleSplit`, except for ``n_splits``
    which is always 1.

    Parameters
    ----------
    coordinates : tuple of arrays
        Arrays with the coordinates of each data point. Should be in the
        following order: (easting, northing, vertical, ...).
    data : array or tuple of arrays
        The data values of each data point. If the data has more than one
        component, *data* must be a tuple of arrays (one for each component).
    weights : None or array or tuple of arrays
        If not None, then the weights assigned to each data point. If more
        than one data component is provided, you must provide a weights array
        for each data component (if not None).

    Returns
    -------
    train, test : tuples
        Each is a tuple = (coordinates, data, weights) generated by
        separating the input values randomly.

    Examples
    --------

    >>> import numpy as np
    >>> # Split 2-component data with weights
    >>> data = (np.array([1, 3, 5, 7]), np.array([0, 2, 4, 6]))
    >>> coordinates = (np.arange(4), np.arange(-4, 0))
    >>> weights = (np.array([1, 1, 2, 1]), np.array([1, 2, 1, 1]))
    >>> train, test = train_test_split(coordinates, data, weights,
    ...                                random_state=0)
    >>> print("Coordinates:", train[0], test[0], sep='\n ')
    Coordinates:
     (array([3, 1, 0]), array([-1, -3, -4]))
     (array([2]), array([-2]))
    >>> print("Data:", train[1], test[1], sep='\n ')
    Data:
     (array([7, 3, 1]), array([6, 2, 0]))
     (array([5]), array([4]))
    >>> print("Weights:", train[2], test[2], sep='\n ')
    Weights:
     (array([1, 1, 1]), array([1, 2, 1]))
     (array([2]), array([1]))
    >>> # Split single component data without weights
    >>> train, test = train_test_split(coordinates, data[0], None,
    ...                                random_state=0)
    >>> print("Coordinates:", train[0], test[0], sep='\n ')
    Coordinates:
     (array([3, 1, 0]), array([-1, -3, -4]))
     (array([2]), array([-2]))
    >>> print("Data:", train[1], test[1], sep='\n ')
    Data:
     (array([7, 3, 1]),)
     (array([5]),)
    >>> print("Weights:", train[2], test[2], sep='\n ')
    Weights:
     (None,)
     (None,)

    """
    args = check_fit_input(coordinates, data, weights, unpack=False)
    ndata = args[1][0].size
    indices = np.arange(ndata)
    split = next(ShuffleSplit(n_splits=1, **kwargs).split(indices))
    train, test = (tuple(select(i, index) for i in args) for index in split)
    return train, test
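

# Usage sketch (hypothetical values): any ShuffleSplit keyword argument, such
# as test_size, can be forwarded through train_test_split to control the
# split proportions.
def _train_test_split_example():
    import numpy as np

    coordinates = (np.arange(10), np.arange(10))
    data = np.arange(10) ** 2
    # Reserve 30% of the points for testing; random_state makes it repeatable.
    return train_test_split(coordinates, data, test_size=0.3, random_state=0)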


def cross_val_score(
    estimator, coordinates, data, weights=None, cv=None, client=None, delayed=False
):
    """
    Score an estimator/gridder using cross-validation.

    Similar to :func:`sklearn.model_selection.cross_val_score` but modified
    to accept spatial multi-component data with weights.

    By default, will use :class:`sklearn.model_selection.KFold` with
    ``n_splits=5`` and ``random_state=0`` to split the dataset. Any other
    cross-validation class can be passed in through the *cv* argument.

    Can optionally run in parallel using :mod:`dask`. To do this, use
    ``delayed=True`` to dispatch computations with :func:`dask.delayed`
    instead of running them. The returned scores will be "lazy" objects
    instead of the actual scores. To trigger the computation (which Dask will
    run in parallel) call the ``.compute()`` method of each score or
    :func:`dask.compute` with the entire list of scores.

    .. warning::

        The ``client`` parameter is deprecated and will be removed in Verde
        v2.0.0. Use ``delayed`` instead.

    Parameters
    ----------
    estimator : verde gridder
        Any verde gridder class that has the ``fit`` and ``score`` methods.
    coordinates : tuple of arrays
        Arrays with the coordinates of each data point. Should be in the
        following order: (easting, northing, vertical, ...).
    data : array or tuple of arrays
        The data values of each data point. If the data has more than one
        component, *data* must be a tuple of arrays (one for each component).
    weights : None or array or tuple of arrays
        If not None, then the weights assigned to each data point. If more
        than one data component is provided, you must provide a weights array
        for each data component (if not None).
    cv : None or cross-validation generator
        Any scikit-learn cross-validation generator. Defaults to
        :class:`sklearn.model_selection.KFold`.
    client : None or dask.distributed.Client
        **DEPRECATED:** This option is deprecated and will be removed in
        Verde v2.0.0. If None, then computations are run serially. Otherwise,
        should be a dask ``Client`` object. It will be used to dispatch
        computations to the dask cluster.
    delayed : bool
        If True, will use :func:`dask.delayed` to dispatch computations
        without actually executing them. The returned scores will be a list
        of delayed objects. Call ``.compute()`` on each score or
        :func:`dask.compute` on the entire list to trigger the actual
        computations.

    Returns
    -------
    scores : array
        Array of scores for each split of the cross-validation generator. If
        *delayed*, will be a list of Dask delayed objects (see the *delayed*
        option). If *client* is not None, then the scores will be futures.

    Examples
    --------

    As an example, we can score :class:`verde.Trend` on data that actually
    follows a linear trend.

    >>> from verde import grid_coordinates, Trend
    >>> coords = grid_coordinates((0, 10, -10, -5), spacing=0.1)
    >>> data = 10 - coords[0] + 0.5*coords[1]
    >>> model = Trend(degree=1)

    In this case, the model should perfectly predict the data and R² scores
    should be equal to 1.

    >>> scores = cross_val_score(model, coords, data)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00, 1.00, 1.00

    There are 5 scores because the default cross-validator is
    :class:`sklearn.model_selection.KFold` with ``n_splits=5``.

    We can use different cross-validators by assigning them to the ``cv``
    argument:

    >>> from sklearn.model_selection import ShuffleSplit
    >>> # Set the random state to get reproducible results
    >>> cross_validator = ShuffleSplit(n_splits=3, random_state=0)
    >>> scores = cross_val_score(model, coords, data, cv=cross_validator)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00

    If using many splits, we can speed up computations by running them in
    parallel with Dask:

    >>> cross_validator = ShuffleSplit(n_splits=10, random_state=0)
    >>> scores_delayed = cross_val_score(
    ...     model, coords, data, cv=cross_validator, delayed=True
    ... )
    >>> # The scores are delayed objects.
    >>> # To actually run the computations, call dask.compute
    >>> import dask
    >>> scores = dask.compute(*scores_delayed)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00

    Note that you must have enough RAM to fit multiple models simultaneously,
    so this is best used when fitting several smaller models.

    """
    if client is not None:
        warnings.warn(
            "The 'client' parameter of 'verde.cross_val_score' is deprecated "
            "and will be removed in Verde 2.0.0. "
            "Use the 'delayed' parameter instead.",
            DeprecationWarning,
        )
    coordinates, data, weights = check_fit_input(
        coordinates, data, weights, unpack=False
    )
    if cv is None:
        cv = KFold(shuffle=True, random_state=0, n_splits=5)
    ndata = data[0].size
    fit_args = (coordinates, data, weights)
    scores = []
    for train_index, test_index in cv.split(np.arange(ndata)):
        train = tuple(select(i, train_index) for i in fit_args)
        test = tuple(select(i, test_index) for i in fit_args)
        # Clone the estimator to avoid fitting the same object simultaneously
        # when delayed=True.
        score = dispatch(fit_score, client=client, delayed=delayed)(
            clone(estimator), train, test
        )
        scores.append(score)
    if not delayed and client is None:
        scores = np.asarray(scores)
    return scores
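

# Usage sketch: since cross_val_score returns one score per split, summary
# statistics across splits are a one-liner. The model and region below are
# illustrative only.
def _cross_val_summary_example():
    import numpy as np

    from verde import Trend, grid_coordinates

    coords = grid_coordinates((0, 10, -10, -5), spacing=0.5)
    data = 10 - coords[0] + 0.5 * coords[1]
    scores = cross_val_score(Trend(degree=1), coords, data)
    # Report the mean and spread across the 5 default KFold splits.
    return np.mean(scores), np.std(scores)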


def fit_score(estimator, train_data, test_data):
    """
    Fit an estimator on the training data and then score it on the testing
    data.
    """
    return estimator.fit(*train_data).score(*test_data)


def select(arrays, index):
    """
    Index each array in a tuple of arrays.

    If the arrays tuple contains a ``None``, the entire tuple will be
    returned as is.

    Parameters
    ----------
    arrays : tuple of arrays
    index : array
        An array of indices to select from arrays.

    Returns
    -------
    indexed_arrays : tuple of arrays

    Examples
    --------

    >>> import numpy as np
    >>> select((np.arange(5), np.arange(-3, 2, 1)), [1, 3])
    (array([1, 3]), array([-2,  0]))
    >>> select((None, None, None, None), [1, 2])
    (None, None, None, None)

    """
    if arrays is None or any(i is None for i in arrays):
        return arrays
    return tuple(i.ravel()[index] for i in arrays)
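

# Usage sketch: fit_score and select are the building blocks of a single
# cross-validation round. The 50/50 split below is illustrative, not what
# cross_val_score actually uses.
def _single_split_example():
    import numpy as np

    from verde import Trend, grid_coordinates

    coords = grid_coordinates((0, 10, -10, -5), spacing=1)
    data = 10 - coords[0] + 0.5 * coords[1]
    args = check_fit_input(coords, data, None, unpack=False)
    indices = np.arange(args[1][0].size)
    first, second = indices[: indices.size // 2], indices[indices.size // 2 :]
    train = tuple(select(i, first) for i in args)
    test = tuple(select(i, second) for i in args)
    # Fit on the first half of the points and score on the second half.
    return fit_score(Trend(degree=1), train, test)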