Source code for verde.model_selection

# Copyright (c) 2017 The Verde Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Functions for automating model selection through cross-validation.
"""
import warnings

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.utils import check_random_state

from .base import BaseBlockCrossValidator, check_fit_input, n_1d_arrays
from .base.utils import score_estimator
from .coordinates import block_split
from .utils import dispatch, partition_by_sum


[docs]class BlockShuffleSplit(BaseBlockCrossValidator): """ Random permutation of spatial blocks cross-validator. Yields indices to split data into training and test sets. Data are first grouped into rectangular blocks of size given by the *spacing* argument. Alternatively, blocks can be defined by the number of blocks in each dimension using the *shape* argument instead of *spacing*. The blocks are then split into testing and training sets randomly. The proportion of blocks assigned to each set is controlled by *test_size* and/or *train_size*. However, the total amount of actual data points in each set could be different from these values since blocks can have a different number of data points inside them. To guarantee that the proportion of actual data is as close as possible to the proportion of blocks, this cross-validator generates an extra number of splits and selects the one with proportion of data points in each set closer to the desired amount [Valavi_etal2019]_. The number of balancing splits per iteration is controlled by the *balancing* argument. This cross-validator is preferred over :class:`sklearn.model_selection.ShuffleSplit` for spatial data to avoid overestimating cross-validation scores. This can happen because of the inherent autocorrelation that is usually associated with this type of data (points that are close together are more likely to have similar values). See [Roberts_etal2017]_ for an overview of this topic. .. note:: Like :class:`sklearn.model_selection.ShuffleSplit`, this cross-validator cannot guarantee that all folds will be different, although this is still very likely for sizeable datasets. Parameters ---------- spacing : float, tuple = (s_north, s_east), or None The block size in the South-North and West-East directions, respectively. A single value means that the spacing is equal in both directions. If None, then *shape* **must be provided**. shape : tuple = (n_north, n_east) or None The number of blocks in the South-North and West-East directions, respectively. If None, then *spacing* **must be provided**. n_splits : int, default 10 Number of re-shuffling & splitting iterations. test_size : float, int, None, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If ``train_size`` is also None, it will be set to 0.1. train_size : float, int, or None, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. balancing : int The number of splits generated per iteration to try to balance the amount of data in each set so that *test_size* and *train_size* are respected. If 1, then no extra splits are generated (essentially disabling the balancing). Must be >= 1. See also -------- train_test_split : Split a dataset into a training and a testing set. cross_val_score : Score an estimator/gridder using cross-validation. Examples -------- >>> from verde import grid_coordinates >>> import numpy as np >>> # Make a regular grid of data points >>> coords = grid_coordinates(region=(0, 3, -10, -7), spacing=1) >>> # Need to convert the coordinates into a feature matrix >>> X = np.transpose([i.ravel() for i in coords]) >>> shuffle = BlockShuffleSplit(spacing=1.5, n_splits=3, random_state=0) >>> # These are the 1D indices of the points belonging to each set >>> for train, test in shuffle.split(X): ... print("Train: {} Test: {}".format(train, test)) Train: [ 0 1 2 3 4 5 6 7 10 11 14 15] Test: [ 8 9 12 13] Train: [ 2 3 6 7 8 9 10 11 12 13 14 15] Test: [0 1 4 5] Train: [ 0 1 4 5 8 9 10 11 12 13 14 15] Test: [2 3 6 7] >>> # A better way to visualize this is to create a 2D array and put >>> # "train" or "test" in the corresponding locations. >>> shape = coords[0].shape >>> mask = np.full(shape=shape, fill_value=" ") >>> for iteration, (train, test) in enumerate(shuffle.split(X)): ... # The index needs to be converted to 2D so we can index our matrix. ... mask[np.unravel_index(train, shape)] = "train" ... mask[np.unravel_index(test, shape)] = " test" ... print("Iteration {}:".format(iteration)) ... print(mask) Iteration 0: [['train' 'train' 'train' 'train'] ['train' 'train' 'train' 'train'] [' test' ' test' 'train' 'train'] [' test' ' test' 'train' 'train']] Iteration 1: [[' test' ' test' 'train' 'train'] [' test' ' test' 'train' 'train'] ['train' 'train' 'train' 'train'] ['train' 'train' 'train' 'train']] Iteration 2: [['train' 'train' ' test' ' test'] ['train' 'train' ' test' ' test'] ['train' 'train' 'train' 'train'] ['train' 'train' 'train' 'train']] """ def __init__( self, spacing=None, shape=None, n_splits=10, test_size=0.1, train_size=None, random_state=None, balancing=10, ): super().__init__(spacing=spacing, shape=shape, n_splits=n_splits) if balancing < 1: raise ValueError( "The *balancing* argument must be >= 1. To disable balancing, use 1." ) self.test_size = test_size self.train_size = train_size self.random_state = random_state self.balancing = balancing def _iter_test_indices(self, X=None, y=None, groups=None): # noqa: N803,U100 """ Generates integer indices corresponding to test sets. Runs several iterations until a split is found that yields blocks with the right amount of data points in it. Parameters ---------- X : array-like, shape (n_samples, 2) Columns should be the easting and northing coordinates of data points, respectively. y : array-like, shape (n_samples,) The target variable for supervised learning problems. Always ignored. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. Always ignored. Yields ------ test : ndarray The testing set indices for that split. """ labels = block_split( coordinates=(X[:, 0], X[:, 1]), spacing=self.spacing, shape=self.shape, region=None, adjust="spacing", )[1] block_ids = np.unique(labels) # Generate many more splits so that we can pick and choose the ones # that have the right balance of training and testing data. shuffle = ShuffleSplit( n_splits=self.n_splits * self.balancing, test_size=self.test_size, train_size=self.train_size, random_state=self.random_state, ).split(block_ids) for _ in range(self.n_splits): test_sets, balance = [], [] for _ in range(self.balancing): train_blocks, test_blocks = next(shuffle) train_points = np.where(np.isin(labels, block_ids[train_blocks]))[0] test_points = np.where(np.isin(labels, block_ids[test_blocks]))[0] # The proportion of data points assigned to each group should # be close the proportion of blocks assigned to each group. balance.append( abs( train_points.size / test_points.size - train_blocks.size / test_blocks.size ) ) test_sets.append(test_points) best = np.argmin(balance) yield test_sets[best]
[docs]class BlockKFold(BaseBlockCrossValidator): """ K-Folds over spatial blocks cross-validator. Yields indices to split data into training and test sets. Data are first grouped into rectangular blocks of size given by the *spacing* argument. Alternatively, blocks can be defined by the number of blocks in each dimension using the *shape* argument instead of *spacing*. The blocks are then split into testing and training sets iteratively along k folds of the data (k is given by *n_splits*). By default, the blocks are split into folds in a way that makes each fold have approximately the same number of data points. Sometimes this might not be possible, which can happen if the number of splits is close to the number of blocks. In these cases, each fold will have the same number of blocks, regardless of how many data points are in each block. This behaviour can also be disabled by setting ``balance=False``. Shuffling the blocks prior to splitting is strongly encouraged. Not shuffling will essentially lead to the creation of *n_splits* large blocks since blocks are spatially adjacent when not shuffled. The default behaviour is not to shuffle for compatibility with similar cross-validators in scikit-learn. This cross-validator is preferred over :class:`sklearn.model_selection.KFold` for spatial data to avoid overestimating cross-validation scores. This can happen because of the inherent autocorrelation that is usually associated with this type of data (points that are close together are more likely to have similar values). See [Roberts_etal2017]_ for an overview of this topic. Parameters ---------- spacing : float, tuple = (s_north, s_east), or None The block size in the South-North and West-East directions, respectively. A single value means that the spacing is equal in both directions. If None, then *shape* **must be provided**. shape : tuple = (n_north, n_east) or None The number of blocks in the South-North and West-East directions, respectively. If None, then *spacing* **must be provided**. n_splits : int Number of folds. Must be at least 2. shuffle : bool Whether to shuffle the data before splitting into batches. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. balance : bool Whether or not to split blocks into fold with approximately equal number of data points. If False, each fold will have the same number of blocks (which can have different number of data points in them). See also -------- train_test_split : Split a dataset into a training and a testing set. cross_val_score : Score an estimator/gridder using cross-validation. Examples -------- >>> from verde import grid_coordinates >>> import numpy as np >>> # Make a regular grid of data points >>> coords = grid_coordinates(region=(0, 3, -10, -7), spacing=1) >>> # Need to convert the coordinates into a feature matrix >>> X = np.transpose([i.ravel() for i in coords]) >>> kfold = BlockKFold(spacing=1.5, n_splits=4) >>> # These are the 1D indices of the points belonging to each set >>> for train, test in kfold.split(X): ... print("Train: {} Test: {}".format(train, test)) Train: [ 2 3 6 7 8 9 10 11 12 13 14 15] Test: [0 1 4 5] Train: [ 0 1 4 5 8 9 10 11 12 13 14 15] Test: [2 3 6 7] Train: [ 0 1 2 3 4 5 6 7 10 11 14 15] Test: [ 8 9 12 13] Train: [ 0 1 2 3 4 5 6 7 8 9 12 13] Test: [10 11 14 15] >>> # A better way to visualize this is to create a 2D array and put >>> # "train" or "test" in the corresponding locations. >>> shape = coords[0].shape >>> mask = np.full(shape=shape, fill_value=" ") >>> for iteration, (train, test) in enumerate(kfold.split(X)): ... # The index needs to be converted to 2D so we can index our matrix. ... mask[np.unravel_index(train, shape)] = "train" ... mask[np.unravel_index(test, shape)] = " test" ... print("Iteration {}:".format(iteration)) ... print(mask) Iteration 0: [[' test' ' test' 'train' 'train'] [' test' ' test' 'train' 'train'] ['train' 'train' 'train' 'train'] ['train' 'train' 'train' 'train']] Iteration 1: [['train' 'train' ' test' ' test'] ['train' 'train' ' test' ' test'] ['train' 'train' 'train' 'train'] ['train' 'train' 'train' 'train']] Iteration 2: [['train' 'train' 'train' 'train'] ['train' 'train' 'train' 'train'] [' test' ' test' 'train' 'train'] [' test' ' test' 'train' 'train']] Iteration 3: [['train' 'train' 'train' 'train'] ['train' 'train' 'train' 'train'] ['train' 'train' ' test' ' test'] ['train' 'train' ' test' ' test']] For spatial data, it's often good to shuffle the blocks before assigning them to folds: >>> # Set the random_state to make sure we always get the same result. >>> kfold = BlockKFold( ... spacing=1.5, n_splits=4, shuffle=True, random_state=123, ... ) >>> for train, test in kfold.split(X): ... print("Train: {} Test: {}".format(train, test)) Train: [ 0 1 2 3 4 5 6 7 8 9 12 13] Test: [10 11 14 15] Train: [ 2 3 6 7 8 9 10 11 12 13 14 15] Test: [0 1 4 5] Train: [ 0 1 4 5 8 9 10 11 12 13 14 15] Test: [2 3 6 7] Train: [ 0 1 2 3 4 5 6 7 10 11 14 15] Test: [ 8 9 12 13] These should be the same splits as we got before but in a different order. This only happens because in this example we have the number of splits equal to the number of blocks (4). With real data the effects would be more dramatic. """ def __init__( self, spacing=None, shape=None, n_splits=5, shuffle=False, random_state=None, balance=True, ): super().__init__(spacing=spacing, shape=shape, n_splits=n_splits) if n_splits < 2: raise ValueError( "Number of splits must be >=2 for BlockKFold. Given {}.".format( n_splits ) ) self.shuffle = shuffle self.random_state = random_state self.balance = balance def _iter_test_indices(self, X=None, y=None, groups=None): # noqa: N803,U100 """ Generates integer indices corresponding to test sets. Parameters ---------- X : array-like, shape (n_samples, 2) Columns should be the easting and northing coordinates of data points, respectively. y : array-like, shape (n_samples,) The target variable for supervised learning problems. Always ignored. groups : array-like, with shape (n_samples,), optional Group labels for the samples used while splitting the dataset into train/test set. Always ignored. Yields ------ test : ndarray The testing set indices for that split. """ labels = block_split( coordinates=(X[:, 0], X[:, 1]), spacing=self.spacing, shape=self.shape, region=None, adjust="spacing", )[1] block_ids = np.unique(labels) if self.n_splits > block_ids.size: raise ValueError( "Number of k-fold splits ({}) cannot be greater than the number of " "blocks ({}). Either decrease n_splits or increase the number of " "blocks.".format(self.n_splits, block_ids.size) ) if self.shuffle: check_random_state(self.random_state).shuffle(block_ids) if self.balance: block_sizes = [np.isin(labels, i).sum() for i in block_ids] try: split_points = partition_by_sum(block_sizes, parts=self.n_splits) folds = np.split(np.arange(block_ids.size), split_points) except ValueError: warnings.warn( "Could not balance folds to have approximately the same " "number of data points. Dividing into folds with equal " "number of blocks instead. Decreasing n_splits or increasing " "the number of blocks may help.", UserWarning, ) folds = [i for _, i in KFold(n_splits=self.n_splits).split(block_ids)] else: folds = [i for _, i in KFold(n_splits=self.n_splits).split(block_ids)] for test_blocks in folds: test_points = np.where(np.isin(labels, block_ids[test_blocks]))[0] yield test_points
[docs]def train_test_split( coordinates, data, weights=None, spacing=None, shape=None, **kwargs ): r""" Split a dataset into a training and a testing set for cross-validation. Similar to :func:`sklearn.model_selection.train_test_split` but is tuned to work on single- or multi-component spatial data with optional weights. If arguments *shape* or *spacing* are provided, will group the data by spatial blocks before random splitting (using :class:`verde.BlockShuffleSplit` instead of :class:`sklearn.model_selection.ShuffleSplit`). The argument *spacing* specifies the size of the spatial blocks. Alternatively, use *shape* to specify the number of blocks in each dimension. Extra keyword arguments will be passed to the cross-validation class. The exception is ``n_splits`` which is always 1. Grouping by spatial blocks is preferred over plain random splits for spatial data to avoid overestimating validation scores. This can happen because of the inherent autocorrelation that is usually associated with this type of data (points that are close together are more likely to have similar values). See [Roberts_etal2017]_ for an overview of this topic. To use spatial blocking, you **must provide** a *spacing* or *shape* argument (see below). Parameters ---------- coordinates : tuple of arrays Arrays with the coordinates of each data point. Should be in the following order: (easting, northing, vertical, ...). data : array or tuple of arrays the data values of each data point. If the data has more than one component, *data* must be a tuple of arrays (one for each component). weights : none or array or tuple of arrays if not none, then the weights assigned to each data point. If more than one data component is provided, you must provide a weights array for each data component (if not none). spacing : float, tuple = (s_north, s_east), or None The spatial block size in the South-North and West-East directions, respectively. A single value means that the spacing is equal in both directions. If None, then *shape* must be provided in order to use spatial blocking. shape : tuple = (n_north, n_east) or None The number of blocks in the South-North and West-East directions, respectively. If None, then *spacing* must be provided in order to use spatial blocking. Returns ------- train, test : tuples Each is a tuple = (coordinates, data, weights) generated by separating the input values randomly. See also -------- cross_val_score : Score an estimator/gridder using cross-validation. BlockShuffleSplit : Random permutation of spatial blocks cross-validator. Examples -------- To randomly split the data between training and testing sets: >>> import numpy as np >>> # Make some data >>> data = np.array([10, 30, 50, 70]) >>> coordinates = (np.arange(4), np.arange(-4, 0)) >>> train, test = train_test_split(coordinates, data, random_state=0) >>> # The training set: >>> print("coords:", train[0]) coords: (array([3, 1, 0]), array([-1, -3, -4])) >>> print("data:", train[1]) data: (array([70, 30, 10]),) >>> # The testing set: >>> print("coords:", test[0]) coords: (array([2]), array([-2])) >>> print("data:", test[1]) data: (array([50]),) If weights are given, they will also be split among the sets: >>> weights = np.array([4, 3, 2, 5]) >>> train, test = train_test_split( ... coordinates, data, weights, random_state=0, ... ) >>> # The training set: >>> print("coords:", train[0]) coords: (array([3, 1, 0]), array([-1, -3, -4])) >>> print("data:", train[1]) data: (array([70, 30, 10]),) >>> print("weights:", train[2]) weights: (array([5, 3, 4]),) >>> # The testing set: >>> print("coords:", test[0]) coords: (array([2]), array([-2])) >>> print("data:", test[1]) data: (array([50]),) >>> print("weights:", test[2]) weights: (array([2]),) Data with multiple components can also be split: >>> data = (np.array([10, 30, 50, 70]), np.array([-70, -50, -30, -10])) >>> train, test = train_test_split(coordinates, data, random_state=0) >>> # The training set: >>> print("coords:", train[0]) coords: (array([3, 1, 0]), array([-1, -3, -4])) >>> print("data:", train[1]) data: (array([70, 30, 10]), array([-10, -50, -70])) >>> # The testing set: >>> print("coords:", test[0]) coords: (array([2]), array([-2])) >>> print("data:", test[1]) data: (array([50]), array([-30])) To split data grouped in spatial blocks: >>> from verde import grid_coordinates >>> # Make a regular grid of data points >>> coordinates = grid_coordinates(region=(0, 3, 4, 7), spacing=1) >>> data = np.arange(16).reshape((4, 4)) >>> # We must specify the size of the blocks via the spacing argument. >>> # Blocks of 1.5 will split the domain into 4 blocks. >>> train, test = train_test_split( ... coordinates, data, random_state=0, spacing=1.5, ... ) >>> # The training set: >>> print("coords:", train[0][0], train[0][1], sep="\n") coords: [0. 1. 2. 3. 0. 1. 2. 3. 2. 3. 2. 3.] [4. 4. 4. 4. 5. 5. 5. 5. 6. 6. 7. 7.] >>> print("data:", train[1]) data: (array([ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15]),) >>> # The testing set: >>> print("coords:", test[0][0], test[0][1]) coords: [0. 1. 0. 1.] [6. 6. 7. 7.] >>> print("data:", test[1]) data: (array([ 8, 9, 12, 13]),) """ args = check_fit_input(coordinates, data, weights, unpack=False) if spacing is None and shape is None: indices = np.arange(args[1][0].size) shuffle = ShuffleSplit(n_splits=1, **kwargs).split(indices) else: feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) shuffle = BlockShuffleSplit( n_splits=1, spacing=spacing, shape=shape, **kwargs ).split(feature_matrix) split = next(shuffle) train, test = (tuple(select(i, index) for i in args) for index in split) return train, test
[docs]def cross_val_score( estimator, coordinates, data, weights=None, cv=None, client=None, delayed=False, scoring=None, ): """ Score an estimator/gridder using cross-validation. Similar to :func:`sklearn.model_selection.cross_val_score` but modified to accept spatial multi-component data with weights. By default, will use :class:`sklearn.model_selection.KFold` with ``n_splits=5`` and ``random_state=0`` to split the dataset. Any other cross-validation class from scikit-learn or Verde can be passed in through the *cv* argument. The score is calculated using the estimator/gridder's ``.score`` method by default. Alternatively, use the *scoring* parameter to specify a different scoring function (e.g., mean square error, mean absolute error, etc). Can optionally run in parallel using :mod:`dask`. To do this, use ``delayed=True`` to dispatch computations with :func:`dask.delayed` instead of running them. The returned scores will be "lazy" objects instead of the actual scores. To trigger the computation (which Dask will run in parallel) call the `.compute()` method of each score or :func:`dask.compute` with the entire list of scores. .. warning:: The ``client`` parameter is deprecated and will be removed in Verde v2.0.0. Use ``delayed`` instead. Parameters ---------- estimator : verde gridder Any verde gridder class that has the ``fit`` and ``score`` methods. coordinates : tuple of arrays Arrays with the coordinates of each data point. Should be in the following order: (easting, northing, vertical, ...). data : array or tuple of arrays the data values of each data point. If the data has more than one component, *data* must be a tuple of arrays (one for each component). weights : none or array or tuple of arrays if not none, then the weights assigned to each data point. If more than one data component is provided, you must provide a weights array for each data component (if not none). cv : None or cross-validation generator Any scikit-learn or Verde cross-validation generator. Defaults to :class:`sklearn.model_selection.KFold`. client : None or dask.distributed.Client **DEPRECATED:** This option is deprecated and will be removed in Verde v2.0.0. If None, then computations are run serially. Otherwise, should be a dask ``Client`` object. It will be used to dispatch computations to the dask cluster. delayed : bool If True, will use :func:`dask.delayed` to dispatch computations without actually executing them. The returned scores will be a list of delayed objects. Call `.compute()` on each score or :func:`dask.compute` on the entire list to trigger the actual computations. scoring : None, str, or callable A scoring function (or name of a function) known to scikit-learn. See the description of *scoring* in :func:`sklearn.model_selection.cross_val_score` for details. If None, will fall back to the estimator's ``.score`` method. Returns ------- scores : array Array of scores for each split of the cross-validation generator. If *delayed*, will be a list of Dask delayed objects (see the *delayed* option). If *client* is not None, then the scores will be futures. See also -------- train_test_split : Split a dataset into a training and a testing set. BlockShuffleSplit : Random permutation of spatial blocks cross-validator. Examples -------- As an example, we can score :class:`verde.Trend` on data that actually follows a linear trend. >>> from verde import grid_coordinates, Trend >>> coords = grid_coordinates((0, 10, -10, -5), spacing=0.1) >>> data = 10 - coords[0] + 0.5*coords[1] >>> model = Trend(degree=1) In this case, the model should perfectly predict the data and R² scores should be equal to 1. >>> scores = cross_val_score(model, coords, data) >>> print(', '.join(['{:.2f}'.format(score) for score in scores])) 1.00, 1.00, 1.00, 1.00, 1.00 There are 5 scores because the default cross-validator is :class:`sklearn.model_selection.KFold` with ``n_splits=5``. To calculate the score with a different metric, use the *scoring* argument: >>> scores = cross_val_score( ... model, coords, data, scoring="neg_mean_squared_error", ... ) >>> print(', '.join(['{:.2f}'.format(-score) for score in scores])) 0.00, 0.00, 0.00, 0.00, 0.00 In this case, we calculated the (negative) mean squared error (MSE) which measures the distance between test data and predictions. This way, 0 is the best possible value meaning that the data and prediction are the same. The "neg" part indicates that this is the negative mean square error. This is required because scikit-learn assumes that higher scores are always treated as better (which is the opposite for MSE). For display, we take the negative of the score to get the actual MSE. We can use different cross-validators by assigning them to the ``cv`` argument: >>> from sklearn.model_selection import ShuffleSplit >>> # Set the random state to get reproducible results >>> cross_validator = ShuffleSplit(n_splits=3, random_state=0) >>> scores = cross_val_score(model, coords, data, cv=cross_validator) >>> print(', '.join(['{:.2f}'.format(score) for score in scores])) 1.00, 1.00, 1.00 Often, spatial data are autocorrelated (points that are close together are more likely to have similar values), which can cause cross-validation with random splits to overestimate the prediction accuracy [Roberts_etal2017]_. To account for the autocorrelation, we can split the data in blocks rather than randomly with :class:`verde.BlockShuffleSplit`: >>> from verde import BlockShuffleSplit >>> # spacing controls the size of the spatial blocks >>> cross_validator = BlockShuffleSplit( ... spacing=2, n_splits=3, random_state=0 ... ) >>> scores = cross_val_score(model, coords, data, cv=cross_validator) >>> print(', '.join(['{:.2f}'.format(score) for score in scores])) 1.00, 1.00, 1.00 We didn't see a difference here since our model and data are perfect. See :ref:`model_evaluation` for an example with real data. If using many splits, we can speed up computations by running them in parallel with Dask: >>> cross_validator = ShuffleSplit(n_splits=10, random_state=0) >>> scores_delayed = cross_val_score( ... model, coords, data, cv=cross_validator, delayed=True ... ) >>> # The scores are delayed objects. >>> # To actually run the computations, call dask.compute >>> import dask >>> scores = dask.compute(*scores_delayed) >>> print(', '.join(['{:.2f}'.format(score) for score in scores])) 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00 Note that you must have enough RAM to fit multiple models simultaneously. So this is best used when fitting several smaller models. """ if client is not None: warnings.warn( "The 'client' parameter of 'verde.cross_val_score' is deprecated " "and will be removed in Verde 2.0.0. " "Use the 'delayed' parameter instead.", FutureWarning, ) coordinates, data, weights = check_fit_input( coordinates, data, weights, unpack=False ) if cv is None: cv = KFold(shuffle=True, random_state=0, n_splits=5) feature_matrix = np.transpose(n_1d_arrays(coordinates, 2)) fit_args = (coordinates, data, weights) scores = [] for train_index, test_index in cv.split(feature_matrix): # Clone the estimator to avoid fitting the same object simultaneously # when delayed=True. score = dispatch(fit_score, client=client, delayed=delayed)( clone(estimator), tuple(select(i, train_index) for i in fit_args), tuple(select(i, test_index) for i in fit_args), scoring, ) scores.append(score) if not delayed and client is None: scores = np.asarray(scores) return scores
def fit_score(estimator, train_data, test_data, scoring): """ Fit an estimator on the training data and then score it on the testing data """ estimator.fit(*train_data) if scoring is None: score = estimator.score(*test_data) else: score = score_estimator(scoring, estimator, *test_data) return score def select(arrays, index): """ Index each array in a tuple of arrays. If the arrays tuple contains a ``None``, the entire tuple will be returned as is. Parameters ---------- arrays : tuple of arrays The arrays to index index : array An array of indices to select from arrays. Returns ------- indexed_arrays : tuple of arrays Examples -------- >>> import numpy as np >>> select((np.arange(5), np.arange(-3, 2, 1)), [1, 3]) (array([1, 3]), array([-2, 0])) >>> select((None, None, None, None), [1, 2]) (None, None, None, None) """ if arrays is None or any(i is None for i in arrays): return arrays return tuple(i.ravel()[index] for i in arrays)