# Copyright (c) 2017 The Verde Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Utility functions for building gridders and checking arguments.
"""
import numpy as np
from sklearn.metrics import check_scoring
def score_estimator(scoring, estimator, coordinates, data, weights=None):
"""
Score the given gridder against the given data using the given metric.
If the data and predictions have more than 1 component, the scores of each
component will be averaged.
Parameters
----------
scoring : str or callable
A scoring specification known to scikit-learn. See
:func:`sklearn.metrics.check_scoring`.
estimator : a Verde gridder
The gridder to score. Usually derived from
:class:`verde.base.BaseGridder`.
coordinates : tuple of arrays
Arrays with the coordinates of each data point. Should be in the
following order: (easting, northing, vertical, ...).
For the specific definition of coordinate systems and what these
names mean, see the class docstring.
data : array or tuple of arrays
The data values of each data point. If the data has more than one
component, *data* must be a tuple of arrays (one for each
component).
weights : None or array or tuple of arrays
If not None, then the weights assigned to each data point. If more
than one data component is provided, you must provide a weights
array for each data component (if not None).
Returns
-------
score : float
The score.
"""
coordinates, data, weights = check_fit_input(
coordinates, data, weights, unpack=False
)
predicted = check_data(estimator.predict(coordinates))
scorer = check_scoring(DummyEstimator, scoring=scoring)
result = np.mean(
[
scorer(
DummyEstimator(np.ravel(pred)),
coordinates,
np.ravel(data[i]),
sample_weight=weights[i],
)
for i, pred in enumerate(predicted)
]
)
return result
class DummyEstimator:
"""
Dummy estimator that does nothing but pass along the predicted data.
Used to fool the scikit-learn scorer functions to fit our API
(multi-component estimators return a tuple on .predict).
>>> est = DummyEstimator([1, 2, 3])
>>> print(est.fit().predict())
[1, 2, 3]
"""
def __init__(self, predicted):
self._predicted = predicted
def predict(self, *args, **kwargs): # noqa: U100
"Return the stored predicted values"
return self._predicted
def fit(self, *args, **kwards): # noqa: U100
"Does nothing. Just here to satisfy the API."
return self
def check_data(data):
"""
Check the *data* argument and make sure it's a tuple.
If the data is a single array, return it as a tuple with a single element.
This is the default format accepted and used by all gridders and processing
functions.
Examples
--------
>>> check_data([1, 2, 3])
([1, 2, 3],)
>>> check_data(([1, 2], [3, 4]))
([1, 2], [3, 4])
"""
if not isinstance(data, tuple):
data = (data,)
return data
def check_data_names(data, data_names):
"""
Check *data_names* against *data*.
Also, convert ``data_names`` to a tuple if it's a single string.
Examples
--------
>>> import numpy as np
>>> east, north, scalar = [np.array(10)]*3
>>> check_data_names((scalar,), "dummy")
('dummy',)
>>> check_data_names((scalar,), ("dummy",))
('dummy',)
>>> check_data_names((scalar,), ["dummy"])
['dummy']
>>> check_data_names((east, north), ("component_x", "component_y"))
('component_x', 'component_y')
"""
# Convert single string to tuple
if isinstance(data_names, str):
data_names = (data_names,)
# Raise error if data_names is None
if data_names is None:
raise ValueError("Invalid data_names equal to None.")
# Raise error if data and data_names don't have the same number of elements
if len(data) != len(data_names):
raise ValueError(
"Data has {} components but only {} names provided: {}".format(
len(data), len(data_names), str(data_names)
)
)
return data_names
def check_coordinates(coordinates):
"""
Check that the given coordinate arrays are what we expect them to be.
Should be a tuple with arrays of the same shape.
"""
shapes = [coord.shape for coord in coordinates]
if not all(shape == shapes[0] for shape in shapes):
raise ValueError(
"Coordinate arrays must have the same shape. Coordinate shapes: {}".format(
shapes
)
)
return coordinates
def check_extra_coords_names(coordinates, extra_coords_names):
"""
Check extra_coords_names against coordinates.
Also, convert ``extra_coords_names`` to a tuple if it's a single string.
Assume that there are extra coordinates on the ``coordinates`` tuple.
Examples
--------
>>> import numpy as np
>>> coordinates = [np.array(10)]*3
>>> check_extra_coords_names(coordinates, "upward")
('upward',)
>>> check_extra_coords_names(coordinates, ("upward",))
('upward',)
>>> coordinates = [np.array(10)]*4
>>> check_extra_coords_names(coordinates, ("upward", "time"))
('upward', 'time')
"""
# Convert single string to a tuple
if isinstance(extra_coords_names, str):
extra_coords_names = (extra_coords_names,)
# Check if it's not None
if extra_coords_names is None:
raise ValueError(
"Invalid extra_coords_names equal to None. "
+ "When passing one or more extra coordinate, "
+ "extra_coords_names cannot be None."
)
# Check if there are the same number of extra_coords than extra_coords_name
if len(coordinates[2:]) != len(extra_coords_names):
raise ValueError(
"Invalid extra_coords_names '{}'. ".format(extra_coords_names)
+ "Number of extra coordinates names must match the number of "
+ "additional coordinates ('{}').".format(len(coordinates[2:]))
)
return extra_coords_names
[docs]
def n_1d_arrays(arrays, n):
"""
Get the first n elements from a tuple/list, convert to arrays, and ravel.
Use this function to make sure that coordinate and data arrays are ready
for building Jacobian matrices and least-squares fitting.
Parameters
----------
arrays : tuple of arrays
The arrays. Can be lists or anything that can be converted to a numpy
array (including numpy arrays).
n : int
How many arrays to return.
Returns
-------
1darrays : tuple of arrays
The converted 1D numpy arrays.
Examples
--------
>>> import numpy as np
>>> arrays = [np.arange(4).reshape(2, 2)]*3
>>> n_1d_arrays(arrays, n=2)
(array([0, 1, 2, 3]), array([0, 1, 2, 3]))
"""
return tuple(np.ravel(np.atleast_1d(i)) for i in arrays[:n])