# Source code for verde.blockreduce

"""
Classes for reducing/aggregating data in blocks.
"""
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator

from .coordinates import block_split
from .base import check_fit_input
from .utils import variance_to_weights


def attach_weights(reduction, weights):
    """
    Create a partial application of reduction with the proper weights attached.

    Makes a function that calls *reduction* and gives it the weights
    corresponding to the index of the particular values it receives. Meant to
    be used in a groupby aggregation of a pandas.DataFrame. See class
    BlockReduce.

    Parameters
    ----------
    reduction : function
        A reduction function that accepts a ``weights`` keyword argument
        (e.g., ``np.average``).
    weights : array or pandas.Series
        The weights of all data points. Multi-dimensional numpy arrays are
        flattened (C order) so that they can be selected with the index of
        the values passed to the returned function. Any other object is used
        as is and indexed with ``weights[values.index]``.

    Returns
    -------
    weighted_reduction : function
        A function of a single argument *values* (an object with an ``index``
        attribute, like a pandas.Series) that returns
        ``reduction(values, weights=weights[values.index])``.

    """
    if isinstance(weights, np.ndarray) and weights.ndim > 1:
        # The values are selected out of a flat (1D) column, so their index
        # refers to flat positions. Indexing an N-dimensional array with it
        # would pick whole slices along the first axis and the reduction would
        # receive weights that don't match the shape of the values.
        weights = weights.ravel()

    def weighted_reduction(values):
        "Weighted reduction using the weights stored from the outer scope."
        return reduction(values, weights=weights[values.index])

    return weighted_reduction


class BlockReduce(BaseEstimator):
    """
    Apply a reduction/aggregation operation to the data in blocks/windows.

    Returns the reduced data value for each block along with the associated
    coordinates, which can be determined through the same reduction applied to
    the coordinates or as the center of each block.

    If a data region to be divided into blocks is not given, it will be the
    bounding region of the data. When using this class to decimate data before
    gridding, it's best to use the same region and spacing as the desired grid.

    If the given region is not divisible by the spacing (block size), either
    the region or the spacing will have to be adjusted. By default, the spacing
    will be rounded to the nearest multiple. Optionally, the East and North
    boundaries of the region can be adjusted to fit the exact spacing given.

    Blocks without any data are omitted from the output.

    Implements the :meth:`~verde.BlockReduce.filter` method so it can be used
    with :class:`verde.Chain`. Only acts during data fitting and is ignored
    during prediction.

    Parameters
    ----------
    reduction : function
        A reduction function that takes an array and returns a single value
        (e.g., ``np.mean``, ``np.median``, etc).
    spacing : float, tuple = (s_north, s_east), or None
        The block size in the South-North and West-East directions,
        respectively. A single value means that the size is equal in both
        directions.
    region : list = [W, E, S, N]
        The boundaries of a given region in Cartesian or geographic
        coordinates.
    adjust : {'spacing', 'region'}
        Whether to adjust the spacing or the region if required. Defaults to
        adjusting the spacing.
    center_coordinates : bool
        If True, then the returned coordinates correspond to the center of each
        block. Otherwise, the coordinates are calculated by applying the same
        reduction operation to the input coordinates.

    See also
    --------
    block_split : Split a region into blocks and label points accordingly.
    BlockMean : Apply the mean in blocks. Will output weights.
    verde.Chain : Apply filter operations successively on data.

    """

    def __init__(
        self,
        reduction,
        spacing,
        region=None,
        adjust="spacing",
        center_coordinates=False,
    ):
        self.reduction = reduction
        self.spacing = spacing
        self.region = region
        self.adjust = adjust
        self.center_coordinates = center_coordinates

    def filter(self, coordinates, data, weights=None):
        """
        Apply the blocked aggregation to the given data.

        Returns the reduced data value for each block along with the associated
        coordinates, which can be determined through the same reduction applied
        to the coordinates or as the center of each block.

        If weights are given, the reduction function must accept a ``weights``
        keyword argument. The weights are passed in to the reduction but we
        have no generic way of aggregating the weights or reporting
        uncertainties. For that, look to the specialized classes like
        :class:`verde.BlockMean`.

        Parameters
        ----------
        coordinates : tuple of arrays
            Arrays with the coordinates of each data point. Should be in the
            following order: (easting, northing, vertical, ...). Only easting
            and northing will be used, all subsequent coordinates will be
            ignored.
        data : array or tuple of arrays
            The data values at each point. If you want to reduce more than one
            data component, pass in multiple arrays as elements of a tuple. All
            arrays must have the same shape.
        weights : None or array or tuple of arrays
            If not None, then the weights assigned to each data point. If more
            than one data component is provided, you must provide a weights
            array for each data component (if not None).

        Returns
        -------
        blocked_coordinates : tuple of arrays
            (easting, northing) arrays with the coordinates of each block that
            contains data.
        blocked_data : array or tuple of arrays
            The block reduced data values. A single array if only one data
            component was given, otherwise a tuple with one array per
            component.

        """
        coordinates, data, weights = check_fit_input(
            coordinates, data, weights, unpack=False
        )
        blocks, labels = block_split(
            coordinates, self.spacing, self.adjust, self.region
        )
        if any(w is None for w in weights):
            # Can't do a weighted reduction if any of the weights are missing
            reduction = self.reduction
        else:
            # The data columns of the table are flattened, so the weights must
            # be flattened as well. Otherwise, the row index of each group
            # would not select the matching weights of N-dimensional inputs.
            reduction = {
                "data{}".format(i): attach_weights(self.reduction, np.ravel(w))
                for i, w in enumerate(weights)
            }
        columns = {"data{}".format(i): comp.ravel() for i, comp in enumerate(data)}
        columns["block"] = labels
        blocked = pd.DataFrame(columns).groupby("block").aggregate(reduction)
        blocked_data = tuple(
            blocked["data{}".format(i)].values.ravel() for i in range(len(data))
        )
        blocked_coords = self._block_coordinates(coordinates, blocks, labels)
        if len(blocked_data) == 1:
            return blocked_coords, blocked_data[0]
        return blocked_coords, blocked_data

    def _block_coordinates(self, coordinates, block_coordinates, labels):
        """
        Calculate a coordinate assigned to each block.

        If self.center_coordinates, the coordinates will be the center of each
        block. Otherwise, will apply the reduction to the coordinates.

        Blocks without any data will be omitted.

        *block_coordinates* and *labels* should be the outputs of
        :func:`verde.block_split`.

        Parameters
        ----------
        coordinates : tuple of arrays
            Arrays with the coordinates of each data point. Should be in the
            following order: (easting, northing, vertical, ...). Only easting
            and northing will be used, all subsequent coordinates will be
            ignored.
        block_coordinates : tuple of arrays
            (easting, northing) arrays with the coordinates of the center of
            each block.
        labels : array
            integer label for each data point. The label is the index of the
            block to which that point belongs.

        Returns
        -------
        coordinates : tuple of arrays
            (easting, northing) arrays with the coordinates assigned to each
            non-empty block.

        """
        if self.center_coordinates:
            # Only the labels that actually appear correspond to blocks that
            # contain data
            unique = np.unique(labels)
            return tuple(i[unique] for i in block_coordinates)
        # Doing the coordinates separately from the data because in case of
        # weights the reduction applied to them is different (no weights
        # ever)
        easting, northing = coordinates[:2]
        table = pd.DataFrame(
            dict(easting=easting.ravel(), northing=northing.ravel(), block=labels)
        )
        grouped = table.groupby("block").aggregate(self.reduction)
        return grouped.easting.values, grouped.northing.values


class BlockMean(BlockReduce):
    """
    Apply a (weighted) mean to the data in blocks/windows.

    Returns the mean data value for each block along with the associated
    coordinates and weights. Coordinates can be determined through the mean of
    the data coordinates or as the center of each block. Weights can be
    calculated in three ways:

    1. Using the variance of the data: ``weights=1/variance``. This is the
       only possible option when no input weights are provided.
    2. Using the uncertainty of the weighted mean propagated from the
       uncertainties in the data: ``weights=1/uncertainty**2``. In this case,
       we assume that the input weights are also ``1/uncertainty**2``. **Do not
       normalize or scale the weights if using uncertainty propagation**.
    3. Using the weighted variance of the data: ``1/weighted_variance``. In
       this case, we make no assumptions about the nature of the weights.

    For all three options, the output weights are scaled to the range [0, 1].

    This class always outputs weights. If you want to calculate a blocked
    mean and not output any weights, use :class:`verde.BlockReduce` with
    ``numpy.average`` instead.

    Using the propagated uncertainties may be more adequate if your data is
    smooth in each block but have very different uncertainties. The propagation
    will preserve a low weight for data that have large uncertainties but don't
    vary much inside the block.

    The weighted variance should be used when the data vary a lot in each block
    but have very similar uncertainties. This is also the best choice if your
    input weights aren't ``1/uncertainty**2`` but are a relative importance of
    the data instead.

    If a data region to be divided into blocks is not given, it will be the
    bounding region of the data. When using this class to decimate data before
    gridding, it's best to use the same region and spacing as the desired grid.

    If the given region is not divisible by the spacing (block size), either
    the region or the spacing will have to be adjusted. By default, the spacing
    will be rounded to the nearest multiple. Optionally, the East and North
    boundaries of the region can be adjusted to fit the exact spacing given.

    Blocks without any data are omitted from the output.

    Implements the :meth:`~verde.BlockMean.filter` method so it can be used
    with :class:`verde.Chain`. Only acts during data fitting and is ignored
    during prediction.

    Parameters
    ----------
    spacing : float, tuple = (s_north, s_east), or None
        The block size in the South-North and West-East directions,
        respectively. A single value means that the size is equal in both
        directions.
    region : list = [W, E, S, N]
        The boundaries of a given region in Cartesian or geographic
        coordinates.
    adjust : {'spacing', 'region'}
        Whether to adjust the spacing or the region if required. Defaults to
        adjusting the spacing.
    center_coordinates : bool
        If True, then the returned coordinates correspond to the center of each
        block. Otherwise, the coordinates are calculated by applying the same
        reduction operation (the mean) to the input coordinates.
    uncertainty : bool
        If True, the blocked weights will be calculated by uncertainty
        propagation of the data uncertainties. If this is the case, then the
        input weights **must be** ``1/uncertainty**2``. **Do not normalize the
        input weights**. If False, then the blocked weights will be calculated
        as ``1/variance`` and no assumptions are made of the input weights (so
        they can be normalized).

    See also
    --------
    block_split : Split a region into blocks and label points accordingly.
    BlockReduce : Apply a generic reduction in blocks. Doesn't output weights.
    verde.Chain : Apply filter operations successively on data.

    """

    def __init__(
        self,
        spacing,
        region=None,
        adjust="spacing",
        center_coordinates=False,
        uncertainty=False,
    ):
        # The reduction is always numpy.average because it accepts weights
        super().__init__(np.average, spacing, region, adjust, center_coordinates)
        self.uncertainty = uncertainty

    def filter(self, coordinates, data, weights=None):
        """
        Apply the blocked mean to the given data.

        Returns the reduced data value for each block along with the associated
        coordinates and weights. See the class docstring for details.

        Parameters
        ----------
        coordinates : tuple of arrays
            Arrays with the coordinates of each data point. Should be in the
            following order: (easting, northing, vertical, ...). Only easting
            and northing will be used, all subsequent coordinates will be
            ignored.
        data : array or tuple of arrays
            The data values at each point. If you want to reduce more than one
            data component, pass in multiple arrays as elements of a tuple. All
            arrays must have the same shape.
        weights : None or array or tuple of arrays
            If not None, then the weights assigned to each data point. If more
            than one data component is provided, you must provide a weights
            array for each data component (if not None). If calculating the
            output weights through uncertainty propagation, then *weights*
            **must be** ``1/uncertainty**2``.

        Returns
        -------
        blocked_coordinates : tuple of arrays
            (easting, northing) arrays with the coordinates of each block that
            contains data.
        blocked_mean : array or tuple of arrays
            The block averaged data values.
        blocked_weights : array or tuple of arrays
            The weights calculated for the blocked data values.

        Raises
        ------
        ValueError
            If ``uncertainty=True`` and any of the weights is None.

        """
        coordinates, data, weights = check_fit_input(
            coordinates, data, weights, unpack=False
        )
        missing_weights = any(w is None for w in weights)
        if missing_weights and self.uncertainty:
            raise ValueError(
                "Weights are required for uncertainty propagation. "
                "Either provide weights (as 1/uncertainty**2) or use "
                "'uncertainty=False' to produce variance weights instead."
            )
        blocks, labels = block_split(
            coordinates, self.spacing, self.adjust, self.region
        )
        ncomps = len(data)
        columns = {"data{}".format(i): comp.ravel() for i, comp in enumerate(data)}
        columns["block"] = labels
        if missing_weights:
            mean, variance = self._blocked_mean_variance(pd.DataFrame(columns), ncomps)
        else:
            # The weights go in the table alongside the data so that they get
            # split into the same groups
            columns.update(
                {"weight{}".format(i): comp.ravel() for i, comp in enumerate(weights)}
            )
            table = pd.DataFrame(columns)
            if self.uncertainty:
                mean, variance = self._blocked_mean_uncertainty(table, ncomps)
            else:
                mean, variance = self._blocked_mean_variance_weighted(table, ncomps)
        blocked_data = tuple(comp.values.ravel() for comp in mean)
        blocked_weights = tuple(
            variance_to_weights(var.values.ravel()) for var in variance
        )
        blocked_coords = self._block_coordinates(coordinates, blocks, labels)
        if ncomps == 1:
            return blocked_coords, blocked_data[0], blocked_weights[0]
        return blocked_coords, blocked_data, blocked_weights

    def _blocked_mean_uncertainty(self, table, ncomps):
        """
        Calculate the blocked weighted mean and propagate the uncertainty from
        the points to the weighted mean. Assumes that the weights are
        1/uncertainty**2. The propagated uncertainty of the weighted mean
        squared is 1/(sum(1/uncertainty**2)) = 1/sum(weights).

        Parameters
        ----------
        table : pandas.DataFrame
            Table with a ``block`` column and columns ``data{i}`` and
            ``weight{i}`` for each of the *ncomps* data components.
        ncomps : int
            The number of data components.

        Returns
        -------
        mean, variance : lists
            The columns of the aggregated table with the weighted mean and the
            propagated uncertainty squared of each data component.

        """
        reduction = {
            "data{}".format(i): attach_weights(
                self.reduction, table["weight{}".format(i)]
            )
            for i in range(ncomps)
        }
        # The reduction of the weights will turn them into the propagated
        # uncertainty squared.
        reduction.update(
            {"weight{}".format(i): lambda x: 1 / x.sum() for i in range(ncomps)}
        )
        blocked = table.groupby("block").aggregate(reduction)
        variance = [blocked["weight{}".format(i)] for i in range(ncomps)]
        mean = [blocked["data{}".format(i)] for i in range(ncomps)]
        return mean, variance

    def _blocked_mean_variance(self, table, ncomps):
        """
        Calculate the blocked mean and variance without weights.
        The variance will be the unweighted variance of the blocks.

        Parameters
        ----------
        table : pandas.DataFrame
            Table with a ``block`` column and a ``data{i}`` column for each of
            the *ncomps* data components.
        ncomps : int
            The number of data components.

        Returns
        -------
        mean, variance : lists
            The columns of the aggregated table with the mean and the variance
            of each data component.

        """
        # NOTE(review): pandas may substitute its own implementation for
        # np.var in a groupby aggregation, which could use a different default
        # for the degrees of freedom -- confirm against the pandas version in
        # use.
        reduction = {
            "data{}".format(i): (("mean", self.reduction), ("variance", np.var))
            for i in range(ncomps)
        }
        blocked = table.groupby("block").aggregate(reduction)
        # The aggregated table has 2-level columns: (component, statistic)
        mean = [blocked["data{}".format(i), "mean"] for i in range(ncomps)]
        variance = [blocked["data{}".format(i), "variance"] for i in range(ncomps)]
        return mean, variance

    def _blocked_mean_variance_weighted(self, table, ncomps):
        """
        Calculate the blocked weighted mean and the weighted variance.

        Parameters
        ----------
        table : pandas.DataFrame
            Table with a ``block`` column and columns ``data{i}`` and
            ``weight{i}`` for each of the *ncomps* data components.
        ncomps : int
            The number of data components.

        Returns
        -------
        mean, variance : lists
            The columns of the blocked table with the weighted mean and the
            weighted variance of each data component.

        """
        # Need to make a function that takes a group (a pandas.DataFrame) and
        # calculates the weighted average and weighted variance. Can't use
        # reduce because the weighted variance requires the average, so it
        # would be calculated twice. This way, we can calculate the average
        # only once and return both values.
        columns = ["mean{}".format(i) for i in range(ncomps)]
        columns.extend("variance{}".format(i) for i in range(ncomps))

        def weighted_average_variance(group):
            """
            Calculate the weighted average and variance of a group.
            Returns a DataFrame with columns for the averages and variances.
            """
            # First half of the array holds the means, second the variances
            data = np.empty(ncomps * 2)
            for i in range(ncomps):
                weights = group["weight{}".format(i)]
                values = group["data{}".format(i)]
                data[i] = self.reduction(values, weights=weights)
                # The weighted variance is the weighted mean of the squared
                # deviations from the weighted mean
                data[i + ncomps] = self.reduction(
                    (values - data[i]) ** 2, weights=weights
                )
            return pd.DataFrame(
                data.reshape((1, data.size)), index=[0], columns=columns
            )

        blocked = table.groupby("block").apply(weighted_average_variance)
        mean = [blocked[i] for i in columns[:ncomps]]
        variance = [blocked[i] for i in columns[ncomps:]]
        return mean, variance