Content - a4f0f4e5611f08de57d34faf300cf666a66524f2 - 2f7851c/multihist.py

visit type:
Tip revision: 6c4f786bc95f0e73ffec228e17c957744e0d9594 authored by Jelle Aalbers on 27 January 2022, 03:57:40 UTC
Add pypi deploy Github Actions workflow
Tip revision: 6c4f786
multihist.py
from __future__ import division
from copy import deepcopy
from functools import reduce
try:
    from itertools import izip as zip
except ImportError:
    # Hello, python 3!
    pass

import numpy as np

try:
    from scipy.ndimage import zoom
    from scipy import stats
    HAVE_SCIPY = True
except ImportError:
    HAVE_SCIPY = False

try:
    import matplotlib
    import matplotlib.pyplot as plt
    CAN_PLOT = True
except ImportError:
    plt = None
    CAN_PLOT = False

COLUMNAR_DATA_SOURCES = []

try:
    import dask
    import dask.dataframe
    import dask.multiprocessing
    WE_HAVE_DASK = True
    DEFAULT_DASK_COMPUTE_KWARGS = dict(get=dask.multiprocessing.get)
    COLUMNAR_DATA_SOURCES.append(dask.dataframe.DataFrame)
except Exception:           # Sometimes dask import succeeds, but throws error when starting up
    WE_HAVE_DASK = False
    pass

try:
    import pandas as pd
    COLUMNAR_DATA_SOURCES.append(pd.DataFrame)
except ImportError:
    pass

COLUMNAR_DATA_SOURCES = tuple(COLUMNAR_DATA_SOURCES)

from operator import itemgetter

__version__ = '0.6.5'


class CoordinateOutOfRangeException(Exception):
    pass


class MultiHistBase(object):

    def similar_blank_hist(self):
        newhist = deepcopy(self)
        newhist.histogram = np.zeros_like(self.histogram)
        return newhist

    @property
    def n(self):
        """Returns number of data points loaded into histogram"""
        return np.sum(self.histogram)

    # Overload binary numeric operators to work on histogram
    # TODO: logical operators

    def __getitem__(self, item):
        return self.histogram[item]

    def __setitem__(self, key, value):
        self.histogram[key] = value

    # Let unary operators work on wrapped histogram:
    def min(self):
        return self.histogram.min()

    def max(self):
        return self.histogram.max()

    def __len__(self):
        return len(self.histogram)

    def __neg__(self):
        return self.__class__.from_histogram(-self.histogram, self.bin_edges, self.axis_names)

    def __pos__(self):
        return self.__class__.from_histogram(+self.histogram, self.bin_edges, self.axis_names)

    def __abs__(self):
        return self.__class__.from_histogram(abs(self.histogram), self.bin_edges, self.axis_names)

    def __invert__(self):
        return self.__class__.from_histogram(~self.histogram, self.bin_edges, self.axis_names)

    # Let binary operators work on wrapped histogram

    @classmethod
    def _make_binop(cls, opname):
        def binop(self, other):
            return self.__class__.from_histogram(
                getattr(self.histogram, opname)(other),
                self.bin_edges,
                self.axis_names)
        return binop

for methodname in 'add sub mul div truediv floordiv mod divmod pow lshift rshift and or'.split():
    dundername = '__%s__' % methodname
    setattr(MultiHistBase,
            dundername,
            MultiHistBase._make_binop(dundername))
    setattr(MultiHistBase,
            '__r%s__' % methodname,
            getattr(MultiHistBase, dundername))

# Verbose alias
MultiHistBase.similar_blank_histogram = MultiHistBase.similar_blank_hist


class Hist1d(MultiHistBase):
    axis_names = None
    dimensions = 1

    @classmethod
    def from_histogram(cls, histogram, bin_edges, axis_names=None):
        """Make a Hist1D from a numpy bin_edges + histogram pair
        :param histogram: Initial histogram
        :param bin_edges: Bin edges of histogram. Must be one longer than length of histogram
        :param axis_names: Ignored. Sorry :-)
        :return:
        """
        if len(bin_edges) != len(histogram) + 1:
            raise ValueError("Bin edges must be of length %d, you gave %d!" % (len(histogram) + 1, len(bin_edges)))
        self = cls(bins=bin_edges)
        self.histogram = np.array(histogram)
        return self

    def __init__(self, data=None, bins=10, range=None, weights=None):
        """
        :param data: Initial data to histogram.
        :param bins: Number of bins, or list of bin edges (like np.histogram)
        :param weights: Weights for initial data.
        :param range: Range of histogram.
        :return: None
        """
        if data is None:
            data = []
        self.histogram, self.bin_edges = np.histogram(data, bins=bins, range=range, weights=weights)

    def add(self, data, weights=None):
        hist, _ = np.histogram(data, self.bin_edges, weights=weights)
        self.histogram += hist

    @property
    def bin_centers(self):
        return 0.5 * (self.bin_edges[1:] + self.bin_edges[:-1])

    def bin_volumes(self):
        return np.diff(self.bin_edges)

    @property
    def density(self):
        """Gives emprical PDF, like np.histogram(...., density=True)"""
        h = self.histogram.astype(float)
        bindifs = np.array(np.diff(self.bin_edges), float)
        return h / (bindifs * self.n)

    @property
    def normalized_histogram(self):
        """Gives histogram with sum of entries normalized to 1."""
        return self.histogram / self.n

    @property
    def cumulative_histogram(self):
        return np.cumsum(self.histogram)

    @property
    def cumulative_density(self):
        cs = np.cumsum(self.histogram)
        return cs / cs[-1]

    def get_random(self, size=10):
        """Returns random variates from the histogram.
        Note this assumes the histogram is an 'events per bin', not a pdf.
        Inside the bins, a uniform distribution is assumed.
        """
        bin_i = np.random.choice(np.arange(len(self.bin_centers)), size=size, p=self.normalized_histogram)
        return self.bin_centers[bin_i] + np.random.uniform(-0.5, 0.5, size=size) * self.bin_volumes()[bin_i]

    def items(self):
        """Iterate over (bin_center, hist_value) from left to right"""
        return zip(self.bin_centers, self.histogram)

    @property
    def mean(self):
        """Estimates mean of underlying data, assuming each datapoint was exactly in the center of its bin."""
        return np.average(self.bin_centers, weights=self.histogram)

    @property
    def std(self, bessel_correction=True):
        """Estimates std of underlying data, assuming each datapoint was exactly in the center of its bin."""
        if bessel_correction:
            n = self.n
            bc = n / (n - 1)
        else:
            bc = 1
        return np.sqrt(np.average((self.bin_centers - self.mean) ** 2, weights=self.histogram)) * bc

    def data_for_plot(
        self,
        normed=False, scale_histogram_by=1.0, scale_errors_by=1.0,
        errors=False, error_style='bar'):
        """Return (x, y, ylow, yhigh) of data used for plotting.

        Specifically, returns:
         x: x-values of lines/points; bin centers or edges, depending on error_style
         y: y-values of lines/points, same length as x
         ylow: lower bound of error bars/bands, same length as x
         yhigh: upper bound of error bars/bands, same length as x

        Arguments are as described in plot
        """
        y = self.histogram
        if normed:
            scale_histogram_by /= y.sum()
        if errors == 'sqrtn':
            _yerr = y**0.5
            ylow, yhigh = y - _yerr, y + _yerr
        elif errors == 'central':
            ylow, yhigh = poisson_1s_interval(y, fc=False)
        elif errors == 'model_quantiles':
            ylow, yhigh = stats.poisson(y).ppf(stats.norm.cdf(-1)), stats.poisson(y).ppf(stats.norm.cdf(1))
            # Show some error band <0.2 expected events
            yhigh = yhigh.clip(y, None)
        elif errors:
            ylow, yhigh = poisson_1s_interval(y, fc=True)
        else:
            ylow, yhigh = y, y

        y = y.astype(float) * scale_histogram_by
        ylow = ylow.astype(float) * scale_histogram_by
        yhigh = yhigh.astype(float) * scale_histogram_by

        ylow = (y - (y - ylow) * scale_errors_by).clip(0, None)
        yhigh = y + (yhigh - y) * scale_errors_by

        if errors and error_style == 'bar':
            # Plotting (dots with) bars
            x = self.bin_centers

        else:
            # Plot some kind of stepped line: we will plot vs the bin *edges*
            # using steps-pre (not steps-mid)
            # If we would have plotted values only vs the centers
            #  * the steps won't be correct for log scales
            #  * the final bins will not fully show

            x = self.bin_edges
            y, ylow, yhigh = [_repeat_first(q) for q in [y, ylow, yhigh]]

        return x, y, ylow, yhigh

    def plot(self,
             normed=False, scale_histogram_by=1.0, scale_errors_by=1.0,
             errors=False, error_style='bar', error_alpha=0.3,
             plt=plt, set_xlim=False,
             **kwargs):
        """Plot the histogram, with error bars if desired.

        :param normed: Scale the histogram so the sum is 1 before plotting.
            Errors are computed before scaling, then scaled accordingly.
        :param scale_histogram_by: Custom multiplier to apply to histogram.
            Errors are computed before scaling, then scaled accordingly.
        :param scale_errors_by: Custom multiplier to apply to errors.
            This scales the differences between the low/high errors and the
            mean. For low counts / asymmetric errors, this will look strange.
        :param errors: Whether and how to plot 1 sigma error bars
            * False shows no errors
            * True or 'fc' for Feldman-Cousin errors.
              For > 20 events, central Poisson intervals are used
            * 'central' for central Poisson confidence intervals
            * 'sqrtn' for sqrt(n) errors
            * 'model_quantiles' for central Poisson inverval assuming
               bin content is the Poisson mean (expectation), NOT a
               confidence interval
        :param errorstyle: How to plot errors (if errors is not False)
         * 'bar' for error bars
         * 'band' for shaded bands
        :param error_alpha: Alpha multiplier for errorstyle='band'
        :param plt: Object to call plt... on; matplotlib.pyplot by default.
        :param set_xlim: If True, set xlim to the range of the hist
        """
        if not CAN_PLOT:
            raise ValueError(
                "matplotlib did not import, so can't plot your histogram...")

        x, y, ylow, yhigh = self.data_for_plot(
            normed=normed,
            scale_histogram_by=scale_histogram_by,
            scale_errors_by=scale_errors_by,
            errors=errors,
            error_style=error_style)

        if errors and error_style == 'bar':
            # Plot as points with error bars
            kwargs.setdefault('linestyle', 'none')
            kwargs.setdefault('marker', '.')
            plt.errorbar(x,
                         y,
                         yerr=[y - ylow, yhigh - y],
                         **kwargs)

        else:

            if errors and error_style == 'band':
                line = plt.plot(x, y, drawstyle='steps-pre', **kwargs)
                kwargs['color'] = line[0].get_color()
                kwargs['alpha'] = error_alpha * kwargs.get('alpha', 1)
                kwargs['linewidth'] = 0
                if 'label' in kwargs:
                    # Don't want to double-label!
                    del kwargs['label']
                plt.fill_between(x, ylow, yhigh,
                                 step='pre', **kwargs)

            else:
                plt.plot(x, y, drawstyle='steps-pre', **kwargs)

        if set_xlim:
            plt.xlim(x[0], x[-1])

    def percentile(self, percentile):
        """Return bin center nearest to percentile"""
        return self.bin_centers[np.argmin(np.abs(self.cumulative_density * 100 - percentile))]

    def lookup(self, coordinates):
        """Lookup values at coordinates.
        coordinates: arraylike of coordinates.

        Clips if out of range!! TODO: option to throw exception instead.
        TODO: Needs tests!!
        """
        # Convert coordinates to indices
        index_array = np.clip(np.searchsorted(self.bin_edges, coordinates) - 1,
                              0,
                              len(self.bin_edges) - 2)

        # Use the index array to slice the histogram
        return self.histogram[index_array]


class Histdd(MultiHistBase):
    """multidimensional histogram object
    """
    axis_names = None

    @classmethod
    def from_histogram(cls, histogram, bin_edges, axis_names=None):
        """Make a HistdD from numpy histogram + bin edges
        :param histogram: Initial histogram
        :param bin_edges: x bin edges of histogram, y bin edges, ...
        :param axis_names: Names of axes
        :return: Histnd instance
        """
        self = cls(bins=bin_edges, axis_names=axis_names)
        self.histogram = histogram
        return self

    def __init__(self, *data, **kwargs):
        for k, v in {'bins': 10, 'range': None, 'weights': None, 'axis_names': None}.items():
            kwargs.setdefault(k, v)

        # dimensions is a shorthand [(axis_name_1, bins_1), (axis_name_2, bins_2), ...]
        if 'dimensions' in kwargs:
            kwargs['axis_names'], kwargs['bins'] = zip(*kwargs['dimensions'])

        if len(data) == 0:
            if kwargs['range'] is None:
                if kwargs['bins'] is None:
                    raise ValueError("Must specify data, bins, or range")
                try:
                    dimensions = len(kwargs['bins'])
                except TypeError:
                    raise ValueError("If you specify no data and no ranges, must specify a bin specification "
                                     "which tells me what dimension you want. E.g. [10, 10, 10] instead of 10.")
            else:
                dimensions = len(kwargs['range'])
            data = np.zeros((0, dimensions)).T

        self.axis_names = kwargs['axis_names']
        self.histogram, self.bin_edges = self._data_to_hist(data, **kwargs)

    def add(self, *data, **kwargs):
        self.histogram += self._data_to_hist(data, **kwargs)[0]

    @staticmethod
    def _is_columnar(x):
        if isinstance(x, COLUMNAR_DATA_SOURCES):
            return True
        if isinstance(x, np.ndarray) and x.dtype.fields:
            return True
        return False

    def _data_to_hist(self, data, **kwargs):
        """Return bin_edges, histogram array"""
        if hasattr(self, 'bin_edges'):
            kwargs.setdefault('bins', self.bin_edges)

        if len(data) == 1 and self._is_columnar(data[0]):
            data = data[0]

            if self.axis_names is None:
                raise ValueError("When histogramming from a columnar data source, "
                                 "axis_names or dimensions is mandatory")
            is_dask = False
            if WE_HAVE_DASK:
                is_dask = isinstance(data, dask.dataframe.DataFrame)

            if is_dask:
                fake_histogram = Histdd(axis_names=self.axis_names, bins=kwargs['bins'])

                partial_hists = []
                for partition in data.to_delayed():
                    ph = dask.delayed(Histdd)(partition, axis_names=self.axis_names, bins=kwargs['bins'])
                    ph = dask.delayed(lambda x: x.histogram)(ph)
                    ph = dask.array.from_delayed(ph,
                                                 shape=fake_histogram.histogram.shape,
                                                 dtype=fake_histogram.histogram.dtype)
                    partial_hists.append(ph)
                partial_hists = dask.array.stack(partial_hists, axis=0)

                compute_options = kwargs.get('compute_options', {})
                for k, v in DEFAULT_DASK_COMPUTE_KWARGS.items():
                    compute_options.setdefault(k, v)
                histogram = partial_hists.sum(axis=0).compute(**compute_options)

                bin_edges = fake_histogram.bin_edges
                return histogram, bin_edges

            else:
                data = np.vstack([data[x].values if isinstance(data, pd.DataFrame) else data[x]
                                  for x in self.axis_names])

        data = np.array(data).T
        return np.histogramdd(data,
                              bins=kwargs.get('bins'),
                              weights=kwargs.get('weights'),
                              range=kwargs.get('range'))

    @property
    def dimensions(self):
        return len(self.bin_edges)

    ##
    # Axis selection
    ##
    def get_axis_number(self, axis):
        if isinstance(axis, int):
            return axis
        if isinstance(axis, str):
            if self.axis_names is None:
                raise ValueError("Axis name %s not in histogram: histogram has no named axes." % axis)
            if axis in self.axis_names:
                return self.axis_names.index(axis)
            raise ValueError("Axis name %s not in histogram. Axis names which are: %s" % (axis, self.axis_names))
        raise ValueError("Argument to get_axis_number should be string or integer, but you gave %s" % axis)

    def other_axes(self, axis):
        axis = self.get_axis_number(axis)
        return tuple([i for i in range(self.dimensions) if i != axis])

    def axis_names_without(self, axis):
        """Return axis names without axis, or None if axis_names is None"""
        if self.axis_names is None:
            return None
        return itemgetter(*self.other_axes(axis))(self.axis_names)

    ##
    # Bin wrangling: centers <-> edges, values <-> indices
    ##
    def bin_centers(self, axis=None):
        """Return bin centers along an axis, or if axis=None, list of bin_centers along each axis"""
        if axis is None:
            return [self.bin_centers(axis=i) for i in range(self.dimensions)]
        axis = self.get_axis_number(axis)
        return 0.5 * (self.bin_edges[axis][1:] + self.bin_edges[axis][:-1])

    def get_axis_bin_index(self, value, axis):
        """Returns index along axis of bin in histogram which contains value
        Inclusive on both endpoints
        """
        axis = self.get_axis_number(axis)
        bin_edges = self.bin_edges[axis]
        # The right bin edge of np.histogram is inclusive:
        if value == bin_edges[-1]:
            # Minus two: one for bin edges rather than centers, one for 0-based indexing
            return len(bin_edges) - 2
        # For all other bins, it is exclusive.
        result = np.searchsorted(bin_edges, [value], side='right')[0] - 1
        if not 0 <= result <= len(bin_edges) - 1:
            raise CoordinateOutOfRangeException("Value %s is not in range (%s-%s) of axis %s" % (
                value, bin_edges[0], bin_edges[-1], axis))
        return result

    def get_bin_indices(self, values):
        """Returns index tuple in histogram of bin which contains value"""
        return tuple([self.get_axis_bin_index(values[ax_i], ax_i)
                      for ax_i in range(self.dimensions)])

    def all_axis_bin_centers(self, axis):
        """Return ndarray of same shape as histogram containing bin center value along axis at each point"""
        # Arcane hack that seems to work, at least in 3d... hope
        axis = self.get_axis_number(axis)
        return np.meshgrid(*self.bin_centers(), indexing='ij')[axis]

    ##
    # Data reduction: sum, slice, project, ...
    ##
    def sum(self, axis):
        """Sums all data along axis, returns d-1 dimensional histogram"""
        axis = self.get_axis_number(axis)
        if self.dimensions == 2:
            new_hist = Hist1d
        else:
            new_hist = Histdd
        return new_hist.from_histogram(np.sum(self.histogram, axis=axis),
                                       bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
                                       axis_names=self.axis_names_without(axis))

    def slice(self, start, stop=None, axis=0):
        """Restrict histogram to bins whose data values (not bin numbers) along axis are between start and stop
        (both inclusive). Returns d dimensional histogram."""
        if stop is None:
            # Make a 1=bin slice
            stop = start
        axis = self.get_axis_number(axis)
        start_bin = max(0, self.get_axis_bin_index(start, axis))
        stop_bin = min(len(self.bin_centers(axis)) - 1,  # TODO: test off by one!
                       self.get_axis_bin_index(stop, axis))
        new_bin_edges = self.bin_edges.copy()
        new_bin_edges[axis] = new_bin_edges[axis][start_bin:stop_bin + 2]   # TODO: Test off by one here!
        return Histdd.from_histogram(np.take(self.histogram, np.arange(start_bin, stop_bin + 1), axis=axis),
                                     bin_edges=new_bin_edges, axis_names=self.axis_names)

    def slicesum(self, start, stop=None, axis=0):
        """Slices the histogram along axis, then sums over that slice, returning a d-1 dimensional histogram"""
        return self.slice(start, stop, axis).sum(axis)

    def projection(self, axis):
        """Sums all data along all other axes, then return Hist1D"""
        axis = self.get_axis_number(axis)
        projected_hist = np.sum(self.histogram, axis=self.other_axes(axis))
        return Hist1d.from_histogram(projected_hist, bin_edges=self.bin_edges[axis])

    ##
    # Density methods: cumulate, normalize, ...
    ##

    def cumulate(self, axis):
        """Returns new histogram with all data cumulated along axis."""
        axis = self.get_axis_number(axis)
        return Histdd.from_histogram(np.cumsum(self.histogram, axis=axis),
                                     bin_edges=self.bin_edges,
                                     axis_names=self.axis_names)

    def _simsalabim_slice(self, axis):
        return [slice(None) if i != axis else np.newaxis
                for i in range(self.dimensions)]

    def normalize(self, axis):
        """Returns new histogram where all values along axis (in one bin of the other axes) sum to 1"""
        axis = self.get_axis_number(axis)
        sum_along_axis = np.sum(self.histogram, axis=axis)
        # Don't do anything for subspaces without any entries -- this avoids nans everywhere
        sum_along_axis[sum_along_axis == 0] = 1
        hist = self.histogram / sum_along_axis[tuple(self._simsalabim_slice(axis))]
        return Histdd.from_histogram(hist,
                                     bin_edges=self.bin_edges,
                                     axis_names=self.axis_names)

    def cumulative_density(self, axis):
        """Returns new histogram with all values replaced by their cumulative densities along axis."""
        return self.normalize(axis).cumulate(axis)

    def central_likelihood(self, axis):
        """Returns new histogram with all values replaced by their central likelihoods along axis."""
        result = self.cumulative_density(axis)
        result.histogram = 1 - 2 * np.abs(result.histogram - 0.5)
        return result

    ##
    # Mixed methods: both reduce and summarize the data
    ##

    def percentile(self, percentile, axis, inclusive=True):
        """Returns d-1 dimensional histogram containing percentile of values along axis
        if inclusive=True, will report bin center of first bin for which percentile% of data lies in or below the bin
                    =False, ... data lies strictly below the bin
        10% percentile is calculated as: value at least 10% data is LOWER than
        """
        axis = self.get_axis_number(axis)

        # Shape of histogram
        s = self.histogram.shape

        # Shape of histogram after axis has been collapsed to 1
        s_collapsed = list(s)
        s_collapsed[axis] = 1

        # Shape of histogram with axis removed entirely
        s_removed = np.concatenate([s[:axis], s[axis + 1:]]).astype(int)

        # Using np.where here is too tricky, as it may not return a value for each "bin-columns"
        # First, get an array which has a minimum at the percentile-containing bins
        # The minimum may not be unique: if later bins are empty, they will not be
        ecdf = self.cumulative_density(axis).histogram
        if not inclusive:
            raise NotImplementedError("Non-inclusive percentiles not yet implemented")
        ecdf = np.nan_to_num(ecdf)    # Since we're relying on self-equality later
        x = ecdf - 2 * (ecdf >= percentile / 100)

        # We now want to get the location of the minimum
        # To ensure it is unique, add a very very very small monotonously increasing bit to x
        # Nobody will want 1e-9th percentiles, right? TODO
        sz = np.ones(len(s), dtype=int)
        sz[axis] = -1
        x += np.linspace(0, 1e-9, s[axis]).reshape(sz)

        # 1. Find the minimum along the axis
        # 2. Reshape to s_collapsed and perform == to get a mask
        # 3. Apply the mask to the bin centers along axis
        # 4. Unflatten with reshape
        result = self.all_axis_bin_centers(axis)[
            x == np.min(x, axis=axis).reshape(s_collapsed)
        ]
        result = result.reshape(s_removed)

        if self.dimensions == 2:
            new_hist = Hist1d
        else:
            new_hist = Histdd
        return new_hist.from_histogram(histogram=result,
                                       bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
                                       axis_names=self.axis_names_without(axis))

    def average(self, axis):
        """Returns d-1 dimensional histogram of (estimated) mean value of axis
        NB this is very different from averaging over the axis!!!
        """
        axis = self.get_axis_number(axis)
        avg_hist = np.ma.average(self.all_axis_bin_centers(axis),
                                 weights=self.histogram, axis=axis)
        if self.dimensions == 2:
            new_hist = Hist1d
        else:
            new_hist = Histdd
        return new_hist.from_histogram(histogram=avg_hist,
                                       bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
                                       axis_names=self.axis_names_without(axis))

    def std(self, axis):
        """Returns d-1 dimensional histogram of (estimated) std value along axis
        NB this is very different from just std of the histogram values (which describe bin counts)
        """
        def weighted_std(values, weights, axis):
            # Stolen from http://stackoverflow.com/questions/2413522
            average = np.average(values, weights=weights, axis=axis)
            average = average[self._simsalabim_slice(axis)]
            variance = np.average((values-average)**2, weights=weights, axis=axis)
            return np.sqrt(variance)

        axis = self.get_axis_number(axis)
        std_hist = weighted_std(self.all_axis_bin_centers(axis),
                                weights=self.histogram, axis=axis)
        if self.dimensions == 2:
            new_hist = Hist1d
        else:
            new_hist = Histdd
        return new_hist.from_histogram(histogram=std_hist,
                                       bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
                                       axis_names=self.axis_names_without(axis))

    ##
    # Other stuff
    ##
    def bin_volumes(self):
        return reduce(np.multiply, np.ix_(*[np.diff(bs) for bs in self.bin_edges]))

    def rebin(self, *factors, **kwargs):
        """Return a new histogram that is 'rebinned' (zoomed) by factors (tuple of floats) along each dimensions
          factors: tuple with zoom factors along each axis. e.g. 2 = double number of bins, 0.5 = halve them.
          order: Order for spline interpolation in scipy.ndimage.zoom. Defaults to  linear interpolation (order=1).

        The only accepted keyword argument is 'order'!!! (python 2 is not nice)

        The normalization is set to the normalization of the current histogram
        The factors don't have to be integers or fractions: scipy.ndimage.zoom deals with the rebinning arcana.
        """
        if not HAVE_SCIPY:
            raise NotImplementedError("Rebinning requires scipy.ndimage")
        if any([x != 'order' for x in kwargs.keys()]):
            raise ValueError("Only 'order' keyword argument is accepted. Yeah, this is confusing.. blame python 2.")
        order = kwargs.get('order', 1)

        # Construct a new histogram
        mh = self.similar_blank_histogram()

        if not len(factors) == self.dimensions:
            raise ValueError("You must pass %d rebin factors to rebin a %d-dimensional histogram" % (
                self.dimensions, self.dimensions
            ))

        # Zoom the bin edges.
        # It's a bit tricky for non-uniform bins:
        # we first construct a linear interpolator to take
        # fraction along axis -> axis coordinate according to current binning.
        # Then we feed it the new desired binning fractions.
        for i, f in enumerate(factors):
            x = self.bin_edges[i]
            mh.bin_edges[i] = np.interp(
                x=np.linspace(0, 1, round((len(x) - 1) * f) + 1),
                xp=np.linspace(0, 1, len(x)),
                fp=x)

        # Rebin the histogram using ndimage.zoom, then renormalize
        mh.histogram = zoom(self.histogram, factors, order=order)
        if mh.histogram.sum() != 0:
            mh.histogram *= self.histogram.sum() / mh.histogram.sum()
        # mh.histogram /= np.product(factors)

        return mh

    def get_random(self, size=10):
        """Returns (size, n_dim) array of random variates from the histogram.
        Inside the bins, a uniform distribution is assumed
        Note this assumes the histogram is an 'events per bin', not a pdf.
        TODO: test more.
        """
        # Sample random bin centers
        bin_centers_ravel = np.array(np.meshgrid(*self.bin_centers(),
                                                 indexing='ij')).reshape(self.dimensions, -1).T
        hist_ravel = self.histogram.ravel()
        hist_ravel = hist_ravel.astype(float)
        hist_ravel = hist_ravel / np.nansum(hist_ravel)
        result = bin_centers_ravel[np.random.choice(len(bin_centers_ravel),
                                                    p=hist_ravel,
                                                    size=size)]

        # Randomize the position inside the bin
        for dim_i in range(self.dimensions):
            bin_edges = self.bin_edges[dim_i]
            bin_widths = np.diff(bin_edges)

            # Note the - 1: for the first bin's bin center, searchsorted gives 1, but we want 0 here:
            index_of_bin = np.searchsorted(bin_edges, result[:, dim_i]) - 1
            result[:, dim_i] += (np.random.rand(size) - 0.5) * bin_widths[index_of_bin]

        return result

    def lookup(self, *coordinate_arrays):
        """Lookup values at specific points.
        coordinate_arrays: numpy arrays of coordinates, one for each dimension
        e.g. lookup(np.array([0, 2]), np.array([1, 3])) looks up (x=0, y=1) and (x=2, y3).

        Clips if out of range!! TODO: option to throw exception instead.
        TODO: Needs tests!!
        TODO: port to Hist1d... or finally join the classes
        TODO: Support for scalar arguments
        """
        assert len(coordinate_arrays) == self.dimensions
        # Convert each coordinate array to an index array
        index_arrays = [np.clip(np.searchsorted(self.bin_edges[i], coordinate_arrays[i]) - 1,
                                0,
                                len(self.bin_edges[i]) - 2)
                        for i in range(self.dimensions)]
        # Use the index arrays to slice the histogram
        return self.histogram[tuple(index_arrays)]

        # Check against slow version:
        # def hist_to_interpolator_slow(mh):
        #      bin_centers_ravel = np.array(np.meshgrid(*mh.bin_centers(), indexing='ij')).reshape(2, -1).T
        #      return NearestNDInterpolator(bin_centers_ravel, mh.histogram.ravel())
        # x = np.random.uniform(0, 400, 100)
        # y = np.random.uniform(0, 200, 100)
        # hist_to_interpolator(mh)(x, y) - hist_to_interpolator_slow(mh)(x, y)

    def lookup_hist(self, mh):
        """Return histogram within binning of Histdd mh, with values looked up in this histogram.

        This is not rebinning: no interpolation /renormalization is performed.
        It's just a lookup.
        """
        result = mh.similar_blank_histogram()
        points = np.stack([mh.all_axis_bin_centers(i)
                           for i in range(mh.dimensions)]).reshape(mh.dimensions, -1)
        values = self.lookup(*points)
        result.histogram = values.reshape(result.histogram.shape)
        return result

    def plot(self, log_scale=False, log_scale_vmin=1,
             colorbar=True,
             cblabel='Number of entries',
             colorbar_kwargs=None,
             plt=plt,
             **kwargs):

        if colorbar_kwargs is None:
            colorbar_kwargs = dict()
        colorbar_kwargs['label'] = cblabel

        if not CAN_PLOT:
            raise ValueError("matplotlib did not import, so can't plot your histogram...")

        if self.dimensions == 1:
            return Hist1d.from_histogram(self.histogram, self.bin_edges[0]).plot(**kwargs)

        elif self.dimensions == 2:
            if log_scale:
                kwargs.setdefault('norm',
                                  matplotlib.colors.LogNorm(
                                      vmin=kwargs.pop('vmin', max(log_scale_vmin, self.histogram.min())),
                                      vmax=kwargs.pop('vmax', self.histogram.max()))
                                  )
            mesh = plt.pcolormesh(self.bin_edges[0], self.bin_edges[1], self.histogram.T, **kwargs)
            plt.xlim(np.min(self.bin_edges[0]), np.max(self.bin_edges[0]))
            plt.ylim(np.min(self.bin_edges[1]), np.max(self.bin_edges[1]))
            if self.axis_names:
                plt.xlabel(self.axis_names[0])
                plt.ylabel(self.axis_names[1])
            if colorbar:
                cb = plt.colorbar(**colorbar_kwargs)
                cb.ax.minorticks_on()
                return mesh, cb
            return mesh

        else:
            raise ValueError("Can only plot 1- or 2-dimensional histograms!")


Histdd.project = Histdd.projection


if __name__ == '__main__':
    # Create histograms just like from numpy...
    m = Hist1d([0, 3, 1, 6, 2, 9], bins=3)

    # ...or add data incrementally:
    m = Hist1d(bins=100, range=(-3, 4))
    m.add(np.random.normal(0, 0.5, 10**4))
    m.add(np.random.normal(2, 0.2, 10**3))

    # Get the data back out:
    print(m.histogram, m.bin_edges)

    if CAN_PLOT:
        # Access derived quantities like bin_centers, normalized_histogram, density, cumulative_density, mean, std
        plt.plot(m.bin_centers, m.normalized_histogram, label="Normalized histogram", linestyle='steps')
        plt.plot(m.bin_centers, m.density, label="Empirical PDF", linestyle='steps')
        plt.plot(m.bin_centers, m.cumulative_density, label="Empirical CDF", linestyle='steps')
        plt.title("Estimated mean %0.2f, estimated std %0.2f" % (m.mean, m.std))
        plt.legend(loc='best')
        plt.show()

    # Slicing and arithmetic behave just like ordinary ndarrays
    print("The fourth bin has %d entries" % m[3])
    m[1:4] += 4 + 2 * m[-27:-24]
    print("Now it has %d entries" % m[3])

    # Of course I couldn't resist adding a canned plotting function:
    if CAN_PLOT:
        m.plot()
        plt.show()

        # Create and show a 2d histogram. Axis names are optional.
        m2 = Histdd(bins=100, range=[[-5, 3], [-3, 5]], axis_names=['x', 'y'])
        m2.add(np.random.normal(1, 1, 10**6), np.random.normal(1, 1, 10**6))
        m2.add(np.random.normal(-2, 1, 10**6), np.random.normal(2, 1, 10**6))

        # x and y projections return Hist1d objects
        m2.projection('x').plot(label='x projection')
        m2.projection(1).plot(label='y projection')

        plt.legend()
        plt.show()


##
# Error bar helpers
##

# Zero-background 1 sigma Poisson Feldman-Cousins intervals
# From table II in https://arxiv.org/pdf/physics/9711021.pdf
_fc_intervals = np.array([
    [0.0, 1.29],
    [0.37, 2.75],
    [0.74, 4.25],
    [1.1, 5.3],
    [2.34, 6.78],
    [2.75, 7.81],
    [3.82, 9.28],
    [4.25, 10.3],
    [5.3, 11.32],
    [6.44, 12.79],
    [6.78, 13.81],
    [7.81, 14.82],
    [8.83, 16.29],
    [9.28, 17.3],
    [10.3, 18.32],
    [11.32, 19.32],
    [12.33, 20.8],
    [12.79, 21.81],
    [13.81, 22.82],
    [14.82, 23.82],
    [15.83, 25.3]])


def poisson_central_interval(k, cl=0.6826894921370859):
    """Return central Poisson confidence interval
    :param k: observed events
    :param cl: confidence level
    """
    if not HAVE_SCIPY:
        raise NotImplementedError("Poisson errors require scipy")
    # Adapted from https://stackoverflow.com/a/14832525
    k = np.asarray(k).astype(int)
    alpha = 1 - cl
    low = stats.chi2.ppf(alpha / 2, 2 * k) / 2
    high = stats.chi2.ppf(1 - alpha / 2, 2 * k + 2) / 2
    return np.stack([np.nan_to_num(low), high])


def poisson_1s_interval(k, fc=True):
    """Return (low, high) 1 sigma Poisson confidence intervals

    :param k: Observed events (int or array of ints)
    :param fc: if True (default), use Feldman-Cousins for k <= 20,
    and central intervals otherwise.
    (at k = 20, the difference between these is 1-2%).
    """
    k = np.asarray(k).astype(int)
    result = poisson_central_interval(k)
    if fc:
        mask = k <= 20
        result[:, mask] = _fc_intervals[k[mask]].T
    return result


def _repeat_first(q):
    return np.concatenate([[q[0]], q])
Browse the archive

https://github.com/JelleAalbers/multihist