Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Histogram2d #2262

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
241 changes: 198 additions & 43 deletions dpnp/dpnp_iface_histograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,25 @@

import dpctl.utils as dpu
import numpy
from dpctl.tensor._type_utils import _can_cast

import dpnp

# pylint: disable=no-name-in-module
import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext
from dpnp.dpnp_utils.dpnp_utils_common import (
result_type_for_device,
to_supported_dtypes,
)

# pylint: disable=no-name-in-module
from .dpnp_utils import get_usm_allocations, map_dtype_to_device
from .dpnp_utils import get_usm_allocations

__all__ = [
"bincount",
"digitize",
"histogram",
"histogram_bin_edges",
"histogram2d",
"histogramdd",
]

Expand All @@ -65,33 +69,15 @@
_range = range


def _result_type_for_device(dtypes, device):
rt = dpnp.result_type(*dtypes)
return map_dtype_to_device(rt, device)


def _align_dtypes(a_dtype, bins_dtype, ntype, supported_types, device):
has_fp64 = device.has_aspect_fp64
has_fp16 = device.has_aspect_fp16

a_bin_dtype = _result_type_for_device([a_dtype, bins_dtype], device)
a_bin_dtype = result_type_for_device([a_dtype, bins_dtype], device)

# histogram implementation doesn't support uint64 as histogram type
# we can use int64 instead. Result would be correct even in case of overflow
if ntype == numpy.uint64:
ntype = dpnp.int64

if (a_bin_dtype, ntype) in supported_types:
return a_bin_dtype, ntype

for sample_type, hist_type in supported_types:
if _can_cast(
a_bin_dtype, sample_type, has_fp16, has_fp64
) and _can_cast(ntype, hist_type, has_fp16, has_fp64):
return sample_type, hist_type

# should not happen
return None, None # pragma: no cover
return to_supported_dtypes([a_bin_dtype, ntype], supported_types, device)


def _ravel_check_a_and_weights(a, weights):
Expand Down Expand Up @@ -138,6 +124,9 @@ def _is_finite(a):
return numpy.isfinite(a)

if range is not None:
if len(range) != 2:
raise ValueError("range argument must consist of 2 elements.")

first_edge, last_edge = range
if first_edge > last_edge:
raise ValueError("max must be larger than min in range parameter.")
Expand Down Expand Up @@ -520,6 +509,7 @@ def histogram(a, bins=10, range=None, density=None, weights=None):
If `bins` is a sequence, it defines a monotonically increasing array
of bin edges, including the rightmost edge, allowing for non-uniform
bin widths.

Default: ``10``.
range : {None, 2-tuple of float}, optional
The lower and upper range of the bins. If not provided, range is simply
Expand All @@ -528,6 +518,7 @@ def histogram(a, bins=10, range=None, density=None, weights=None):
affects the automatic bin computation as well. While bin width is
computed to be optimal based on the actual data within `range`, the bin
count will fill the entire range including portions containing no data.

Default: ``None``.
density : {None, bool}, optional
If ``False`` or ``None``, the result will contain the number of samples
Expand All @@ -536,6 +527,7 @@ def histogram(a, bins=10, range=None, density=None, weights=None):
the range is ``1``. Note that the sum of the histogram values will not
be equal to ``1`` unless bins of unity width are chosen; it is not
a probability *mass* function.

Default: ``None``.
weights : {None, dpnp.ndarray, usm_ndarray}, optional
An array of weights, of the same shape as `a`. Each value in `a` only
Expand All @@ -545,6 +537,7 @@ def histogram(a, bins=10, range=None, density=None, weights=None):
Please note that the ``dtype`` of `weights` will also become the
``dtype`` of the returned accumulator (`hist`), so it must be large
enough to hold accumulated values as well.

Default: ``None``.

Returns
Expand Down Expand Up @@ -751,6 +744,167 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
return bin_edges


def histogram2d(x, y, bins=10, range=None, density=None, weights=None):
"""
Compute the bi-dimensional histogram of two data samples.

Parameters
----------
x : {dpnp.ndarray, usm_ndarray} of shape (N,)
An array containing the `x` coordinates of the points to be
histogrammed.
y : {dpnp.ndarray, usm_ndarray} of shape (N,)
An array containing the `y` coordinates of the points to be
histogrammed.
bins : {int, dpnp.ndarray, usm_ndarray, [int, int], [array, array], \
[int, array], [array, int]}, optional

The bins specification:

* If int, the number of bins for the two dimensions (nx=ny=bins).
* If array, the bin edges for the two dimensions
(x_edges=y_edges=bins).
* If [int, int], the number of bins in each dimension
(nx, ny = bins).
* If [array, array], the bin edges in each dimension
(x_edges, y_edges = bins).
* A combination [int, array] or [array, int], where int
is the number of bins and array is the bin edges.

Default: ``10``.
range : {None, dpnp.ndarray, usm_ndarray} of shape (2,2), optional
The leftmost and rightmost edges of the bins along each dimension
If ``None`` the ranges are
``[[x.min(), x.max()], [y.min(), y.max()]]``. All values outside
of this range will be considered outliers and not tallied in the
histogram.

Default: ``None``.
density : {None, bool}, optional
If ``False`` or ``None``, the default, returns the number of
samples in each bin.
If ``True``, returns the probability *density* function at the bin,
``bin_count / sample_count / bin_volume``.

Default: ``None``.
weights : {None, dpnp.ndarray, usm_ndarray} of shape (N,), optional
An array of values ``w_i`` weighing each sample ``(x_i, y_i)``.
Weights are normalized to ``1`` if `density` is ``True``.
If `density` is ``False``, the values of the returned histogram
antonwolfy marked this conversation as resolved.
Show resolved Hide resolved
are equal to the sum of the weights belonging to the samples
falling into each bin.
If ``None`` all samples are assigned a weight of ``1``.

Default: ``None``.
Returns
-------
H : dpnp.ndarray of shape (nx, ny)
The bi-dimensional histogram of samples `x` and `y`. Values in `x`
are histogrammed along the first dimension and values in `y` are
histogrammed along the second dimension.
xedges : dpnp.ndarray of shape (nx+1,)
The bin edges along the first dimension.
yedges : dpnp.ndarray of shape (ny+1,)
The bin edges along the second dimension.

See Also
--------
:obj:`dpnp.histogram` : 1D histogram
:obj:`dpnp.histogramdd` : Multidimensional histogram

Notes
-----
When `density` is ``True``, then the returned histogram is the sample
density, defined such that the sum over bins of the product
``bin_value * bin_area`` is 1.

Please note that the histogram does not follow the Cartesian convention
where `x` values are on the abscissa and `y` values on the ordinate
axis. Rather, `x` is histogrammed along the first dimension of the
array (vertical), and `y` along the second dimension of the array
(horizontal). This ensures compatibility with `histogramdd`.

Examples
--------
>>> import dpnp as np
>>> x = np.random.randn(20).astype("float32")
>>> y = np.random.randn(20).astype("float32")
>>> hist, edges_x, edges_y = np.histogram2d(x, y, bins=(4, 3))
>>> hist.shape
(4, 3)
>>> hist
array([[1., 2., 0.],
[0., 3., 1.],
[1., 4., 1.],
[1., 3., 3.]], dtype=float32)
>>> edges_x.shape
(5,)
>>> edges_x
array([-1.7516936 , -0.96109843, -0.17050326, 0.62009203, 1.4106871 ],
dtype=float32)
>>> edges_y.shape
(4,)
>>> edges_y
array([-2.6604428 , -0.94615364, 0.76813555, 2.4824247 ], dtype=float32)

Please note, that resulting values of histogram and edges may vary.

"""

dpnp.check_supported_arrays_type(x, y)
if weights is not None:
dpnp.check_supported_arrays_type(weights)

if x.ndim != 1 or y.ndim != 1:
raise ValueError(
f"x and y must be 1-dimensional arrays."
f"Got {x.ndim} and {y.ndim} respectively"
)

if len(x) != len(y):
raise ValueError(
f"x and y must have the same length."
f"Got {len(x)} and {len(y)} respectively"
)

usm_type, exec_q = get_usm_allocations([x, y, bins, range, weights])
device = exec_q.sycl_device

sample_dtype = result_type_for_device([x.dtype, y.dtype], device)

# Unlike histogramdd histogram2d accepts 1d bins and
# apply it to both dimensions
# at the same moment two elements bins should be interpreted as
# number of bins in each dimension and array-like bins with one element
# is not allowed
if isinstance(bins, Iterable) and len(bins) > 2:
bins = [bins] * 2

bins = _histdd_normalize_bins(bins, 2)
bins_dtypes = [sample_dtype]
bins_dtypes += [b.dtype for b in bins if hasattr(b, "dtype")]

bins_dtype = result_type_for_device(bins_dtypes, device)
hist_dtype = _histdd_hist_dtype(exec_q, weights)

supported_types = statistics_ext.histogramdd_dtypes()

sample_dtype, _ = _align_dtypes(
sample_dtype, bins_dtype, hist_dtype, supported_types, device
)

sample = dpnp.empty_like(
x, shape=x.shape + (2,), dtype=sample_dtype, usm_type=usm_type
)
sample[:, 0] = x
sample[:, 1] = y

hist, edges = histogramdd(
sample, bins=bins, range=range, density=density, weights=weights
)
return hist, edges[0], edges[1]


def _histdd_validate_bins(bins):
for i, b in enumerate(bins):
if numpy.ndim(b) == 0:
Expand Down Expand Up @@ -873,9 +1027,7 @@ def _histdd_hist_dtype(queue, weights):
# hist_dtype is either float or complex, so it is ok
# to calculate it as result type between default_float and
# weights.dtype
hist_dtype = _result_type_for_device(
[hist_dtype, weights.dtype], device
)
hist_dtype = result_type_for_device([hist_dtype, weights.dtype], device)

return hist_dtype

Expand All @@ -886,7 +1038,7 @@ def _histdd_sample_dtype(queue, sample, bin_edges_list):
dtypes_ = [bin_edges.dtype for bin_edges in bin_edges_list]
dtypes_.append(sample.dtype)

return _result_type_for_device(dtypes_, device)
return result_type_for_device(dtypes_, device)


def _histdd_supported_dtypes(sample, bin_edges_list, weights):
Expand Down Expand Up @@ -918,7 +1070,7 @@ def _histdd_extract_arrays(sample, weights, bins):
return all_arrays


def histogramdd(sample, bins=10, range=None, weights=None, density=False):
def histogramdd(sample, bins=10, range=None, density=None, weights=None):
"""
Compute the multidimensional histogram of some data.

Expand All @@ -936,30 +1088,33 @@ def histogramdd(sample, bins=10, range=None, weights=None, density=False):
* The number of bins for each dimension (nx, ny, ... =bins)
* The number of bins for all dimensions (nx=ny=...=bins).

Default: ``10``
Default: ``10``.
range : {None, sequence}, optional
A sequence of length D, each an optional (lower, upper) tuple giving
the outer bin edges to be used if the edges are not given explicitly in
`bins`.
An entry of None in the sequence results in the minimum and maximum
An entry of ``None`` in the sequence results in the minimum and maximum
values being used for the corresponding dimension.
None is equivalent to passing a tuple of D None values.

Default: ``None``
weights : {dpnp.ndarray, usm_ndarray}, optional
An (N,)-shaped array of values `w_i` weighing each sample
`(x_i, y_i, z_i, ...)`.
Weights are normalized to 1 if density is True. If density is False,
the values of the returned histogram are equal to the sum of the
weights belonging to the samples falling into each bin.
``None`` is equivalent to passing a tuple of D ``None`` values.

Default: ``None``
density : bool, optional
If ``False``, the default, returns the number of samples in each bin.
Default: ``None``.
density : {None, bool}, optional
If ``False`` or ``None``, the default, returns the number of
samples in each bin.
If ``True``, returns the probability *density* function at the bin,
``bin_count / sample_count / bin_volume``.

Default: ``False``
Default: ``None``.
weights : {None, dpnp.ndarray, usm_ndarray}, optional
An (N,)-shaped array of values `w_i` weighing each sample
`(x_i, y_i, z_i, ...)`.
Weights are normalized to ``1`` if density is ``True``.
If density is ``False``, the values of the returned histogram
are equal to the sum of the weights belonging to the samples
falling into each bin.
If ``None`` all samples are assigned a weight of ``1``.

Default: ``None``.

Returns
-------
Expand Down Expand Up @@ -993,7 +1148,7 @@ def histogramdd(sample, bins=10, range=None, weights=None, density=False):
elif sample.ndim > 2:
raise ValueError("sample must have no more than 2 dimensions")

ndim = sample.shape[1] if sample.size > 0 else 1
ndim = sample.shape[1]

_arrays = _histdd_extract_arrays(sample, weights, bins)
usm_type, queue = get_usm_allocations(_arrays)
Expand Down
Loading
Loading