Binning in physt

[1]:
# Necessary import evil
%matplotlib inline
from physt import histogram, binnings
import numpy as np
import matplotlib.pyplot as plt
[2]:
# Sample data shared by the examples below.
# Seeding the global generator makes every run produce the same samples.
np.random.seed(42)

# Two normal samples (loc = mean, scale = standard deviation) and one
# uniform sample on [0, 1), 100000 values each.
heights1 = np.random.normal(loc=169, scale=10, size=100000)
heights2 = np.random.normal(loc=180, scale=6, size=100000)
numbers = np.random.rand(100000)

Ideal number of bins

[3]:
# Sample sizes: 50 logarithmically spaced values from 10**0 to 10**4,
# truncated to integers.
X = list(map(int, np.logspace(0, 4, 50)))

# One list of bin counts per bin-count method, filled in the order of X.
algos = binnings.bincount_methods
Ys = {algo: [] for algo in algos}

for x in X:
    # A fresh exponential sample of the given size is shared by all methods.
    ex_dataset = np.random.exponential(1, x)
    for algo in algos:
        bin_count = binnings.ideal_bin_count(ex_dataset, algo)
        Ys[algo].append(bin_count)

figure, axis = plt.subplots(figsize=(8, 8))
for algo in algos:
    # The "default" method is drawn dotted with point markers so that it
    # stands out from the other, solid lines.
    line_style = ":." if algo == "default" else "-"
    axis.plot(X, Ys[algo], line_style, label=algo, alpha=0.5, lw=2)
axis.set(
    xscale="log",
    yscale="log",
    xlabel="Sample size",
    ylabel="Bin count",
)
axis.legend(loc=2);
_images/binning_4_0.png

Binning schemes

Exponential binning

Uses numpy.logspace to create bins.

[4]:
# Histogram of the uniform sample with 10 bins of the "exponential" binning
# on the range 0.0001 to 1.
hist1 = histogram(numbers, "exponential", bin_count=10, range=(0.0001, 1))

# Left panel: plain frequencies; right panel: densities with error bars
# on a logarithmic x axis.
figure, axis = plt.subplots(1, 2, figsize=(10, 4))
absolute_ax, log_ax = axis

hist1.plot(color="green", ax=absolute_ax)
absolute_ax.set_title("Absolute scale")

hist1.plot(density=True, errors=True, ax=log_ax)
log_ax.set_title("Log scale")
log_ax.set_xscale("log");
_images/binning_7_0.png

Integer binning

Useful for integer values (or for values you want to round to integers); it creates bins of width 1 centered on integers (e.g. 0.5–1.5, …).

[5]:
# Sum of two dice (the distribution should be triangular).
# floor(rand * 6) gives a value in 0..5 for each die; adding 2 to the sum
# shifts the result to the usual 2..12 range.
first_die = np.floor(np.random.rand(10000) * 6)
second_die = np.floor(np.random.rand(10000) * 6)
dice = first_die + second_die + 2

dice_hist = histogram(dice, "integer")
dice_hist.plot(ticks="center", density=True);
_images/binning_9_0.png

Quantile-based binning

Based on quantiles, this binning results in all bins containing roughly the same number of observations.

[6]:
# Quantile-based histogram of the first height sample with 40 bins.
hist2 = histogram(heights1, "quantile", bin_count=40)

# Show the same histogram twice: raw frequencies on the left,
# densities on the right.
figure, axis = plt.subplots(1, 2, figsize=(10, 4))
frequency_ax, density_ax = axis

hist2.plot(ax=frequency_ax)
frequency_ax.set_title("Frequencies")

hist2.plot(density=True, ax=density_ax)
density_ax.set_title("Density")

# Last expression of the cell: displays the histogram's repr.
hist2
[6]:
Histogram1D(bins=(40,), total=100000, dtype=int64)
_images/binning_11_1.png
[7]:
# Overlay a quantile-based and an equally spaced histogram (10 bins each)
# of the same sample on one set of axes.
figure, axis = plt.subplots()

# Keyword arguments common to both plot calls.
shared_style = dict(alpha=0.3, density=True, ax=axis)

quantile_hist = histogram(heights1, "quantile", bin_count=10)
quantile_hist.plot(label="Quantile based", **shared_style)

equal_hist = histogram(heights1, 10)
equal_hist.plot(color="green", label="Equal spaced", **shared_style)

axis.legend(loc=2);
_images/binning_12_0.png

Fixed-width bins

This binning is useful if you want “human-friendly” bin intervals.

[8]:
# Histogram of the first height sample using the "fixed_width" binning
# with a bin width of 3.
hist_fixed = histogram(heights1, "fixed_width", bin_width=3)
hist_fixed.plot()
# Last expression of the cell: displays the histogram's repr.
hist_fixed
[8]:
Histogram1D(bins=(31,), total=100000, dtype=int64)
_images/binning_14_1.png

“Human” bins

The width and alignment of the bins are guessed from the data, with an approximate number of bins as an (optional) parameter.

[9]:
# Histogram of the first height sample using the "human" binning.
# bin_count is only an approximate target here; the resulting number of
# bins may differ from it.
human = histogram(heights1, "human", bin_count=15)
human.plot()
# Last expression of the cell: displays the histogram's repr.
human
[9]:
Histogram1D(bins=(19,), total=100000, dtype=int64)
_images/binning_16_1.png

Astropy binning

Astropy includes its own histogramming tools. If this package is available, we reuse its binning methods. These include:

  • Bayesian blocks
  • Knuth
  • Freedman
  • Scott

See http://docs.astropy.org/en/stable/visualization/histogram.html for more details.

[10]:
# A smaller normal sample (mean 180, standard deviation 6, 5000 values)
# used to compare the astropy-backed binning methods.
middle_sized = np.random.normal(180, 6, 5000)

# Each method name is passed both as the binning method and as the name
# of the resulting histogram; one figure is produced per method.
for algo in ("blocks", "scott", "knuth", "freedman"):
    hist = histogram(middle_sized, algo, name=algo)
    hist.plot(density=True)
_images/binning_18_0.png
_images/binning_18_1.png
_images/binning_18_2.png
_images/binning_18_3.png