Binning in physt¶
[1]:
# Necessary import evil
%matplotlib inline
from physt import histogram, binnings
import numpy as np
import matplotlib.pyplot as plt
[2]:
# Synthetic data shared by the examples below: two normally distributed
# "height" samples and one uniform sample on [0, 1).
N_SAMPLES = 100000

# Fix the global NumPy seed so every run draws the same values.
np.random.seed(42)
heights1 = np.random.normal(loc=169, scale=10, size=N_SAMPLES)
heights2 = np.random.normal(loc=180, scale=6, size=N_SAMPLES)
numbers = np.random.rand(N_SAMPLES)
Ideal number of bins¶
[3]:
# Sample sizes: 50 log-spaced values between 1 and 10**4, truncated to integers.
X = [int(size) for size in np.logspace(0, 4, 50)]
algos = binnings.bincount_methods
Ys = {algo: [] for algo in algos}

# Draw one exponential sample per size and ask every method for its bin count
# on that same sample, so the curves are directly comparable.
for x in X:
    ex_dataset = np.random.exponential(1, x)
    for algo in algos:
        Ys[algo].append(binnings.ideal_bin_count(ex_dataset, algo))

# Log-log plot of suggested bin count against sample size, one curve per method.
figure, axis = plt.subplots(figsize=(8, 8))
for algo in algos:
    # The "default" method is drawn dotted with point markers; the rest are solid.
    line_format = ":." if algo == "default" else "-"
    axis.plot(X, Ys[algo], line_format, label=algo, alpha=0.5, lw=2)
axis.set(xscale="log", yscale="log", xlabel="Sample size", ylabel="Bin count")
axis.legend(loc=2);
Binning schemes¶
Exponential binning¶
Uses numpy.logspace to create bins.
[4]:
# Ten exponentially spaced bins over the uniform sample. The range starts just
# above zero (presumably because log-spaced edges cannot start at 0).
hist1 = histogram(numbers, "exponential", bin_count=10, range=(0.0001, 1))

figure, axis = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: raw frequencies on a linear x axis.
hist1.plot(color="green", ax=axis[0])
axis[0].set_title("Absolute scale")

# Right panel: densities with error bars on a logarithmic x axis.
hist1.plot(density=True, errors=True, ax=axis[1])
axis[1].set_title("Log scale")
axis[1].set_xscale("log");
Integer binning¶
Useful for integer values (or values you want to round to integers). Creates bins of width 1 centered on integers (e.g. 0.5-1.5, …).
[5]:
# Simulate 10000 throws of two six-sided dice; the distribution of the sum
# should look triangular.
first_die = np.floor(np.random.rand(10000) * 6) + 1
second_die = np.floor(np.random.rand(10000) * 6) + 1
dice = first_die + second_die

# One bin per integer value, with ticks placed at the bin centres.
histogram(dice, "integer").plot(ticks="center", density=True);
Quantile-based binning¶
Based on quantiles, this binning results in all bins containing roughly the same number of observations.
[6]:
# Forty quantile-based bins over the first height sample.
hist2 = histogram(heights1, "quantile", bin_count=40)

# Same histogram shown twice: raw frequencies on the left, density on the right.
figure, axis = plt.subplots(1, 2, figsize=(10, 4))
hist2.plot(ax=axis[0])
axis[0].set_title("Frequencies")
hist2.plot(density=True, ax=axis[1])
axis[1].set_title("Density")

# Echo the histogram so its summary becomes the cell output.
hist2
[6]:
Histogram1D(bins=(40,), total=100000, dtype=int64)
[7]:
# Overlay quantile-based and equal-width binning (10 bins each) of the same
# data on a single axis to compare the resulting densities.
figure, axis = plt.subplots()

quantile_hist = histogram(heights1, "quantile", bin_count=10)
quantile_hist.plot(alpha=0.3, density=True, ax=axis, label="Quantile based")

equal_hist = histogram(heights1, 10)
equal_hist.plot(alpha=0.3, density=True, ax=axis, color="green", label="Equal spaced")

axis.legend(loc=2);
Fixed-width bins¶
This binning is useful if you want “human-friendly” bin intervals.
[8]:
# Bins of constant width 3 (in the units of the data).
hist_fixed = histogram(
    heights1,
    "fixed_width",
    bin_width=3,
)
hist_fixed.plot()

# Echo the histogram so its summary becomes the cell output.
hist_fixed
[8]:
Histogram1D(bins=(31,), total=100000, dtype=int64)
“Human” bins¶
The width and alignment of bins are guessed from the data, with an approximate number of bins as an (optional) parameter.
[9]:
# Request roughly 15 bins; the "human" scheme chooses the actual edges itself,
# so the resulting bin count may differ from the request.
human = histogram(
    heights1,
    "human",
    bin_count=15,
)
human.plot()

# Echo the histogram so its summary becomes the cell output.
human
[9]:
Histogram1D(bins=(19,), total=100000, dtype=int64)
Astropy binning¶
Astropy includes its own histogramming tools. If this package is available, we reuse its binning methods. These include:
- Bayesian blocks
- Knuth
- Freedman
- Scott
See http://docs.astropy.org/en/stable/visualization/histogram.html for more details.
[10]:
# A smaller normal sample (5000 values) for the astropy-backed methods.
middle_sized = np.random.normal(loc=180, scale=6, size=5000)

# One density plot per astropy-backed binning method; the method name is also
# stored as the histogram's name.
for algo in ("blocks", "scott", "knuth", "freedman"):
    hist = histogram(middle_sized, algo, name=algo)
    hist.plot(density=True)