2D Histograms in physt

[1]:
# Necessary import evil
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import physt
import seaborn as sns
from physt import h1, h2, histogramdd
%matplotlib inline

# Fix the global NumPy seed so the random samples below are reproducible
np.random.seed(seed=42)
[2]:
# Two independent normal samples used as the x and y coordinates
x = np.random.normal(loc=100, scale=1, size=1000)
y = np.random.normal(loc=10, scale=10, size=1000)
[3]:
# Build a basic 2D histogram: 8 bins along x and 4 bins along y
histogram = h2(
    x,
    y,
    [8, 4],
    name="Some histogram",
    axis_names=["x", "y"],
)
histogram
[3]:
Histogram2D('Some histogram', bins=(8, 4), total=1000, dtype=int64)
[4]:
# Frequencies are a 2D array (here 8 rows x 4 columns, matching bins=(8, 4))
histogram.frequencies
[4]:
array([[  0,   2,   4,   0],
       [  3,  26,  20,   5],
       [ 17,  78, 104,  10],
       [ 26, 163, 147,  17],
       [ 17, 136,  96,  17],
       [  6,  41,  38,   6],
       [  1,  11,   7,   0],
       [  0,   1,   0,   1]], dtype=int64)

Multidimensional binning

In most cases, binning methods that apply to 1D histograms can also be used in higher dimensions. In such cases, each parameter can be either a scalar (which applies to all dimensions) or a list/tuple with independent values for each dimension. This also applies to range, which has to be a list/tuple of tuples.

[6]:
# Fixed-width binning with a separate bin width per axis (2 for x, 10 for y)
histogram = h2(
    x,
    y,
    "fixed_width",
    bin_width=[2, 10],
    name="Fixed-width bins",
    axis_names=["x", "y"],
)
histogram.plot();
# Bin edges, one array per axis
histogram.numpy_bins
[6]:
[array([ 96.,  98., 100., 102., 104.]),
 array([-20., -10.,   0.,  10.,  20.,  30.,  40.,  50.])]
_images/2d_histograms_6_1.png
[7]:
# Quantile binning with a separate bin count per axis (3 for x, 4 for y)
histogram = h2(
    x,
    y,
    "quantile",
    bin_count=[3, 4],
    name="Quantile bins",
    axis_names=["x", "y"],
)
histogram.plot(cmap_min=0);
# Bin edges, one array per axis
histogram.numpy_bins
[7]:
[array([ 96.75873266,  99.54993453, 100.40825276, 103.85273149]),
 array([-19.40388635,   3.93758311,  10.63077132,  17.28882177,
         41.93107568])]
_images/2d_histograms_7_1.png
[8]:
# "human" binning; the scalar bin_count is shared by both axes
histogram = h2(
    x,
    y,
    "human",
    bin_count=5,
    name="Human-friendly bins",
    axis_names=["x", "y"],
)
histogram.plot();
# Bin edges, one array per axis
histogram.numpy_bins
[8]:
[array([ 96.,  98., 100., 102., 104.]),
 array([-20., -10.,   0.,  10.,  20.,  30.,  40.,  50.])]
_images/2d_histograms_8_1.png

Plotting

2D

[ ]:
# Default is workable: plot() is called without any arguments and its
# return value is kept in `ax`
ax = histogram.plot()
[9]:
# Custom colormap, no colorbar; the grid colour is taken from the middle
# of the same colormap (cm.copper(0.5))
fig, ax = plt.subplots()
ax = histogram.plot(ax=ax, cmap=cm.copper, show_colorbar=False, grid_color=cm.copper(0.5))
ax.set_title("Custom colormap");
_images/2d_histograms_11_0.png
[10]:
# Use a named colormap + limit it to a range of values (cmap_min..cmap_max);
# the colormap is given by name, so no colormap object is needed here
fig, ax = plt.subplots()
ax = histogram.plot(
    ax=ax,
    cmap="Oranges",
    show_colorbar=True,
    cmap_min=20,
    cmap_max=100,
    show_values=True,
)
ax.set_title("Clipped colormap");
_images/2d_histograms_12_0.png
[11]:
# Write the value into each bin (zero bins are hidden); lw=0 removes the grid
ax = histogram.plot(
    show_values=True,
    show_zero=False,
    cmap=cm.RdBu,
    format_value=float,
    lw=0,
)
_images/2d_histograms_13_0.png

Large histograms as images

Plotting histograms in this way gets problematic with more than roughly 50x50 bins. There is an alternative, though, partially inspired by the datashader project: plot the histogram as a bitmap, which works very fast even for very large histograms.

Note: This method does not work for histograms with irregular bins.

[12]:
# A much larger sample (one million points) for the image-based plots
x = np.random.normal(loc=100, scale=1, size=1_000_000)
y = np.random.normal(loc=10, scale=10, size=1_000_000)
[13]:
# Three panels side by side: "map" with 20 bins, "image" with 20 bins,
# and "image" with 500 bins
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
h2(x, y, 20, name="20 bins - map").plot(
    "map",
    cmap="rainbow",
    lw=0,
    alpha=1,
    ax=axes[0],
    show_colorbar=False,
)
h2(x, y, 20, name="20 bins - image").plot(
    "image",
    cmap="rainbow",
    alpha=1,
    ax=axes[1],
)
h2(x, y, 500, name="500 bins - image").plot(
    "image",
    cmap="rainbow",
    alpha=1,
    ax=axes[2],
);
_images/2d_histograms_16_0.png

Note that the "image" output is equivalent to the "map" plot drawn without lines.

Transformation

Sometimes, the value range is too big to show details. Therefore, it may be of some use to transform the values by a function, e.g. logarithm.

[14]:
# Same three panels as before, this time with cmap_normalize="log"
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
h2(x, y, 20, name="20 bins - map").plot(
    "map",
    alpha=1,
    lw=0,
    show_zero=False,
    cmap="rainbow",
    ax=axes[0],
    show_colorbar=False,
    cmap_normalize="log",
)
h2(x, y, 20, name="20 bins - image").plot(
    "image",
    alpha=1,
    ax=axes[1],
    cmap="rainbow",
    cmap_normalize="log",
)
h2(x, y, 500, name="500 bins - image").plot(
    "image",
    alpha=1,
    ax=axes[2],
    cmap="rainbow",
    cmap_normalize="log",
);
_images/2d_histograms_19_0.png
[15]:
# Composition - meant to show a histogram overlaid with "points"
fig, ax = plt.subplots(figsize=(8, 7))
h_2 = h2(x, y, 30)
h_2.plot(
    "map",
    lw=0,
    alpha=0.9,
    cmap="Blues",
    ax=ax,
    cmap_normalize="log",
    show_zero=False,
)
# TODO: the "points" layer below is disabled - not working currently
# h2(x, y, 300).plot("image", alpha=1, cmap="Greys", ax=ax, transform=lambda x: x > 0);
[15]:
<AxesSubplot:xlabel='axis0', ylabel='axis1'>
_images/2d_histograms_20_1.png

3D

By this, we mean 3D bar plots of 2D histograms (not a visual representation of 3D histograms).

[16]:
# "bar3d" plot of the 2D histogram, with a colormap passed via cmap
histogram.plot("bar3d", cmap="rainbow");
_images/2d_histograms_22_0.png
[17]:
# "bar3d" plot of the 2D histogram, with a single colour passed via color
histogram.plot("bar3d", color="red");
_images/2d_histograms_23_0.png

Projections

[18]:
# Project the 2D histogram onto its "x" axis; the result is a 1D histogram
proj1 = histogram.projection("x", name="Projection to X")
proj1.plot(errors=True)
# Last expression: display the projected histogram's repr
proj1
[18]:
Histogram1D('Projection to X', bins=(4,), total=1000, dtype=int64)
_images/2d_histograms_25_1.png
[19]:
# Project the 2D histogram onto its "y" axis; the result is a 1D histogram
proj2 = histogram.projection("y", name="Projection to Y")
proj2.plot(errors=True)
# Last expression: display the projected histogram's repr
proj2
[19]:
Histogram1D('Projection to Y', bins=(7,), total=1000, dtype=int64)
_images/2d_histograms_26_1.png

Adaptive 2D histograms

[20]:
# Create and add two histograms with adaptive binning

# First sample and its histogram (bin width 1 on both axes)
height1 = np.random.normal(loc=180, scale=5, size=1000)
weight1 = np.random.normal(loc=80, scale=2, size=1000)
ad1 = h2(
    height1,
    weight1,
    "fixed_width",
    bin_width=1,
    adaptive=True,
)
ad1.plot(show_zero=False)

# Second sample, drawn around different means, binned the same way
height2 = np.random.normal(loc=160, scale=5, size=1000)
weight2 = np.random.normal(loc=70, scale=2, size=1000)
ad2 = h2(
    height2,
    weight2,
    "fixed_width",
    bin_width=1,
    adaptive=True,
)
ad2.plot(show_zero=False)

# Sum of the two histograms
(ad1 + ad2).plot(show_zero=False);
_images/2d_histograms_28_0.png
_images/2d_histograms_28_1.png
_images/2d_histograms_28_2.png

N-dimensional histograms

Although it is not easy to visualize them, it is possible to create histograms of any dimension that behave similarly to 2D ones. Warning: be aware that the memory consumption can be significant.

[21]:
# Create a 4D histogram - input: 1000 uniform random points, one column
# per dimension (four independent draws stacked side by side)
data = np.stack([np.random.rand(1000) for _ in range(4)], axis=1)
# One bin count per dimension; the four axes are named "a", "b", "c" and "d"
h4 = histogramdd(
    data,
    [3, 2, 2, 3],
    axis_names="abcd",
)
h4
[21]:
HistogramND(bins=(3, 2, 2, 3), total=1000, dtype=int64)
[22]:
# Frequencies of the 4D histogram form a 4D array (here 3 x 2 x 2 x 3)
h4.frequencies
[22]:
array([[[[31, 28, 33],
         [21, 22, 22]],

        [[25, 29, 28],
         [29, 35, 28]]],


       [[[20, 25, 20],
         [28, 32, 31]],

        [[30, 28, 24],
         [29, 21, 27]]],


       [[[27, 26, 33],
         [21, 35, 30]],

        [[38, 30, 32],
         [25, 30, 27]]]], dtype=int64)
[23]:
# Keep only axes "a" and "d" and plot the resulting 2D histogram
h4.projection("a", "d", name="4D -> 2D").plot(
    show_values=True,
    format_value=int,
    cmap_min="min",
);
_images/2d_histograms_32_0.png
[24]:
# Keep only axis "d" and draw the resulting 1D histogram as a scatter plot
(
    h4
    .projection("d", name="4D -> 1D")
    .plot("scatter", errors=True)
);
_images/2d_histograms_33_0.png

Support for pandas DataFrames (without pandas dependency ;-))

[25]:
# Load notorious example data set (the iris data, via seaborn)
# NOTE(review): load_dataset presumably needs network access or a local
# cache - confirm before running offline
iris = sns.load_dataset('iris')
[28]:
# 2D histogram of sepal length vs. sepal width, built directly from the
# columns of `iris` (already loaded in the previous cell, so not reloaded here)
iris_hist = h2(iris["sepal_length"], iris["sepal_width"], "human", bin_count=[12, 7], name="Iris")
iris_hist.plot(show_zero=False, cmap=cm.gray_r, show_values=True, format_value=int);
_images/2d_histograms_36_0.png
[29]:
# Project onto the sepal-length axis; the axis is addressed by the column
# name even though no axis_names were passed when creating iris_hist
iris_hist.projection("sepal_length").plot();
_images/2d_histograms_37_0.png