# test_clustergram.py
from sklearn.datasets import make_blobs
import pandas as pd
import numpy as np
import pytest
from bokeh.embed import json_item
from pandas.testing import assert_series_equal
try:
import cudf
import cuml
import cupy as cp
RAPIDS = True
except (ImportError, ModuleNotFoundError):
RAPIDS = False
from clustergram import Clustergram
# Shared fixture for the CPU tests below: a small, well-separated blob
# data set generated once at import time with a fixed seed.
n_samples = 100
n_features = 2
n_clusters = 8
random_state = 0
# `device_labels` is not used by any test visible in this file.
device_data, device_labels = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_clusters,
    random_state=random_state,
    cluster_std=0.1,
)
# The tests fit on a pandas DataFrame (or on `data.values` for array input).
data = pd.DataFrame(device_data)
def test_sklearn_kmeans():
    """Fit K-Means through the sklearn backend for k = 1..7 and pin the results.

    Checks the label table, the mean of the cluster centers for every k, the
    number of children of the object returned by ``plot`` and the overall
    mean of the plot data, both PCA-weighted and unweighted.
    """
    k_values = range(1, 8)
    cgram = Clustergram(k_values, backend="sklearn", random_state=random_state)
    cgram.fit(data)

    # Label table: column k holds exactly k distinct labels, no missing values.
    for k in k_values:
        assert cgram.labels[k].nunique() == k
    assert cgram.labels.shape == (100, 7)
    assert cgram.labels.notna().all().all()

    # Reference mean of all cluster-center coordinates, one value per k.
    center_means = [
        1.439891622331535,
        -2.809248339265837,
        -0.9554163965815223,
        0.15829646201444203,
        0.626698921291375,
        0.9155105021035385,
        1.0238657347680074,
    ]
    for reference, k in zip(center_means, k_values):
        assert reference == pytest.approx(
            np.mean(cgram.cluster_centers[k]), rel=1e-12
        )

    # PCA-weighted plot data is asserted empty before the first weighted plot.
    assert cgram.plot_data_pca.empty
    weighted = cgram.plot(pca_kwargs={"random_state": random_state})
    assert len(weighted.get_children()) == 46

    # Likewise for the unweighted plot data.
    assert cgram.plot_data.empty
    unweighted = cgram.plot(pca_weighted=False)
    assert len(unweighted.get_children()) == 46

    pca_mean = cgram.plot_data_pca.mean().mean()
    assert pca_mean == pytest.approx(-2.095277953205114, rel=1e-15)
    plain_mean = cgram.plot_data.mean().mean()
    assert plain_mean == pytest.approx(1.4398916223315354, rel=1e-15)
def test_sklearn_minibatchkmeans():
    """Fit MiniBatchKMeans through the sklearn backend for k = 1..7.

    Pins the label table, the per-k mean of the cluster centers, the number
    of children of the object returned by ``plot`` and the overall mean of
    the plot data, both PCA-weighted and unweighted.
    """
    k_values = range(1, 8)
    cgram = Clustergram(
        k_values,
        backend="sklearn",
        method="minibatchkmeans",
        random_state=random_state,
    )
    cgram.fit(data)

    # Label table: column k holds exactly k distinct labels, no missing values.
    for k in k_values:
        assert cgram.labels[k].nunique() == k
    assert cgram.labels.shape == (100, 7)
    assert cgram.labels.notna().all().all()

    # Reference mean of all cluster-center coordinates, one value per k.
    center_means = [
        1.439891622331535,
        1.5942431676314943,
        -0.9391362578715787,
        0.16457587659721762,
        0.7988407523191436,
        0.9230637622852088,
        1.0250449911587773,
    ]
    for reference, k in zip(center_means, k_values):
        assert reference == pytest.approx(
            np.mean(cgram.cluster_centers[k]), rel=1e-12
        )

    # PCA-weighted plot data is asserted empty before the first weighted plot.
    assert cgram.plot_data_pca.empty
    weighted = cgram.plot(pca_kwargs={"random_state": random_state})
    assert len(weighted.get_children()) == 45

    # Likewise for the unweighted plot data.
    assert cgram.plot_data.empty
    unweighted = cgram.plot(pca_weighted=False)
    assert len(unweighted.get_children()) == 45

    pca_mean = cgram.plot_data_pca.mean().mean()
    assert pca_mean == pytest.approx(-2.153978086091386, rel=1e-15)
    plain_mean = cgram.plot_data.mean().mean()
    assert plain_mean == pytest.approx(1.477158426841248, rel=1e-15)
def test_sklearn_gmm():
    """Fit the ``gmm`` method through the sklearn backend for k = 1..7.

    Pins the label table, the per-k mean of the cluster centers (looser
    tolerance than the K-Means tests), the number of children of the object
    returned by ``plot`` and the overall mean of the plot data.
    """
    k_values = range(1, 8)
    cgram = Clustergram(
        k_values, backend="sklearn", method="gmm", random_state=random_state
    )
    cgram.fit(data)

    # Label table: column k holds exactly k distinct labels, no missing values.
    for k in k_values:
        assert cgram.labels[k].nunique() == k
    assert cgram.labels.shape == (100, 7)
    assert cgram.labels.notna().all().all()

    # Reference mean of all cluster-center coordinates, one value per k.
    center_means = [
        1.4886908509157464,
        -2.8599808770366817,
        -0.8823883211732156,
        0.18416419702253917,
        0.08229356227237798,
        0.6537149985640699,
        0.927345926721354,
    ]
    for reference, k in zip(center_means, k_values):
        assert reference == pytest.approx(
            np.mean(cgram.cluster_centers[k]), rel=1e-6
        )

    # PCA-weighted plot data is asserted empty before the first weighted plot.
    assert cgram.plot_data_pca.empty
    weighted = cgram.plot(pca_kwargs={"random_state": random_state})
    assert len(weighted.get_children()) == 44

    # Likewise for the unweighted plot data.
    assert cgram.plot_data.empty
    unweighted = cgram.plot(pca_weighted=False)
    assert len(unweighted.get_children()) == 44

    pca_mean = cgram.plot_data_pca.mean().mean()
    assert pca_mean == pytest.approx(-1.9629843968429452, rel=1e-15)
    plain_mean = cgram.plot_data.mean().mean()
    assert plain_mean == pytest.approx(1.3321040444661392, rel=1e-15)
def test_bic():
    """Check the ``bic`` attribute of a fitted ``gmm`` Clustergram.

    With ``bic=True`` the attribute must equal the reference Series indexed
    by k; with ``bic=False`` the attribute must not exist after fitting.
    """
    with_bic = Clustergram(
        range(1, 8),
        backend="sklearn",
        method="gmm",
        random_state=random_state,
        bic=True,
    )
    with_bic.fit(data)
    reference = pd.Series(
        [
            1226.7924019554766,
            948.6374834781362,
            800.1788609508928,
            687.5987056807201,
            497.2770114251739,
            402.1340827435864,
            306.6669136240255,
        ],
        index=range(1, 8),
    )
    assert_series_equal(reference, with_bic.bic, rtol=1e-6)

    without_bic = Clustergram(
        range(1, 8),
        backend="sklearn",
        method="gmm",
        random_state=random_state,
        bic=False,
    )
    without_bic.fit(data)
    assert not hasattr(without_bic, "bic")
@pytest.mark.skipif(not RAPIDS, reason="RAPIDS not available.")
def test_cuml_kmeans():
    """Fit K-Means through the cuML backend on two input types.

    The same blobs are fitted once wrapped in a ``cudf.DataFrame`` and once
    as returned by ``cuml.make_blobs``. Both runs are checked against the
    same reference values for labels, center means and plot output.
    """
    seed = 0
    blobs, _ = cuml.make_blobs(
        n_samples=10,
        n_features=2,
        centers=5,
        random_state=seed,
        cluster_std=0.1,
    )
    k_values = range(1, 8)
    # Reference mean of the cluster centers per k, shared by both runs.
    center_means = [
        0.9148379012942314,
        1.0465015769004822,
        0.9405179619789124,
        0.8763175010681152,
        1.5546628013253212,
        1.2617384965221086,
        0.7542384501014437,
    ]

    # cudf.DataFrame
    cgram = Clustergram(k_values, backend="cuML", random_state=seed)
    cgram.fit(cudf.DataFrame(blobs))

    for k in k_values:
        assert cgram.labels[k].nunique() == k
    assert cgram.labels.shape == (10, 7)
    assert cgram.labels.notna().all().all()

    for reference, k in zip(center_means, k_values):
        assert reference == pytest.approx(
            float(cgram.cluster_centers[k].mean().mean()), rel=1e-6
        )

    assert cgram.plot_data_pca.empty
    weighted = cgram.plot(pca_kwargs={"random_state": seed})
    assert len(weighted.get_children()) == 46

    assert cgram.plot_data.empty
    unweighted = cgram.plot(pca_weighted=False)
    assert len(unweighted.get_children()) == 46

    pca_mean = cgram.plot_data_pca.mean().mean()
    assert pca_mean == pytest.approx(1.344412697695078, rel=1e-10)
    plain_mean = cgram.plot_data.mean().mean()
    assert plain_mean == pytest.approx(0.9148379244974681, rel=1e-10)

    # cupy array
    cgram = Clustergram(k_values, backend="cuML", random_state=seed)
    cgram.fit(blobs)

    for k in k_values:
        assert cgram.labels[k].nunique() == k
    assert cgram.labels.shape == (10, 7)
    assert cgram.labels.notna().all().all()

    for reference, k in zip(center_means, k_values):
        assert reference == pytest.approx(
            float(cp.mean(cgram.cluster_centers[k])), rel=1e-6
        )

    assert cgram.plot_data_pca.empty
    weighted = cgram.plot(pca_kwargs={"random_state": seed})
    assert len(weighted.get_children()) == 46

    assert cgram.plot_data.empty
    unweighted = cgram.plot(pca_weighted=False)
    assert len(unweighted.get_children()) == 46

    # The array run uses a looser tolerance than the DataFrame run above.
    pca_mean = cgram.plot_data_pca.mean().mean()
    assert pca_mean == pytest.approx(1.344412697695078, rel=1e-6)
    plain_mean = cgram.plot_data.mean().mean()
    assert plain_mean == pytest.approx(0.9148379244974681, rel=1e-6)
def test_hierarchical():
    """Fit the ``hierarchical`` method (default backend) for k = 1..7.

    Pins the label table, the per-k mean of the cluster centers, the number
    of children of the object returned by ``plot`` and the overall mean of
    the plot data, both PCA-weighted and unweighted.
    """
    k_values = range(1, 8)
    cgram = Clustergram(k_values, method="hierarchical")
    cgram.fit(data)

    # Label table: column k holds exactly k distinct labels, no missing values.
    for k in k_values:
        assert cgram.labels[k].nunique() == k
    assert cgram.labels.shape == (100, 7)
    assert cgram.labels.notna().all().all()

    # Reference mean of all cluster-center coordinates, one value per k.
    center_means = [
        1.4398916223315354,
        -2.8092483392658374,
        -0.7499055624802712,
        0.28659658912247143,
        0.7961494117071617,
        0.9155105021035381,
        1.023865734768007,
    ]
    for reference, k in zip(center_means, k_values):
        assert reference == pytest.approx(
            np.mean(cgram.cluster_centers[k]), rel=1e-12
        )

    # PCA-weighted plot data is asserted empty before the first weighted plot.
    assert cgram.plot_data_pca.empty
    weighted = cgram.plot(pca_kwargs={"random_state": random_state})
    assert len(weighted.get_children()) == 44

    # Likewise for the unweighted plot data.
    assert cgram.plot_data.empty
    unweighted = cgram.plot(pca_weighted=False)
    assert len(unweighted.get_children()) == 44

    pca_mean = cgram.plot_data_pca.mean().mean()
    assert pca_mean == pytest.approx(-2.0952779532051142, rel=1e-15)
    plain_mean = cgram.plot_data.mean().mean()
    assert plain_mean == pytest.approx(1.4398916223315354, rel=1e-15)
def test_hierarchical_array():
    """Fit the ``hierarchical`` method on a plain array for k = 1..9.

    The input is ``data.values`` rather than the DataFrame, and ``k_range``
    is passed by keyword; only the label table is checked.
    """
    k_values = range(1, 10)
    cgram = Clustergram(method="hierarchical", k_range=k_values)
    cgram.fit(data.values)

    label_table = cgram.labels
    for k in k_values:
        assert label_table[k].nunique() == k
    assert label_table.shape == (100, 9)
    assert label_table.notna().all().all()
def test_errors():
    """Check that invalid constructor arguments raise ``ValueError``.

    Covers an unknown backend, an unknown method, several method/backend
    combinations, and a call without any arguments.
    """
    # Unknown backend name.
    with pytest.raises(ValueError):
        Clustergram(range(1, 3), backend="nonsense")

    # Unknown method name.
    with pytest.raises(ValueError):
        Clustergram(range(1, 3), method="nonsense")

    # Method/backend combinations expected to be rejected.
    with pytest.raises(ValueError):
        Clustergram(range(1, 3), method="kmeans", backend="scipy")

    # The method is spelled "hierarchical" in the other tests of this file;
    # the previous "hieararchical" was a typo, so this case presumably only
    # repeated the unknown-method check instead of exercising the
    # hierarchical/sklearn combination.
    with pytest.raises(ValueError):
        Clustergram(range(1, 3), method="hierarchical", backend="sklearn")

    with pytest.raises(ValueError):
        Clustergram(range(1, 3), method="gmm", backend="cuML")

    # No arguments at all.
    with pytest.raises(ValueError):
        Clustergram()
def test_repr():
    """Check the string representation of an unfitted Clustergram.

    Extra keyword arguments (here ``n_init``) must show up under ``kwargs``.
    """
    cgram = Clustergram(range(1, 30), n_init=10)
    reference = (
        "Clustergram(k_range=range(1, 30), backend='sklearn', "
        "method='kmeans', kwargs={'n_init': 10})"
    )
    assert reference == repr(cgram)
def test_silhouette_score():
    """Check silhouette scores of a sklearn K-Means fit for k = 2..7.

    The value returned by ``silhouette_score()`` and the ``silhouette``
    attribute read afterwards must both equal the same reference Series.
    """
    cgram = Clustergram(range(1, 8), backend="sklearn", random_state=random_state)
    cgram.fit(data)

    reference = pd.Series(
        [
            0.6754810302465651,
            0.6277858262368159,
            0.6728079183937916,
            0.7092450515302072,
            0.8001963572359172,
            0.8798871538184535,
        ],
        index=list(range(2, 8)),
        name="silhouette_score",
    )
    # Call the method first, then read the attribute.
    assert_series_equal(cgram.silhouette_score(), reference)
    assert_series_equal(cgram.silhouette, reference)
@pytest.mark.skipif(not RAPIDS, reason="RAPIDS not available.")
def test_silhouette_score_cuml():
    """Check silhouette scores of a cuML K-Means fit on two input types.

    The same blobs are fitted once wrapped in a ``cudf.DataFrame`` and once
    as returned by ``cuml.make_blobs``. Both runs must match the same
    reference Series; dtype is not compared.
    """
    seed = 0
    blobs, _ = cuml.make_blobs(
        n_samples=10,
        n_features=2,
        centers=5,
        random_state=seed,
        cluster_std=0.1,
    )
    reference = pd.Series(
        [0.5359467, 0.5933514, 0.7809184, 0.8807362, 0.68701756, 0.4919311],
        index=list(range(2, 8)),
        name="silhouette_score",
    )

    # cudf.DataFrame input
    cgram = Clustergram(range(1, 8), backend="cuML", random_state=seed)
    cgram.fit(cudf.DataFrame(blobs))
    assert_series_equal(cgram.silhouette_score(), reference, check_dtype=False)

    # array input as produced by cuml.make_blobs
    cgram = Clustergram(range(1, 8), backend="cuML", random_state=seed)
    cgram.fit(blobs)
    assert_series_equal(cgram.silhouette_score(), reference, check_dtype=False)
def test_calinski_harabasz_score():
    """Check Calinski-Harabasz scores of a sklearn K-Means fit for k = 2..7.

    The value returned by ``calinski_harabasz_score()`` and the
    ``calinski_harabasz`` attribute read afterwards must both equal the same
    reference Series.
    """
    cgram = Clustergram(range(1, 8), backend="sklearn", random_state=random_state)
    cgram.fit(data)

    reference = pd.Series(
        [
            114.18545531981596,
            259.8218744719872,
            446.25054149041324,
            586.3857013614834,
            916.5220549808022,
            1689.4091019412879,
        ],
        index=list(range(2, 8)),
        name="calinski_harabasz_score",
    )
    # Call the method first, then read the attribute.
    assert_series_equal(cgram.calinski_harabasz_score(), reference)
    assert_series_equal(cgram.calinski_harabasz, reference)
@pytest.mark.skipif(not RAPIDS, reason="RAPIDS not available.")
def test_calinski_harabasz_score_cuml():
    """Check Calinski-Harabasz scores of a cuML K-Means fit on two input types.

    The same blobs are fitted once wrapped in a ``cudf.DataFrame`` and once
    as returned by ``cuml.make_blobs``. Both runs must match the same
    reference Series.
    """
    seed = 0
    blobs, _ = cuml.make_blobs(
        n_samples=10,
        n_features=2,
        centers=5,
        random_state=seed,
        cluster_std=0.1,
    )
    reference = pd.Series(
        [
            14.884236661408588,
            18.993060869559063,
            25.53897801880369,
            10495.855575243557,
            10895.935616041483,
            10449.035861758717,
        ],
        index=list(range(2, 8)),
        name="calinski_harabasz_score",
    )

    # cudf.DataFrame input
    cgram = Clustergram(range(1, 8), backend="cuML", random_state=seed)
    cgram.fit(cudf.DataFrame(blobs))
    assert_series_equal(cgram.calinski_harabasz_score(), reference)

    # array input as produced by cuml.make_blobs
    cgram = Clustergram(range(1, 8), backend="cuML", random_state=seed)
    cgram.fit(blobs)
    assert_series_equal(cgram.calinski_harabasz_score(), reference)
def test_davies_bouldin_score():
    """Check Davies-Bouldin scores of a sklearn K-Means fit for k = 2..7.

    The value returned by ``davies_bouldin_score()`` and the
    ``davies_bouldin`` attribute read afterwards must both equal the same
    reference Series.
    """
    cgram = Clustergram(range(1, 8), backend="sklearn", random_state=random_state)
    cgram.fit(data)

    reference = pd.Series(
        [
            0.2945752391269888,
            0.5101512437048275,
            0.4762688744525792,
            0.4822529450245402,
            0.3533377436714937,
            0.21391254262995393,
        ],
        index=list(range(2, 8)),
        name="davies_bouldin_score",
    )
    # Call the method first, then read the attribute.
    assert_series_equal(cgram.davies_bouldin_score(), reference)
    assert_series_equal(cgram.davies_bouldin, reference)
@pytest.mark.skipif(not RAPIDS, reason="RAPIDS not available.")
def test_davies_bouldin_score_cuml():
    """Check Davies-Bouldin scores of a cuML K-Means fit on two input types.

    The same blobs are fitted once wrapped in a ``cudf.DataFrame`` and once
    as returned by ``cuml.make_blobs``. Both runs must match the same
    reference Series.
    """
    seed = 0
    blobs, _ = cuml.make_blobs(
        n_samples=10,
        n_features=2,
        centers=5,
        random_state=seed,
        cluster_std=0.1,
    )
    reference = pd.Series(
        [
            0.67477383902307,
            0.7673811855139047,
            0.4520342597085474,
            0.02258593626130912,
            0.01451002792630246,
            0.00967011650130667,
        ],
        index=list(range(2, 8)),
        name="davies_bouldin_score",
    )

    # cudf.DataFrame input
    cgram = Clustergram(range(1, 8), backend="cuML", random_state=seed)
    cgram.fit(cudf.DataFrame(blobs))
    assert_series_equal(cgram.davies_bouldin_score(), reference)

    # array input as produced by cuml.make_blobs
    cgram = Clustergram(range(1, 8), backend="cuML", random_state=seed)
    cgram.fit(blobs)
    assert_series_equal(cgram.davies_bouldin_score(), reference)
def test_from_data_mean():
    """Build a Clustergram with ``from_data`` using its default method.

    Three observations with four features are combined with a hand-written
    label table for k = 1..3; plot output and plot-data means are pinned.
    """
    observations = np.array([[-1, -1, 0, 10], [1, 1, 10, 2], [0, 0, 20, 4]])
    label_table = pd.DataFrame({1: [0, 0, 0], 2: [0, 0, 1], 3: [0, 2, 1]})
    cgram = Clustergram.from_data(observations, label_table)

    # PCA-weighted plot data is asserted empty before the first weighted plot.
    assert cgram.plot_data_pca.empty
    weighted = cgram.plot(pca_kwargs={"random_state": random_state})
    assert len(weighted.get_children()) == 18

    # Likewise for the unweighted plot data.
    assert cgram.plot_data.empty
    unweighted = cgram.plot(pca_weighted=False)
    assert len(unweighted.get_children()) == 18

    pca_mean = cgram.plot_data_pca.mean().mean()
    assert pca_mean == pytest.approx(-7.820673888000655, rel=1e-15)
    plain_mean = cgram.plot_data.mean().mean()
    assert plain_mean == pytest.approx(3.8333333333333335, rel=1e-15)
def test_from_data_median():
    """Build a Clustergram with ``from_data`` and ``method="median"``.

    Same observations and label table as the default-method test; plot
    output and plot-data means are pinned to the median-based references.
    """
    observations = np.array([[-1, -1, 0, 10], [1, 1, 10, 2], [0, 0, 20, 4]])
    label_table = pd.DataFrame({1: [0, 0, 0], 2: [0, 0, 1], 3: [0, 2, 1]})
    cgram = Clustergram.from_data(observations, label_table, method="median")

    # PCA-weighted plot data is asserted empty before the first weighted plot.
    assert cgram.plot_data_pca.empty
    weighted = cgram.plot(pca_kwargs={"random_state": random_state})
    assert len(weighted.get_children()) == 18

    # Likewise for the unweighted plot data.
    assert cgram.plot_data.empty
    unweighted = cgram.plot(pca_weighted=False)
    assert len(unweighted.get_children()) == 18

    pca_mean = cgram.plot_data_pca.mean().mean()
    assert pca_mean == pytest.approx(-7.958519683972767, rel=1e-15)
    plain_mean = cgram.plot_data.mean().mean()
    assert plain_mean == pytest.approx(3.7222222222222228, rel=1e-15)
def test_from_data_nonsense():
    """``from_data`` must raise ``ValueError`` for an unsupported method name."""
    observations = np.array([[-1, -1, 0, 10], [1, 1, 10, 2], [0, 0, 20, 4]])
    label_table = pd.DataFrame({1: [0, 0, 0], 2: [0, 0, 1], 3: [0, 2, 1]})

    expected_message = "'nonsense' is not supported."
    with pytest.raises(ValueError, match=expected_message):
        Clustergram.from_data(observations, label_table, method="nonsense")
def test_from_data_index():
    """Smoke-test ``from_data`` with data carrying a non-default index.

    The data frame is indexed by strings while the label table keeps its
    default index. Both plot variants are drawn for the default method and
    for ``method="median"``; nothing about the output is asserted.
    """
    frame = pd.DataFrame(
        np.array([[-1, -1, 0, 10], [1, 1, 10, 2], [0, 0, 20, 4]]),
        index=["a", "b", "c"],
    )
    label_table = pd.DataFrame({1: [0, 0, 0], 2: [0, 0, 1], 3: [0, 2, 1]})

    # First the default method, then the median.
    for extra in ({}, {"method": "median"}):
        cgram = Clustergram.from_data(frame, label_table, **extra)
        cgram.plot()
        cgram.plot(pca_weighted=False)
def test_from_centers():
    """Build a Clustergram with ``from_centers`` and check key validation.

    With matching keys the unweighted plot is drawn and pinned. With a label
    table keyed 2..4 against centers keyed 1..3, a ``ValueError`` is expected.
    """
    matching_labels = pd.DataFrame({1: [0, 0, 0], 2: [0, 0, 1], 3: [0, 2, 1]})
    centers = {
        1: np.array([[0, 0]]),
        2: np.array([[-1, -1], [1, 1]]),
        3: np.array([[-1, -1], [1, 1], [0, 0]]),
    }
    cgram = Clustergram.from_centers(centers, matching_labels)

    assert cgram.plot_data.empty
    unweighted = cgram.plot(pca_weighted=False)
    assert len(unweighted.get_children()) == 18
    plain_mean = cgram.plot_data.mean().mean()
    assert plain_mean == pytest.approx(-0.1111111111111111, rel=1e-15)

    # Label columns shifted by one, so they no longer match the center keys.
    shifted_labels = pd.DataFrame({2: [0, 0, 0], 3: [0, 0, 1], 4: [0, 2, 1]})
    fresh_centers = {
        1: np.array([[0, 0]]),
        2: np.array([[-1, -1], [1, 1]]),
        3: np.array([[-1, -1], [1, 1], [0, 0]]),
    }
    with pytest.raises(ValueError, match="'cluster_centers' keys do not match"):
        Clustergram.from_centers(fresh_centers, shifted_labels)
def test_from_centers_data():
    """Build a Clustergram with ``from_centers`` plus data; draw weighted plot.

    The data is passed as the third positional argument. The PCA-weighted
    plot is drawn and its child count and plot-data mean are pinned.
    """
    label_table = pd.DataFrame({1: [0, 0, 0], 2: [0, 0, 1], 3: [0, 2, 1]})
    centers = {
        1: np.array([[0, 0]]),
        2: np.array([[-1, -1], [1, 1]]),
        3: np.array([[-1, -1], [1, 1], [0, 0]]),
    }
    observations = np.array([[-1, -1], [1, 1], [0, 0]])
    cgram = Clustergram.from_centers(centers, label_table, observations)

    assert cgram.plot_data_pca.empty
    weighted = cgram.plot(pca_weighted=True)
    assert len(weighted.get_children()) == 18
    pca_mean = cgram.plot_data_pca.mean().mean()
    assert pca_mean == pytest.approx(-0.15713484026367722, rel=1e-15)
def test_bokeh():
    """Check the serialized output of ``bokeh()`` for a sklearn K-Means fit.

    The figure is produced once with ``pca_kwargs`` and once with
    ``pca_weighted=False``. Each result is passed through ``json_item`` and
    its string form is checked for the number of "data" occurrences and for
    the expected field names.
    """
    cgram = Clustergram(range(1, 8), backend="sklearn", random_state=random_state)
    cgram.fit(data)

    call_variants = (
        {"pca_kwargs": {"random_state": random_state}},
        {"pca_weighted": False},
    )
    for options in call_variants:
        figure = cgram.bokeh(**options)
        serialized = str(json_item(figure, "clustergram"))
        assert serialized.count("data") == 60
        for field in ("cluster_labels", "count", "ratio", "size"):
            assert field in serialized
@pytest.mark.skipif(not RAPIDS, reason="RAPIDS not available.")
def test_bokeh_cuml():
    """Check the serialized output of ``bokeh()`` for a cuML K-Means fit.

    The figure is passed through ``json_item`` and its string form is
    checked for the number of "data" occurrences and for the expected field
    names.
    """
    seed = 0
    blobs, _ = cuml.make_blobs(
        n_samples=10,
        n_features=2,
        centers=5,
        random_state=seed,
        cluster_std=0.1,
    )
    cgram = Clustergram(range(1, 8), backend="cuML", random_state=seed)
    cgram.fit(cudf.DataFrame(blobs))

    figure = cgram.bokeh()
    serialized = str(json_item(figure, "clustergram"))
    assert serialized.count("data") == 58
    for field in ("cluster_labels", "count", "ratio", "size"):
        assert field in serialized