File size: 4,917 Bytes
9c6594c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""Define plots for clustering models built with scikit-learn."""

from warnings import simplefilter

import pandas as pd
import sklearn

import wandb
from wandb.integration.sklearn import calculate, utils

# ignore all future warnings
simplefilter(action="ignore", category=FutureWarning)


def clusterer(model, X_train, cluster_labels, labels=None, model_name="Clusterer"):  # noqa: N803
    """Generate all sklearn clusterer plots supported by W&B.

    An elbow curve is logged only when `model` is a `sklearn.cluster.KMeans`
    instance; a silhouette plot is requested for every clusterer.

    Should only be called with a fitted clusterer.

    Args:
        model: (clusterer) Takes in a fitted clusterer.
        X_train: (arr) Training set features.
        cluster_labels: (list) Names for cluster labels. Makes plots easier to read
                            by replacing cluster indexes with corresponding names.
        labels: (list) Named labels for target variable (y). Makes plots easier to
                        read by replacing target values with corresponding index.
                        For example if `labels=['dog', 'cat', 'owl']` all 0s are
                        replaced by dog, 1s by cat. Forwarded to `silhouette`
                        for every clusterer type.
        model_name: (str) Model name. Defaults to 'Clusterer'

    Returns:
        None: To see plots, go to your W&B run page then expand the 'media' tab
              under 'auto visualizations'.

    Example:
    ```python
    wandb.sklearn.plot_clusterer(kmeans, X, cluster_labels, labels, "KMeans")
    ```
    """
    wandb.termlog(f"\nPlotting {model_name}.")

    is_kmeans = isinstance(model, sklearn.cluster.KMeans)
    if is_kmeans:
        # The elbow curve is only produced for KMeans models.
        elbow_curve(model, X_train)
        wandb.termlog("Logged elbow curve.")

    # Forward `labels` regardless of the clusterer type; previously the
    # non-KMeans path silently dropped the caller's `labels` argument.
    silhouette(model, X_train, cluster_labels, labels=labels, kmeans=is_kmeans)

    wandb.termlog("Logged silhouette plot.")


def elbow_curve(
    clusterer=None,
    X=None,  # noqa: N803
    cluster_ranges=None,
    n_jobs=1,
    show_cluster_time=True,
):
    """Measures and plots variance explained as a function of the number of clusters.

    Useful in picking the optimal number of clusters.

    Should only be called with a fitted clusterer. If the clusterer has no
    `n_clusters` attribute, a message is logged and nothing is plotted. If any
    of the missing-value, type, or fitted checks fails, nothing is logged.

    NOTE(review): earlier documentation stated that this function fits the
    model on the training set when called; any fitting would happen inside
    `calculate.elbow_curve` -- confirm against that helper.

    Args:
        clusterer: (clusterer) Takes in a fitted clusterer.
        X: (arr) Training set features.
        cluster_ranges: Passed through unchanged to `calculate.elbow_curve`;
            presumably the candidate numbers of clusters to evaluate
            (TODO confirm). Defaults to None.
        n_jobs: (int) Passed through unchanged to `calculate.elbow_curve`;
            presumably the number of parallel jobs (TODO confirm).
            Defaults to 1.
        show_cluster_time: (bool) Passed through unchanged to
            `calculate.elbow_curve`; presumably toggles plotting of the
            clustering duration (TODO confirm). Defaults to True.

    Returns:
        None: To see plots, go to your W&B run page then expand the 'media' tab
              under 'auto visualizations'.

    Example:
    ```python
    wandb.sklearn.plot_elbow_curve(model, X_train)
    ```
    """
    if not hasattr(clusterer, "n_clusters"):
        wandb.termlog(
            "n_clusters attribute not in classifier. Cannot plot elbow method."
        )
        return

    not_missing = utils.test_missing(clusterer=clusterer)
    # Actually invoke the type check. The bare function object
    # `utils.test_types` is always truthy, which silently disabled it.
    correct_types = utils.test_types(clusterer=clusterer)
    is_fitted = utils.test_fitted(clusterer)

    if not_missing and correct_types and is_fitted:
        elbow_curve_chart = calculate.elbow_curve(
            clusterer, X, cluster_ranges, n_jobs, show_cluster_time
        )

        wandb.log({"elbow_curve": elbow_curve_chart})


def silhouette(
    clusterer=None,
    X=None,  # noqa: N803
    cluster_labels=None,
    labels=None,
    metric="euclidean",
    kmeans=True,
):
    """Compute silhouette coefficients and log them as a W&B chart.

    A coefficient close to +1 means the sample sits far from neighboring
    clusters; a value around 0 means it lies on or near the boundary between
    two clusters; a negative value suggests the sample may have been assigned
    to the wrong cluster.

    The clusterer is first run through the missing-value, type, and fitted
    checks from `utils`; when any of them fails nothing is logged. A pandas
    DataFrame passed as `X` is converted to its underlying values before the
    chart is computed.

    Should only be called with a fitted clusterer.

    Args:
        clusterer: (clusterer) Takes in a fitted clusterer.
        X: (arr) Training set features.
        cluster_labels: (list) Names for cluster labels. Makes plots easier to read
                               by replacing cluster indexes with corresponding names.
        labels: (list) Forwarded unchanged to `calculate.silhouette`.
        metric: (str) Forwarded unchanged to `calculate.silhouette`.
                      Defaults to 'euclidean'.
        kmeans: (bool) Forwarded unchanged to `calculate.silhouette`.
                       Defaults to True.

    Returns:
        None: To see plots, go to your W&B run page then expand the 'media' tab
              under 'auto visualizations'.

    Example:
    ```python
    wandb.sklearn.plot_silhouette(model, X_train, ["spam", "not spam"])
    ```
    """
    # Run every check up front (in this order) so each one gets the chance to
    # report its own problem, then bail out if any of them failed.
    checks = [
        utils.test_missing(clusterer=clusterer),
        utils.test_types(clusterer=clusterer),
        utils.test_fitted(clusterer),
    ]
    if not all(checks):
        return

    # Hand the calculation a plain array when a DataFrame was supplied.
    features = X.values if isinstance(X, pd.DataFrame) else X

    chart = calculate.silhouette(
        clusterer, features, cluster_labels, labels, metric, kmeans
    )
    wandb.log({"silhouette_plot": chart})