# (Hosting-page residue kept for reference; not part of the module.)
# jamtur01's picture
# Upload folder using huggingface_hub
# 9c6594c verified
"""Define plots for clustering models built with scikit-learn."""
from warnings import simplefilter
import pandas as pd
import sklearn
import wandb
from wandb.integration.sklearn import calculate, utils
# Ignore every FutureWarning. This filter is installed when the module is
# imported and warning filters are process-wide, so it also silences
# FutureWarnings raised by code outside this module.
simplefilter(action="ignore", category=FutureWarning)
def clusterer(model, X_train, cluster_labels, labels=None, model_name="Clusterer"):  # noqa: N803
    """Generate all sklearn clusterer plots supported by W&B.

    The following plots are generated:
        elbow curve (KMeans models only), silhouette plot.

    Intended to be called with a fitted clusterer; the plotting helpers run
    their own validation before logging anything.

    Args:
        model: (clusterer) Takes in a fitted clusterer.
        X_train: (arr) Training set features.
        cluster_labels: (list) Names for cluster labels. Makes plots easier to read
            by replacing cluster indexes with corresponding names.
        labels: (list) Named labels for target variable (y). Makes plots easier to
            read by replacing target values with corresponding index.
            For example if `labels=['dog', 'cat', 'owl']` all 0s are
            replaced by dog, 1s by cat. Forwarded to the silhouette plot for
            every model type.
        model_name: (str) Model name. Defaults to 'Clusterer'

    Returns:
        None: To see plots, go to your W&B run page then expand the 'media' tab
            under 'auto visualizations'.

    Example:
    ```python
    wandb.sklearn.plot_clusterer(kmeans, X, cluster_labels, labels, "KMeans")
    ```
    """
    wandb.termlog(f"\nPlotting {model_name}.")

    # The elbow curve is only produced for KMeans models.
    is_kmeans = isinstance(model, sklearn.cluster.KMeans)
    if is_kmeans:
        elbow_curve(model, X_train)
        wandb.termlog("Logged elbow curve.")

    # Always forward `labels`; previously the non-KMeans path dropped them,
    # so the documented argument had no effect for other clusterers.
    silhouette(model, X_train, cluster_labels, labels=labels, kmeans=is_kmeans)
    wandb.termlog("Logged silhouette plot.")
def elbow_curve(
    clusterer=None,
    X=None,  # noqa: N803
    cluster_ranges=None,
    n_jobs=1,
    show_cluster_time=True,
):
    """Measure and plot variance explained as a function of the number of clusters.

    Useful in picking the optimal number of clusters.

    The chart is only computed and logged when the clusterer exposes an
    `n_clusters` attribute and passes the missing-value, type and fitted
    checks from `utils`; otherwise nothing is logged to the run.

    NOTE(review): earlier documentation stated that the model is fitted on the
    training set when this is called. Any fitting would happen inside
    `calculate.elbow_curve` -- confirm there.

    Args:
        clusterer: (clusterer) Takes in a fitted clusterer.
        X: (arr) Training set features.
        cluster_ranges: Forwarded unchanged to `calculate.elbow_curve`;
            presumably the cluster counts to evaluate -- confirm there.
        n_jobs: (int) Forwarded unchanged to `calculate.elbow_curve`;
            presumably the number of parallel jobs -- confirm there.
        show_cluster_time: (bool) Forwarded unchanged to
            `calculate.elbow_curve`; presumably toggles a clustering-time
            series on the chart -- confirm there.

    Returns:
        None: To see plots, go to your W&B run page then expand the 'media' tab
            under 'auto visualizations'.

    Example:
    ```python
    wandb.sklearn.plot_elbow_curve(model, X_train)
    ```
    """
    if not hasattr(clusterer, "n_clusters"):
        wandb.termlog(
            "n_clusters attribute not in classifier. Cannot plot elbow method."
        )
        return

    not_missing = utils.test_missing(clusterer=clusterer)
    # Actually invoke the type check. The bare function object that was
    # assigned here before is always truthy, so the check never ran.
    correct_types = utils.test_types(clusterer=clusterer)
    is_fitted = utils.test_fitted(clusterer)

    if not_missing and correct_types and is_fitted:
        elbow_curve_chart = calculate.elbow_curve(
            clusterer, X, cluster_ranges, n_jobs, show_cluster_time
        )
        wandb.log({"elbow_curve": elbow_curve_chart})
def silhouette(
    clusterer=None,
    X=None,  # noqa: N803
    cluster_labels=None,
    labels=None,
    metric="euclidean",
    kmeans=True,
):
    """Compute silhouette coefficients and log them as a W&B chart.

    A silhouette coefficient near +1 means the sample lies far from the
    neighboring clusters. A value near 0 means the sample sits on or very
    close to the decision boundary between two neighboring clusters, and a
    negative value suggests the sample may have been assigned to the wrong
    cluster.

    The chart is logged under the key "silhouette_plot", and only when the
    clusterer passes the missing-value, type and fitted checks from `utils`.

    Args:
        clusterer: (clusterer) Takes in a fitted clusterer.
        X: (arr) Training set features. A pandas DataFrame is converted to
            its underlying array (`.values`) before the chart is computed.
        cluster_labels: (list) Names for cluster labels. Makes plots easier to read
            by replacing cluster indexes with corresponding names.
        labels: (list) Forwarded unchanged to `calculate.silhouette`.
        metric: (str) Forwarded unchanged to `calculate.silhouette`.
            Defaults to 'euclidean'.
        kmeans: (bool) Forwarded unchanged to `calculate.silhouette`;
            presumably enables KMeans-specific output -- confirm there.

    Returns:
        None: To see plots, go to your W&B run page then expand the 'media' tab
            under 'auto visualizations'.

    Example:
    ```python
    wandb.sklearn.plot_silhouette(model, X_train, ["spam", "not spam"])
    ```
    """
    # Run every validator, in this order, before deciding anything: each one
    # is evaluated even when an earlier one has already failed.
    checks = (
        utils.test_missing(clusterer=clusterer),
        utils.test_types(clusterer=clusterer),
        utils.test_fitted(clusterer),
    )
    if not all(checks):
        return

    # Work on the raw array when the features arrive as a DataFrame.
    features = X.values if isinstance(X, pd.DataFrame) else X

    chart = calculate.silhouette(
        clusterer, features, cluster_labels, labels, metric, kmeans
    )
    wandb.log({"silhouette_plot": chart})