Spaces:

kheejay88
/

unsupervised_clustering_app

Sleeping

App Files Files Community

unsupervised_clustering_app / app.py

kheejay88

Update app.py

87b9b04 verified 5 months ago

raw

history blame contribute delete

3.71 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.cluster import KMeans
	from sklearn.preprocessing import StandardScaler
	from sklearn.decomposition import PCA
	from datasets import load_dataset

	# Page heading shown at the top of the app.
	st.title("Unsupervised Data Clustering App")

	# Collapsible summary of what the app does. The text is assembled first and
	# then written through the expander container itself.
	about_text = "".join(
	    (
	        "This app allows you to upload any type of unlabeled dataset ",
	        "and automatically clusters the data using K-means clustering. ",
	        "It visualizes the clusters using PCA and provides time series and cluster distribution plots ",
	        "to help you identify patterns and groupings within your data.",
	    )
	)
	st.expander("About this App").write(about_text)

	# Session-state key under which the example dataset survives Streamlit reruns.
	EXAMPLE_DF_KEY = "example_df"

	# Column descriptions for the bundled example dataset. Kept as a constant so
	# the multi-line literal stays out of the indented UI code below.
	EXAMPLE_COLUMNS_HELP = """
	country – Name of the country\n
	child_mort – Death of children under 5 years of age per 1000 live births\n
	exports – Exports of goods and services per capita (as a percentage of GDP)\n
	health – Total health spending per capita (as a percentage of GDP)\n
	imports – Imports of goods and services per capita (as a percentage of GDP)\n
	income – Net income per person\n
	inflation – Annual inflation rate (percentage)\n
	life_expec – Average life expectancy at birth (in years)\n
	total_fer – Total fertility rate (average number of children per woman)\n
	gdpp – GDP per capita\n
	"""

	# File uploader
	uploaded_file = st.file_uploader("Upload Custom CSV file", type=["csv"])

	# Example demo dataset.
	# st.button() is True only during the single rerun triggered by the click, so
	# the loaded frame is stored in st.session_state. Without that, the example
	# data (and every plot) would disappear as soon as the user touched the
	# cluster slider or the column select box further down.
	if st.button("Test With An Example Dataset"):
	    dataset = load_dataset('kheejay88/country_data', split='train')
	    st.session_state[EXAMPLE_DF_KEY] = pd.DataFrame(dataset)
	    st.success("Loaded example dataset from Hugging Face.")

	# Choose the active dataset: an uploaded CSV takes precedence over the example.
	df = None
	if uploaded_file is not None:
	    try:
	        df = pd.read_csv(uploaded_file)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
	        # Report unreadable files in the UI instead of crashing with a traceback.
	        st.error(f"Could not read the uploaded CSV file: {exc}")
	elif EXAMPLE_DF_KEY in st.session_state:
	    df = st.session_state[EXAMPLE_DF_KEY]
	    with st.expander("Dataset Columns"):
	        st.write(EXAMPLE_COLUMNS_HELP)

	if df is not None:
	    # Keep only numeric/boolean columns. Selecting a new frame (rather than
	    # dropping in place) also leaves the cached example frame untouched, and
	    # excludes datetime columns that StandardScaler cannot handle.
	    features = df.select_dtypes(include=[np.number, "bool"])
	    st.write("### Raw Data:")
	    st.write(features.head())

	    # K-Means and PCA reject NaN and infinite values, so cluster complete rows only.
	    complete = features.replace([np.inf, -np.inf], np.nan).dropna()
	    dropped_rows = len(features) - len(complete)
	    if dropped_rows:
	        st.warning(f"Ignored {dropped_rows} row(s) containing missing or infinite values.")

	    if complete.shape[1] < 2:
	        # PCA below projects onto two components, which needs two features.
	        st.error("The dataset needs at least two numeric columns for clustering and PCA.")
	    elif len(complete) < 3:
	        # The slider offers 2..max clusters and K-Means needs n_samples >= n_clusters.
	        st.error("The dataset needs at least three complete rows for clustering.")
	    else:
	        # Preprocessing: standardise every feature to zero mean / unit variance.
	        scaled_data = StandardScaler().fit_transform(complete)

	        # User input for clusters, capped so it never exceeds the number of rows.
	        max_clusters = min(10, len(complete))
	        num_clusters = st.slider(
	            "Select number of clusters",
	            min_value=2,
	            max_value=max_clusters,
	            value=min(3, max_clusters),
	        )

	        # K-Means clustering. n_init is pinned so results do not depend on the
	        # installed scikit-learn version's default.
	        kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
	        feature_cols = complete.columns.tolist()
	        clustered = complete.copy()
	        clustered['Cluster'] = kmeans.fit_predict(scaled_data)

	        # PCA for visualization: project the scaled features onto two components.
	        pca_data = PCA(n_components=2).fit_transform(scaled_data)
	        clustered['PCA1'] = pca_data[:, 0]
	        clustered['PCA2'] = pca_data[:, 1]

	        # Plot clusters in PCA space.
	        st.write("### Cluster Visualization:")
	        fig, ax = plt.subplots()
	        sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=clustered, palette='viridis', ax=ax)
	        st.pyplot(fig)
	        plt.close(fig)  # release the figure; the script reruns on every interaction

	        # Time series plot: one line per cluster, plotted against the row index.
	        # Only the original feature columns are offered, not the derived
	        # Cluster/PCA1/PCA2 columns.
	        selected_col = st.selectbox("Select column for time series visualization", feature_cols)
	        st.write("### Time Series Plot:")
	        fig, ax = plt.subplots()
	        for cluster in sorted(clustered['Cluster'].unique()):
	            cluster_data = clustered[clustered['Cluster'] == cluster]
	            ax.plot(cluster_data.index, cluster_data[selected_col], label=f'Cluster {cluster}')
	        ax.legend()
	        st.pyplot(fig)
	        plt.close(fig)

	        # Cluster distribution: number of rows assigned to each cluster.
	        st.write("### Cluster Distribution:")
	        fig, ax = plt.subplots()
	        sns.countplot(x='Cluster', data=clustered, palette='viridis', ax=ax)
	        st.pyplot(fig)
	        plt.close(fig)

	# Footer: a horizontal rule followed by a short sign-off, each rendered as
	# its own markdown element.
	for footer_markdown in ("---", "Thanks!"):
	    st.markdown(footer_markdown)