|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from sklearn.cluster import KMeans |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.decomposition import PCA |
|
from datasets import load_dataset |
|
|
|
|
|
# Page heading rendered at the top of the app.
st.title("Unsupervised Data Clustering App")

# Short description of the app, assembled once and shown inside a
# collapsible "About" panel.
about_text = "".join(
    (
        "This app allows you to upload any type of unlabeled dataset ",
        "and automatically clusters the data using K-means clustering. ",
        "It visualizes the clusters using PCA and provides time series and cluster distribution plots ",
        "to help you identify patterns and groupings within your data.",
    )
)

with st.expander("About this App"):
    st.write(about_text)
|
|
|
|
|
# --- Data loading ------------------------------------------------------------
# Streamlit re-executes this script from top to bottom on every widget
# interaction, and st.button() returns True only for the single rerun caused
# by the click itself.  The example dataset is therefore stored in
# st.session_state so that it (and everything derived from it) is still
# available when another widget, such as the cluster slider, triggers a rerun.
EXAMPLE_STATE_KEY = "example_df"

# Column name -> human-readable description for the example dataset.
EXAMPLE_COLUMN_HELP = {
    "country": "Name of the country",
    "child_mort": "Death of children under 5 years of age per 1000 live births",
    "exports": "Exports of goods and services per capita (as a percentage of GDP)",
    "health": "Total health spending per capita (as a percentage of GDP)",
    "imports": "Imports of goods and services per capita (as a percentage of GDP)",
    "income": "Net income per person",
    "inflation": "Annual inflation rate (percentage)",
    "life_expec": "Average life expectancy at birth (in years)",
    "total_fer": "Total fertility rate (average number of children per woman)",
    "gdpp": "GDP per capita",
}

uploaded_file = st.file_uploader("Upload Custom CSV file", type=["csv"])

if st.button("Test With An Example Dataset"):
    try:
        dataset = load_dataset('kheejay88/country_data', split='train')
    except Exception as exc:  # UI boundary: report the failure instead of crashing the app
        st.error(f"Could not load the example dataset: {exc}")
    else:
        st.session_state[EXAMPLE_STATE_KEY] = pd.DataFrame(dataset)
        st.success("Loaded example dataset from Hugging Face.")

# NOTE: `df` is deliberately left unbound when no data source is available;
# the rest of the script checks for the name's existence before using it.
if uploaded_file is not None:
    # An uploaded CSV always takes precedence over the example dataset.
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded CSV file: {exc}")
elif EXAMPLE_STATE_KEY in st.session_state:
    # Work on a copy so later in-place edits never touch the stored frame.
    df = st.session_state[EXAMPLE_STATE_KEY].copy()

    with st.expander("Dataset Columns"):
        # One markdown paragraph per column.
        st.write(
            "\n\n".join(
                f"**{column}** – {description}"
                for column, description in EXAMPLE_COLUMN_HELP.items()
            )
        )
|
|
|
def _show_figure(fig):
    """Render *fig* in the app, then close it.

    The script is re-executed on every interaction; without closing, each
    rerun would leave three more figures registered with pyplot.
    """
    st.pyplot(fig)
    plt.close(fig)


def _cluster_and_plot(data: pd.DataFrame) -> "pd.DataFrame | None":
    """Cluster the numeric columns of *data* with K-means and plot the result.

    Renders a preview of the numeric data, a cluster-count slider, a 2-D PCA
    scatter plot, a per-cluster line plot of one selectable column and a
    cluster-size bar chart.

    Args:
        data: The loaded dataset. It is not modified.

    Returns:
        A copy of the numeric feature columns with ``Cluster``, ``PCA1`` and
        ``PCA2`` columns added, or ``None`` when the data cannot be clustered
        (a warning is shown to the user in that case).
    """
    # Keep only columns the scaler can handle.  Selecting numeric/bool columns
    # (instead of dropping just object/category ones) also excludes datetime
    # and string-typed columns, which would otherwise crash the scaler.
    features = data.select_dtypes(include=[np.number, "bool"])

    st.write("### Raw Data:")
    st.write(features.head())

    # K-means cannot handle missing values; drop incomplete rows and say so.
    complete_rows = features.dropna()
    dropped = len(features) - len(complete_rows)
    if dropped:
        st.warning(f"Ignoring {dropped} row(s) with missing values.")
    features = complete_rows

    # A 2-component PCA needs at least two features and two samples.
    if features.shape[1] < 2 or len(features) < 2:
        st.warning(
            "At least two numeric columns and two complete rows are required "
            "to cluster and visualize the data."
        )
        return None

    # Captured before the derived columns are added so that Cluster/PCA1/PCA2
    # are not offered in the column selector below.
    feature_cols = features.columns.tolist()

    scaled_data = StandardScaler().fit_transform(features.astype(float))

    num_clusters = st.slider("Select number of clusters", min_value=2, max_value=10, value=3)
    if num_clusters > len(features):
        st.warning(
            f"Only {len(features)} row(s) available; "
            "choose a number of clusters no larger than that."
        )
        return None

    # n_init is pinned because its default differs between scikit-learn
    # versions; 10 restarts is the long-standing default.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    pca_data = PCA(n_components=2).fit_transform(scaled_data)

    result = features.copy()
    result['Cluster'] = kmeans.fit_predict(scaled_data)
    result['PCA1'] = pca_data[:, 0]
    result['PCA2'] = pca_data[:, 1]

    st.write("### Cluster Visualization:")
    fig, ax = plt.subplots()
    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=result, palette='viridis', ax=ax)
    _show_figure(fig)

    selected_col = st.selectbox("Select column for time series visualization", feature_cols)
    st.write("### Time Series Plot:")
    fig, ax = plt.subplots()
    # groupby yields clusters in sorted order, giving a stable legend.
    for cluster_id, members in result.groupby('Cluster'):
        ax.plot(members.index, members[selected_col], label=f'Cluster {cluster_id}')
    ax.legend()
    _show_figure(fig)

    st.write("### Cluster Distribution:")
    fig, ax = plt.subplots()
    sns.countplot(x='Cluster', data=result, palette='viridis', ax=ax)
    _show_figure(fig)

    return result


# `df` exists only when a dataset was loaded earlier in the script; the
# additional None check tolerates a loader that pre-initialises the name.
if 'df' in locals() and df is not None:
    clustered = _cluster_and_plot(df)
    if clustered is not None:
        # Keep the annotated frame under the name `df`, as before.
        df = clustered
|
|
|
# Page footer: a horizontal rule followed by a short closing note.
for footer_markdown in ("---", "**Thanks!**"):
    st.markdown(footer_markdown)
|
|