"""Streamlit app that clusters an unlabeled tabular dataset with K-means.

The user either uploads a CSV file or loads an example dataset from the
Hugging Face Hub. Numeric columns are standardized, clustered with K-means,
projected to two dimensions with PCA, and visualized.
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from datasets import load_dataset

# Hugging Face dataset used by the "example dataset" button.
EXAMPLE_DATASET = "kheejay88/country_data"

# Session-state key under which the example DataFrame is kept. A button is
# only "pressed" during the single rerun triggered by the click, so the data
# must be stored explicitly to survive later widget interactions.
EXAMPLE_STATE_KEY = "example_df"

# Markdown description of the example dataset's columns.
DATASET_COLUMNS_MD = """
**country** – Name of the country\n
**child_mort** – Death of children under 5 years of age per 1000 live births\n
**exports** – Exports of goods and services per capita (as a percentage of GDP)\n
**health** – Total health spending per capita (as a percentage of GDP)\n
**imports** – Imports of goods and services per capita (as a percentage of GDP)\n
**income** – Net income per person\n
**inflation** – Annual inflation rate (percentage)\n
**life_expec** – Average life expectancy at birth (in years)\n
**total_fer** – Total fertility rate (average number of children per woman)\n
**gdpp** – GDP per capita\n
"""


def _read_uploaded_csv(uploaded_file):
    """Parse the uploaded CSV, returning ``None`` (and showing an error) on failure."""
    try:
        return pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as err:
        st.error(f"Could not read the uploaded CSV file: {err}")
        return None


def _clean_features(numeric: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *numeric* that is safe to feed to the scaler and K-means.

    Infinite values are treated as missing. Columns that are entirely missing
    are dropped first (otherwise they would wipe out every row), then any row
    that still contains a missing value is dropped.
    """
    cleaned = numeric.replace([np.inf, -np.inf], np.nan)
    cleaned = cleaned.dropna(axis="columns", how="all")
    cleaned = cleaned.dropna(axis="index")
    return cleaned.copy()


def _show_figure(fig) -> None:
    """Render *fig* in the app and close it so figures do not pile up across reruns."""
    st.pyplot(fig)
    plt.close(fig)


def _render_clustering(df: pd.DataFrame) -> None:
    """Cluster the numeric columns of *df* and draw all result plots.

    The input frame is never modified; all work happens on a derived copy.
    """
    # Keep only columns the scaler can handle (numbers and booleans).
    numeric = df.select_dtypes(include=["number", "bool"])

    st.write("### Raw Data:")
    st.write(numeric.head())

    features = _clean_features(numeric)
    dropped_rows = len(numeric) - len(features)
    if dropped_rows:
        st.warning(
            f"Ignored {dropped_rows} row(s) containing missing or non-finite values."
        )

    # PCA below projects onto two components, which needs two features.
    if features.shape[1] < 2:
        st.error("At least two numeric columns with data are required for clustering.")
        return

    # User input for clusters
    num_clusters = st.slider(
        "Select number of clusters", min_value=2, max_value=10, value=3
    )

    # K-means cannot form more clusters than there are samples.
    if len(features) < num_clusters:
        st.error(
            f"The dataset has only {len(features)} usable row(s); "
            f"at least {num_clusters} are needed for {num_clusters} clusters."
        )
        return

    # Preprocessing, clustering and 2-D projection
    scaled_data = StandardScaler().fit_transform(features)
    clusters = KMeans(n_clusters=num_clusters, random_state=42).fit_predict(scaled_data)
    pca_data = PCA(n_components=2).fit_transform(scaled_data)

    # Remember the original feature names before derived columns are added,
    # so the column selector below only offers real data columns.
    feature_cols = features.columns.tolist()
    result = features.assign(
        Cluster=clusters,
        PCA1=pca_data[:, 0],
        PCA2=pca_data[:, 1],
    )

    # Plot Clusters
    st.write("### Cluster Visualization:")
    fig, ax = plt.subplots()
    sns.scatterplot(
        x='PCA1', y='PCA2', hue='Cluster', data=result, palette='viridis', ax=ax
    )
    _show_figure(fig)

    # Per-cluster line plot of one feature against the row index
    selected_col = st.selectbox(
        "Select column for time series visualization", feature_cols
    )
    st.write("### Time Series Plot:")
    fig, ax = plt.subplots()
    for cluster in sorted(result['Cluster'].unique()):
        cluster_data = result[result['Cluster'] == cluster]
        ax.plot(cluster_data.index, cluster_data[selected_col], label=f'Cluster {cluster}')
    ax.legend()
    _show_figure(fig)

    # Cluster distribution
    st.write("### Cluster Distribution:")
    fig, ax = plt.subplots()
    sns.countplot(x='Cluster', data=result, palette='viridis', ax=ax)
    _show_figure(fig)


def main() -> None:
    """Build the page: data source selection, clustering results, footer."""
    # App Title
    st.title("Unsupervised Data Clustering App")

    # About App
    with st.expander("About this App"):
        st.write(
            "This app allows you to upload any type of unlabeled dataset "
            "and automatically clusters the data using K-means clustering. "
            "It visualizes the clusters using PCA and provides time series and cluster distribution plots "
            "to help you identify patterns and groupings within your data."
        )

    # File uploader
    uploaded_file = st.file_uploader("Upload Custom CSV file", type=["csv"])

    # Example Demo Dataset
    if st.button("Test With An Example Dataset"):
        dataset = load_dataset(EXAMPLE_DATASET, split='train')
        st.session_state[EXAMPLE_STATE_KEY] = pd.DataFrame(dataset)
        st.success("Loaded example dataset from Hugging Face.")

    # An uploaded file takes precedence over a previously loaded example.
    df = None
    if uploaded_file is not None:
        df = _read_uploaded_csv(uploaded_file)
    elif EXAMPLE_STATE_KEY in st.session_state:
        df = st.session_state[EXAMPLE_STATE_KEY]
        with st.expander("Dataset Columns"):
            st.write(DATASET_COLUMNS_MD)

    if df is not None:
        _render_clustering(df)

    st.markdown("---")  # Adds a horizontal line
    st.markdown("**Thanks!**")


# Streamlit executes the script as ``__main__`` on every rerun.
if __name__ == "__main__":
    main()