unsupervised_clustering_app_Cybersoft

Sleeping

App Files Files Community

kheejay88 committed on Mar 11

Commit

a7fd807

verified ·

1 Parent(s): 9ab5fbe

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -96

app.py CHANGED Viewed

@@ -1,96 +1,96 @@
-import streamlit as st
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plt
-import seaborn as sns
-from sklearn.cluster import KMeans
-from sklearn.preprocessing import StandardScaler
-from sklearn.decomposition import PCA
-# from datasets import load_dataset
-# App Title
-st.title("Unsupervised Data Clustering App")
-# About App
-with st.expander("About this App"):
-    st.write(
-        "This app allows you to upload any type of unlabeled dataset "
-        "and automatically clusters the data using K-means clustering. "
-        "It visualizes the clusters using PCA and provides time series and cluster distribution plots "
-        "to help you identify patterns and groupings within your data."
-    )
-# File uploader
-uploaded_file = st.file_uploader("Upload Custom CSV file", type=["csv"])
-# # Example Demo Dataset
-if st.button("Test With An Example Dataset"):
-    dataset = load_dataset('kheejay88/country_data', split='train')
-    df = pd.DataFrame(dataset)
-    st.success("Loaded example dataset from Hugging Face.")
-with st.expander("Dataset Columns"):
-    st.write("""
-        **country** – Name of the country\n
-        **child_mort** – Death of children under 5 years of age per 1000 live births\n
-        **exports** – Exports of goods and services per capita (as a percentage of GDP)\n
-        **health** – Total health spending per capita (as a percentage of GDP)\n
-        **imports** – Imports of goods and services per capita (as a percentage of GDP)\n
-        **income** – Net income per person\n
-        **inflation** – Annual inflation rate (percentage)\n
-        **life_expec** – Average life expectancy at birth (in years)\n
-        **total_fer** – Total fertility rate (average number of children per woman)\n
-        **gdpp** – GDP per capita\n
-    """)
-if uploaded_file is not None:
-    df = pd.read_csv(uploaded_file)
-if 'df' in locals():
-    # Drop non-numeric columns
-    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
-    df.drop(columns=categorical_cols, inplace=True)
-    st.write("### Raw Data:")
-    st.write(df.head())
-    # Preprocessing
-    scaler = StandardScaler()
-    scaled_data = scaler.fit_transform(df)
-    # User input for clusters
-    num_clusters = st.slider("Select number of clusters", min_value=2, max_value=10, value=3)
-    # K-Means Clustering
-    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
-    clusters = kmeans.fit_predict(scaled_data)
-    df['Cluster'] = clusters
-    # PCA for visualization
-    pca = PCA(n_components=2)
-    pca_data = pca.fit_transform(scaled_data)
-    df['PCA1'] = pca_data[:, 0]
-    df['PCA2'] = pca_data[:, 1]
-    # Plot Clusters
-    st.write("### Cluster Visualization:")
-    fig, ax = plt.subplots()
-    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=df, palette='viridis', ax=ax)
-    st.pyplot(fig)
-    # Time Series Plot (if available)
-    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    if len(numeric_cols) >= 2:
-        selected_col = st.selectbox("Select column for time series visualization", numeric_cols)
-        st.write("### Time Series Plot:")
-        fig, ax = plt.subplots()
-        for cluster in df['Cluster'].unique():
-            cluster_data = df[df['Cluster'] == cluster]
-            ax.plot(cluster_data.index, cluster_data[selected_col], label=f'Cluster {cluster}')
-        ax.legend()
-        st.pyplot(fig)
-    # Cluster distribution
-    st.write("### Cluster Distribution:")
-    fig, ax = plt.subplots()
-    sns.countplot(x='Cluster', data=df, palette='viridis', ax=ax)
-    st.pyplot(fig)

+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
+from datasets import load_dataset
+# Streamlit script: load a CSV (or the example dataset), cluster its numeric
+# columns with K-means, and show PCA, per-cluster line, and count plots.
+# App Title
+st.title("Unsupervised Data Clustering App")
+# About App
+with st.expander("About this App"):
+    st.write(
+        "This app allows you to upload any type of unlabeled dataset "
+        "and automatically clusters the data using K-means clustering. "
+        "It visualizes the clusters using PCA and provides time series and cluster distribution plots "
+        "to help you identify patterns and groupings within your data."
+    )
+# File uploader
+uploaded_file = st.file_uploader("Upload Custom CSV file", type=["csv"])
+# Example demo dataset.
+# st.button() is True only during the rerun triggered by the click, so the
+# loaded frame is kept in st.session_state; otherwise any later widget
+# interaction (e.g. the cluster slider) would rerun the script without it.
+if st.button("Test With An Example Dataset"):
+    dataset = load_dataset('kheejay88/country_data', split='train')
+    st.session_state["example_df"] = pd.DataFrame(dataset)
+    st.success("Loaded example dataset from Hugging Face.")
+with st.expander("Dataset Columns"):
+    st.write("""
+        **country** – Name of the country\n
+        **child_mort** – Death of children under 5 years of age per 1000 live births\n
+        **exports** – Exports of goods and services per capita (as a percentage of GDP)\n
+        **health** – Total health spending per capita (as a percentage of GDP)\n
+        **imports** – Imports of goods and services per capita (as a percentage of GDP)\n
+        **income** – Net income per person\n
+        **inflation** – Annual inflation rate (percentage)\n
+        **life_expec** – Average life expectancy at birth (in years)\n
+        **total_fer** – Total fertility rate (average number of children per woman)\n
+        **gdpp** – GDP per capita\n
+    """)
+# Pick the data source: an uploaded CSV takes precedence over the example.
+source_df = st.session_state.get("example_df")
+if uploaded_file is not None:
+    source_df = pd.read_csv(uploaded_file)
+if source_df is not None:
+    # Drop non-numeric columns. drop() returns a new frame, so the frame kept
+    # in session state never picks up the derived columns added below.
+    categorical_cols = source_df.select_dtypes(include=['object', 'category']).columns.tolist()
+    df = source_df.drop(columns=categorical_cols)
+    st.write("### Raw Data:")
+    st.write(df.head())
+    # KMeans and PCA reject NaN input, so incomplete rows are removed up front.
+    if df.isna().any().any():
+        st.warning("Rows with missing values were dropped before clustering.")
+        df = df.dropna()
+    # PCA with two components needs at least two columns and two rows.
+    if df.shape[1] < 2 or len(df) < 2:
+        st.error("At least two numeric columns and two complete rows are required for clustering.")
+        st.stop()
+    # Columns offered for plotting, captured before Cluster/PCA1/PCA2 exist
+    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    # Preprocessing
+    scaler = StandardScaler()
+    scaled_data = scaler.fit_transform(df)
+    # User input for clusters
+    num_clusters = st.slider("Select number of clusters", min_value=2, max_value=10, value=3)
+    # KMeans cannot form more clusters than there are rows.
+    if num_clusters > len(df):
+        st.error("The number of clusters cannot exceed the number of rows in the data.")
+        st.stop()
+    # K-Means Clustering
+    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+    clusters = kmeans.fit_predict(scaled_data)
+    df['Cluster'] = clusters
+    # PCA for visualization
+    pca = PCA(n_components=2)
+    pca_data = pca.fit_transform(scaled_data)
+    df['PCA1'] = pca_data[:, 0]
+    df['PCA2'] = pca_data[:, 1]
+    # Plot Clusters
+    st.write("### Cluster Visualization:")
+    fig, ax = plt.subplots()
+    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=df, palette='viridis', ax=ax)
+    st.pyplot(fig)
+    # Figures created via pyplot stay registered until closed; the script
+    # reruns on every interaction, so each figure is released after rendering.
+    plt.close(fig)
+    # Time Series Plot (selected column against the row index, one line per cluster)
+    if numeric_cols:
+        selected_col = st.selectbox("Select column for time series visualization", numeric_cols)
+        st.write("### Time Series Plot:")
+        fig, ax = plt.subplots()
+        for cluster in df['Cluster'].unique():
+            cluster_data = df[df['Cluster'] == cluster]
+            ax.plot(cluster_data.index, cluster_data[selected_col], label=f'Cluster {cluster}')
+        ax.legend()
+        st.pyplot(fig)
+        plt.close(fig)
+    # Cluster distribution
+    st.write("### Cluster Distribution:")
+    fig, ax = plt.subplots()
+    sns.countplot(x='Cluster', data=df, palette='viridis', ax=ax)
+    st.pyplot(fig)
+    plt.close(fig)