unsupervised_clustering_app_Cybersoft

Sleeping

App Files Community

hoangkha1810 committed on Apr 28

Commit

d880f04

verified ·

1 Parent(s): 87b9b04

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -46

app.py CHANGED Viewed

@@ -8,92 +8,93 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
 from datasets import load_dataset
-# App Title
-st.title("Unsupervised Data Clustering App")
-# About App
-with st.expander("About this App"):
     st.write(
-        "This app allows you to upload any type of unlabeled dataset "
-        "and automatically clusters the data using K-means clustering. "
-        "It visualizes the clusters using PCA and provides time series and cluster distribution plots "
-        "to help you identify patterns and groupings within your data."
     )
-# File uploader
-uploaded_file = st.file_uploader("Upload Custom CSV file", type=["csv"])
-# # Example Demo Dataset
-if st.button("Test With An Example Dataset"):
     dataset = load_dataset('kheejay88/country_data', split='train')
     df = pd.DataFrame(dataset)
-    st.success("Loaded example dataset from Hugging Face.")
-with st.expander("Dataset Columns"):
     st.write("""
-        **country** – Name of the country\n
-        **child_mort** – Death of children under 5 years of age per 1000 live births\n
-        **exports** – Exports of goods and services per capita (as a percentage of GDP)\n
-        **health** – Total health spending per capita (as a percentage of GDP)\n
-        **imports** – Imports of goods and services per capita (as a percentage of GDP)\n
-        **income** – Net income per person\n
-        **inflation** – Annual inflation rate (percentage)\n
-        **life_expec** – Average life expectancy at birth (in years)\n
-        **total_fer** – Total fertility rate (average number of children per woman)\n
-        **gdpp** – GDP per capita\n
     """)
 if uploaded_file is not None:
     df = pd.read_csv(uploaded_file)
 if 'df' in locals():
-    # Drop non-numeric columns
     categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
     df.drop(columns=categorical_cols, inplace=True)
-    st.write("### Raw Data:")
     st.write(df.head())
-    # Preprocessing
     scaler = StandardScaler()
     scaled_data = scaler.fit_transform(df)
-    # User input for clusters
-    num_clusters = st.slider("Select number of clusters", min_value=2, max_value=10, value=3)
-    # K-Means Clustering
     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
     clusters = kmeans.fit_predict(scaled_data)
-    df['Cluster'] = clusters
-    # PCA for visualization
     pca = PCA(n_components=2)
     pca_data = pca.fit_transform(scaled_data)
     df['PCA1'] = pca_data[:, 0]
     df['PCA2'] = pca_data[:, 1]
-    # Plot Clusters
-    st.write("### Cluster Visualization:")
     fig, ax = plt.subplots()
-    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=df, palette='viridis', ax=ax)
     st.pyplot(fig)
-    # Time Series Plot (if available)
     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
     if len(numeric_cols) >= 2:
-        selected_col = st.selectbox("Select column for time series visualization", numeric_cols)
-        st.write("### Time Series Plot:")
         fig, ax = plt.subplots()
-        for cluster in df['Cluster'].unique():
-            cluster_data = df[df['Cluster'] == cluster]
-            ax.plot(cluster_data.index, cluster_data[selected_col], label=f'Cluster {cluster}')
         ax.legend()
         st.pyplot(fig)
-    # Cluster distribution
-    st.write("### Cluster Distribution:")
     fig, ax = plt.subplots()
-    sns.countplot(x='Cluster', data=df, palette='viridis', ax=ax)
     st.pyplot(fig)
-    st.markdown("---")  # Adds a horizontal line
-    st.markdown("**Thanks!**")

 from sklearn.decomposition import PCA
 from datasets import load_dataset
+# Tiêu đề ứng dụng
+st.title("Ứng dụng Phân Cụm Dữ Liệu Không Giám Sát")
+# Giới thiệu về ứng dụng
+with st.expander("Giới thiệu về Ứng dụng"):
     st.write(
+        "Ứng dụng này cho phép bạn tải lên bất kỳ loại tập dữ liệu không nhãn nào "
+        "và tự động phân cụm dữ liệu bằng thuật toán K-means. "
+        "Nó sẽ trực quan hóa các cụm bằng phương pháp PCA và cung cấp biểu đồ chuỗi thời gian cũng như phân phối cụm "
+        "giúp bạn nhận diện các mô hình và nhóm trong dữ liệu."
     )
+# Tải file
+uploaded_file = st.file_uploader("Tải lên tập tin CSV", type=["csv"])
+# Bộ dữ liệu mẫu
+if st.button("Dùng thử với bộ dữ liệu mẫu"):
     dataset = load_dataset('kheejay88/country_data', split='train')
     df = pd.DataFrame(dataset)
+    st.success("Đã tải thành công bộ dữ liệu mẫu từ Hugging Face.")
+# Cột dữ liệu
+with st.expander("Các Cột Dữ Liệu"):
     st.write("""
+        **country** – Tên quốc gia\n
+        **child_mort** – Tỷ lệ tử vong trẻ em dưới 5 tuổi trên 1000 ca sinh sống\n
+        **exports** – Xuất khẩu hàng hóa và dịch vụ trên đầu người (tính theo phần trăm GDP)\n
+        **health** – Chi tiêu y tế trên đầu người (tính theo phần trăm GDP)\n
+        **imports** – Nhập khẩu hàng hóa và dịch vụ trên đầu người (tính theo phần trăm GDP)\n
+        **income** – Thu nhập ròng trên đầu người\n
+        **inflation** – Tỷ lệ lạm phát hàng năm (phần trăm)\n
+        **life_expec** – Tuổi thọ trung bình khi sinh (năm)\n
+        **total_fer** – Tổng tỷ suất sinh (số con trung bình mỗi phụ nữ)\n
+        **gdpp** – GDP trên đầu người\n
     """)
 if uploaded_file is not None:
     df = pd.read_csv(uploaded_file)
 if 'df' in locals():
+    # Loại bỏ cột không phải số
     categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
     df.drop(columns=categorical_cols, inplace=True)
+    st.write("### Dữ liệu thô:")
     st.write(df.head())
+    # Tiền xử lý dữ liệu
     scaler = StandardScaler()
     scaled_data = scaler.fit_transform(df)
+    # Chọn số lượng cụm
+    num_clusters = st.slider("Chọn số lượng cụm", min_value=2, max_value=10, value=3)
+    # Phân cụm bằng K-Means
     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
     clusters = kmeans.fit_predict(scaled_data)
+    df['Cụm'] = clusters
+    # Giảm chiều bằng PCA để trực quan hóa
     pca = PCA(n_components=2)
     pca_data = pca.fit_transform(scaled_data)
     df['PCA1'] = pca_data[:, 0]
     df['PCA2'] = pca_data[:, 1]
+    # Vẽ biểu đồ phân cụm
+    st.write("### Biểu đồ Phân Cụm:")
     fig, ax = plt.subplots()
+    sns.scatterplot(x='PCA1', y='PCA2', hue='Cụm', data=df, palette='viridis', ax=ax)
     st.pyplot(fig)
+    # Vẽ biểu đồ chuỗi thời gian (nếu có)
     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
     if len(numeric_cols) >= 2:
+        selected_col = st.selectbox("Chọn cột để vẽ biểu đồ chuỗi thời gian", numeric_cols)
+        st.write("### Biểu đồ Chuỗi Thời Gian:")
         fig, ax = plt.subplots()
+        for cluster in df['Cụm'].unique():
+            cluster_data = df[df['Cụm'] == cluster]
+            ax.plot(cluster_data.index, cluster_data[selected_col], label=f'Cụm {cluster}')
         ax.legend()
         st.pyplot(fig)
+    # Phân phối cụm
+    st.write("### Phân phối Cụm:")
     fig, ax = plt.subplots()
+    sns.countplot(x='Cụm', data=df, palette='viridis', ax=ax)
     st.pyplot(fig)
+    st.markdown("---")  # Kẻ một đường ngang
+    st.markdown("**Cảm ơn bạn!**")