unsupervised_clustering_app_Cybersoft

Sleeping

App Files Files Community

unsupervised_clustering_app_Cybersoft / app.py

hoangkha1810

Update app.py

d880f04 verified 3 months ago

raw

history blame contribute delete

4.2 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.cluster import KMeans
	from sklearn.preprocessing import StandardScaler
	from sklearn.decomposition import PCA
	from datasets import load_dataset

	# Application title
	st.title("Ứng dụng Phân Cụm Dữ Liệu Không Giám Sát")

	# Introduction to the application: the text is assembled first and then
	# rendered inside a collapsible expander.
	intro_text = (
	    "Ứng dụng này cho phép bạn tải lên bất kỳ loại tập dữ liệu không nhãn nào "
	    "và tự động phân cụm dữ liệu bằng thuật toán K-means. "
	    "Nó sẽ trực quan hóa các cụm bằng phương pháp PCA và cung cấp biểu đồ chuỗi thời gian cũng như phân phối cụm "
	    "giúp bạn nhận diện các mô hình và nhóm trong dữ liệu."
	)
	intro_panel = st.expander("Giới thiệu về Ứng dụng")
	with intro_panel:
	    st.write(intro_text)

	# File upload
	uploaded_file = st.file_uploader("Tải lên tập tin CSV", type=["csv"])

	# Sample dataset.
	# st.button() is True only during the single rerun triggered by the click,
	# and Streamlit reruns the whole script on every widget interaction. The
	# choice is therefore remembered in session state; otherwise the sample data
	# would disappear as soon as the user touches any other widget (for example
	# the cluster-count slider further down).
	if st.button("Dùng thử với bộ dữ liệu mẫu"):
	    st.session_state["use_sample_dataset"] = True

	# `df` is only defined when there is data to analyse; later code checks for
	# its existence before running the analysis.
	if uploaded_file is not None:
	    # An uploaded CSV takes precedence over the sample dataset.
	    df = pd.read_csv(uploaded_file)
	elif st.session_state.get("use_sample_dataset", False):
	    # Download the sample once per session and reuse it on later reruns.
	    if "sample_dataset_df" not in st.session_state:
	        dataset = load_dataset('kheejay88/country_data', split='train')
	        st.session_state["sample_dataset_df"] = pd.DataFrame(dataset)
	    # Hand out a copy so that in-place changes made to `df` later in the
	    # script never leak into the cached frame.
	    df = st.session_state["sample_dataset_df"].copy()
	    st.success("Đã tải thành công bộ dữ liệu mẫu từ Hugging Face.")

	    # Column descriptions for the sample dataset.
	    # NOTE(review): assumed to belong to the sample-dataset branch, since the
	    # text describes that dataset's columns — confirm against the original layout.
	    with st.expander("Các Cột Dữ Liệu"):
	        st.write("""
	country – Tên quốc gia\n
	child_mort – Tỷ lệ tử vong trẻ em dưới 5 tuổi trên 1000 ca sinh sống\n
	exports – Xuất khẩu hàng hóa và dịch vụ trên đầu người (tính theo phần trăm GDP)\n
	health – Chi tiêu y tế trên đầu người (tính theo phần trăm GDP)\n
	imports – Nhập khẩu hàng hóa và dịch vụ trên đầu người (tính theo phần trăm GDP)\n
	income – Thu nhập ròng trên đầu người\n
	inflation – Tỷ lệ lạm phát hàng năm (phần trăm)\n
	life_expec – Tuổi thọ trung bình khi sinh (năm)\n
	total_fer – Tổng tỷ suất sinh (số con trung bình mỗi phụ nữ)\n
	gdpp – GDP trên đầu người\n
	""")

	if 'df' in locals():
	    # Keep only numeric and boolean columns: scaling, K-Means and PCA below
	    # cannot work on text, categorical or datetime values. Work on a copy so
	    # the frame loaded earlier is left untouched.
	    df = df.select_dtypes(include=["number", "bool"]).copy()
	    st.write("### Dữ liệu thô:")
	    st.write(df.head())

	    # K-Means and PCA reject missing values, so drop incomplete rows and
	    # tell the user how many were skipped instead of crashing.
	    incomplete_rows = int(df.isna().any(axis=1).sum())
	    if incomplete_rows:
	        df = df.dropna().copy()
	        st.warning(f"Đã bỏ qua {incomplete_rows} dòng có giá trị bị thiếu.")

	    n_rows, n_features = df.shape
	    if n_features < 2 or n_rows < 3:
	        # The 2-component PCA needs at least two features, and the slider
	        # below needs room between its minimum (2) and the number of rows.
	        st.error("Cần ít nhất 2 cột số và 3 dòng dữ liệu đầy đủ để phân cụm.")
	    else:
	        # Remember the feature columns before the derived columns
	        # ('Cụm', 'PCA1', 'PCA2') are added, so only real data columns are
	        # offered in the column selector further down.
	        numeric_cols = df.columns.tolist()

	        # Preprocessing: standardise every feature
	        scaler = StandardScaler()
	        scaled_data = scaler.fit_transform(df)

	        # Choose the number of clusters (never more clusters than rows)
	        num_clusters = st.slider(
	            "Chọn số lượng cụm",
	            min_value=2,
	            max_value=min(10, n_rows),
	            value=3,
	        )

	        # Cluster with K-Means
	        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
	        clusters = kmeans.fit_predict(scaled_data)
	        df['Cụm'] = clusters

	        # Reduce to two dimensions with PCA for visualisation
	        pca = PCA(n_components=2)
	        pca_data = pca.fit_transform(scaled_data)
	        df['PCA1'] = pca_data[:, 0]
	        df['PCA2'] = pca_data[:, 1]

	        # Cluster scatter plot
	        st.write("### Biểu đồ Phân Cụm:")
	        fig, ax = plt.subplots()
	        sns.scatterplot(x='PCA1', y='PCA2', hue='Cụm', data=df, palette='viridis', ax=ax)
	        st.pyplot(fig)
	        # pyplot keeps every figure alive until it is closed; the script
	        # reruns on each interaction, so close figures once rendered.
	        plt.close(fig)

	        # Per-cluster line plot of one selected column against the row index
	        selected_col = st.selectbox("Chọn cột để vẽ biểu đồ chuỗi thời gian", numeric_cols)
	        st.write("### Biểu đồ Chuỗi Thời Gian:")
	        fig, ax = plt.subplots()
	        # Sorted so the legend lists clusters in ascending order
	        for cluster in sorted(df['Cụm'].unique()):
	            cluster_data = df[df['Cụm'] == cluster]
	            ax.plot(cluster_data.index, cluster_data[selected_col], label=f'Cụm {cluster}')
	        ax.legend()
	        st.pyplot(fig)
	        plt.close(fig)

	        # Cluster size distribution
	        st.write("### Phân phối Cụm:")
	        fig, ax = plt.subplots()
	        sns.countplot(x='Cụm', data=df, palette='viridis', ax=ax)
	        st.pyplot(fig)
	        plt.close(fig)

	# Footer: a horizontal rule followed by a thank-you note
	for footer_markdown in ("---", "Cảm ơn bạn!"):
	    st.markdown(footer_markdown)