import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from datasets import load_dataset

# Application title
app_title = "Ứng dụng Phân Cụm Dữ Liệu Không Giám Sát"
st.title(app_title)

# Collapsible introduction describing what the app does. The text is built
# once from adjacent literals and then rendered inside the expander.
intro_text = (
    "Ứng dụng này cho phép bạn tải lên bất kỳ loại tập dữ liệu không nhãn nào "
    "và tự động phân cụm dữ liệu bằng thuật toán K-means. "
    "Nó sẽ trực quan hóa các cụm bằng phương pháp PCA và cung cấp biểu đồ chuỗi thời gian cũng như phân phối cụm "
    "giúp bạn nhận diện các mô hình và nhóm trong dữ liệu."
)
with st.expander("Giới thiệu về Ứng dụng"):
    st.write(intro_text)

# File upload
uploaded_file = st.file_uploader("Tải lên tập tin CSV", type=["csv"])

# Sample dataset.
# Streamlit re-executes the whole script on every widget interaction, and
# st.button() returns True only for the single run triggered by the click.
# The loaded sample is therefore stored in st.session_state; otherwise it
# would vanish as soon as the user touched any other widget (for example the
# cluster-count slider).
if st.button("Dùng thử với bộ dữ liệu mẫu"):
    dataset = load_dataset('kheejay88/country_data', split='train')
    st.session_state["sample_df"] = pd.DataFrame(dataset)
    st.success("Đã tải thành công bộ dữ liệu mẫu từ Hugging Face.")

if "sample_df" in st.session_state:
    # Work on a copy so that in-place edits made further down the script
    # (dropping columns, adding cluster/PCA columns) never leak into the
    # stored sample and get re-processed on the next rerun.
    df = st.session_state["sample_df"].copy()

# Markdown reference describing each column of the sample dataset.
column_notes = """
        **country** – Tên quốc gia\n
        **child_mort** – Tỷ lệ tử vong trẻ em dưới 5 tuổi trên 1000 ca sinh sống\n
        **exports** – Xuất khẩu hàng hóa và dịch vụ trên đầu người (tính theo phần trăm GDP)\n
        **health** – Chi tiêu y tế trên đầu người (tính theo phần trăm GDP)\n
        **imports** – Nhập khẩu hàng hóa và dịch vụ trên đầu người (tính theo phần trăm GDP)\n
        **income** – Thu nhập ròng trên đầu người\n
        **inflation** – Tỷ lệ lạm phát hàng năm (phần trăm)\n
        **life_expec** – Tuổi thọ trung bình khi sinh (năm)\n
        **total_fer** – Tổng tỷ suất sinh (số con trung bình mỗi phụ nữ)\n
        **gdpp** – GDP trên đầu người\n
    """

# Render the column reference inside a collapsible panel.
with st.expander("Các Cột Dữ Liệu"):
    st.write(column_notes)

# An uploaded CSV, when present, is loaded into `df` (overriding any `df`
# bound earlier in the script).
if uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError):
        # An empty, malformed or non-text upload would otherwise surface as a
        # raw traceback; report it and end this script run instead.
        st.error("Không thể đọc tập tin CSV. Vui lòng kiểm tra lại định dạng tập tin.")
        st.stop()

def prepare_features(frame):
    """Return the clusterable part of *frame* as a new DataFrame.

    Only numeric columns are kept, +/-inf values are treated as missing,
    columns that are entirely missing are dropped, and finally every row that
    still contains a missing value is dropped. The input frame is left
    untouched and the surviving rows keep their original index labels.
    """
    numeric = frame.select_dtypes(include=[np.number])
    numeric = numeric.replace([np.inf, -np.inf], np.nan)
    return numeric.dropna(axis=1, how='all').dropna(axis=0, how='any').copy()


# `df` is only bound when a data source was provided earlier in the script.
if 'df' in locals():
    # Keep numeric, complete rows only: KMeans and PCA reject NaN/inf, and
    # non-numeric dtypes cannot be standardised.
    total_rows = len(df)
    df = prepare_features(df)
    # Captured before the derived columns (cluster label, PCA axes) are added
    # so they are never scaled or offered in the column selector.
    feature_cols = df.columns.tolist()
    skipped_rows = total_rows - len(df)

    st.write("### Dữ liệu thô:")
    st.write(df.head())
    if skipped_rows:
        st.warning(f"Đã bỏ qua {skipped_rows} dòng có giá trị bị thiếu hoặc vô hạn.")

    # PCA to 2 components needs at least 2 features, and the slider below
    # needs room for at least 2..3 clusters.
    if len(feature_cols) < 2 or len(df) < 3:
        st.error("Cần ít nhất 2 cột số và 3 dòng dữ liệu hợp lệ để phân cụm.")
    else:
        # Standardise the features so each one contributes comparably.
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(df[feature_cols])

        # Choose the number of clusters; KMeans cannot fit more clusters than
        # there are samples, so the upper bound is capped by the row count.
        max_clusters = min(10, len(df))
        num_clusters = st.slider("Chọn số lượng cụm", min_value=2, max_value=max_clusters, value=3)

        # K-Means clustering. n_init is pinned because its default differs
        # between scikit-learn releases.
        kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
        clusters = kmeans.fit_predict(scaled_data)
        df['Cụm'] = clusters

        # Project onto two principal components for visualisation.
        pca = PCA(n_components=2)
        pca_data = pca.fit_transform(scaled_data)
        df['PCA1'] = pca_data[:, 0]
        df['PCA2'] = pca_data[:, 1]

        # Cluster scatter plot in PCA space.
        st.write("### Biểu đồ Phân Cụm:")
        fig, ax = plt.subplots()
        sns.scatterplot(x='PCA1', y='PCA2', hue='Cụm', data=df, palette='viridis', ax=ax)
        st.pyplot(fig)
        # Figures created through pyplot stay registered until closed; close
        # them so they do not pile up across reruns.
        plt.close(fig)

        # Per-cluster line plot of one original feature against the row index.
        selected_col = st.selectbox("Chọn cột để vẽ biểu đồ chuỗi thời gian", feature_cols)
        st.write("### Biểu đồ Chuỗi Thời Gian:")
        fig, ax = plt.subplots()
        for cluster in sorted(df['Cụm'].unique()):
            cluster_data = df[df['Cụm'] == cluster]
            ax.plot(cluster_data.index, cluster_data[selected_col], label=f'Cụm {cluster}')
        ax.legend()
        st.pyplot(fig)
        plt.close(fig)

        # Number of rows assigned to each cluster.
        st.write("### Phân phối Cụm:")
        fig, ax = plt.subplots()
        sns.countplot(x='Cụm', data=df, palette='viridis', ax=ax)
        st.pyplot(fig)
        plt.close(fig)

    st.markdown("---")  # Horizontal rule
    st.markdown("**Cảm ơn bạn!**")