Spaces:
Sleeping
Sleeping
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from xgboost import XGBClassifier | |
from sklearn.model_selection import StratifiedKFold, cross_val_predict | |
from sklearn.preprocessing import MinMaxScaler | |
from imblearn.over_sampling import SMOTE | |
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score | |
from imblearn.pipeline import Pipeline | |
import joblib | |
sns.set(style='whitegrid')

# Read the dataset.
telco = pd.read_csv('Telco-Customer-Churn.csv')
pd.set_option('display.max_columns', None)

# Convert TotalCharges to numeric (unparseable entries become NaN), then drop
# every row that contains a missing value.
telco['TotalCharges'] = pd.to_numeric(telco['TotalCharges'], errors='coerce')
telco.dropna(inplace=True)

# Drop the first column (the customer ID). An explicit copy makes df2 an
# independent frame, so the column assignments below do not write into a
# slice of `telco` (SettingWithCopyWarning).
df2 = telco.iloc[:, 1:].copy()

# Convert the Yes/No columns to True/False.
bool_map = {'Yes': True, 'No': False}
binary_columns = ['Churn', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in binary_columns:
    df2[col] = df2[col].map(bool_map)

# Assign the mapped result back instead of calling
# `df2[col].replace(..., inplace=True)`: that chained in-place form modifies
# a temporary and leaves df2 untouched under pandas Copy-on-Write.
# NOTE: like the Yes/No mapping above, values outside the mapping become NaN.
df2['SeniorCitizen'] = df2['SeniorCitizen'].map({1: True, 0: False})
df2['gender'] = df2['gender'].map({'Female': True, 'Male': False})
# Engineered features.

# Average charge per month; a tenure of 0 is treated as 1 to avoid dividing
# by zero.
df2['avg_charge_per_month'] = df2['TotalCharges'] / df2['tenure'].replace(0, 1)

# Ratio of the total charge to (monthly charge * tenure). Rows where either
# factor is not positive get a neutral ratio of 1. Computed column-wise
# instead of with a row-by-row `apply`.
expected_total = df2['MonthlyCharges'] * df2['tenure']
has_expected_total = (df2['MonthlyCharges'] > 0) & (df2['tenure'] > 0)
df2['charge_ratio'] = (df2['TotalCharges'] / expected_total).where(has_expected_total, 1)

# Tenure category. `include_lowest=True` puts tenure == 0 into the '0-12' bin
# instead of leaving it unbinned, and the open-ended upper edge keeps the bin
# edges strictly increasing even when no tenure exceeds 24.
df2['tenure_bin'] = pd.cut(
    df2['tenure'],
    bins=[0, 12, 24, np.inf],
    labels=['0-12', '12-24', '24+'],
    include_lowest=True,
)

# One-hot encoding of the multi-valued categorical columns.
multi_cat_cols = [
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod', 'tenure_bin',
]
df_dummies = pd.get_dummies(df2, columns=multi_cat_cols, drop_first=False)

# Long-term contract flag: true for one-year or two-year contracts. `.get`
# falls back to False when a dummy column is absent from the data.
df_dummies['is_long_term_contract'] = (
    df_dummies.get('Contract_One year', False) | df_dummies.get('Contract_Two year', False)
)
# Target vector and feature matrix.
y = df_dummies.loc[:, 'Churn'].values
X = df_dummies.drop('Churn', axis=1)

# Fit a throwaway XGBoost model on all features and keep the 15 with the
# highest importance.
# NOTE(review): the ranking is fitted on the full dataset, so the
# cross-validation run later on X_selected has already seen every fold's
# labels during feature selection — the reported scores may be optimistic.
temp_model = XGBClassifier(random_state=42).fit(X, y)
feature_importances = pd.Series(temp_model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values(ascending=False)
top_15_features = feature_importances.index[:15].tolist()
X_selected = X[top_15_features]
# Pipeline: scaling + SMOTE + XGBoost.
xgb_params = dict(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=1.0,
    colsample_bytree=0.7,
    scale_pos_weight=1,
    eval_metric='logloss',
    random_state=42,
)
pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("smote", SMOTE(sampling_strategy=1.0, random_state=42)),
    ("xgb", XGBClassifier(**xgb_params)),
])
# 5-fold stratified cross-validation.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run the cross-validation once and derive the hard labels from the
# out-of-fold probabilities, instead of refitting all folds a second time
# with method="predict". For a binary XGBClassifier, predict() labels a row
# positive exactly when its positive-class probability exceeds 0.5.
y_proba = cross_val_predict(pipe, X_selected, y, cv=cv, method="predict_proba")[:, 1]
y_pred = (y_proba > 0.5).astype(int)


def _report_metrics(title, average):
    """Print the cross-validated scores under the heading *title*.

    *average* is passed to the precision, recall and F1 scorers
    ('binary' scores the positive class only, 'macro' averages both classes).
    Accuracy and ROC-AUC do not depend on it.
    """
    print(title)
    print("--------------------------------------------")
    print(f"Accuracy: {accuracy_score(y, y_pred):.4f}")
    print(f"Precision: {precision_score(y, y_pred, average=average):.4f}")
    print(f"Recall: {recall_score(y, y_pred, average=average):.4f}")
    print(f"F1 Score: {f1_score(y, y_pred, average=average):.4f}")
    print(f"ROC-AUC: {roc_auc_score(y, y_proba):.4f}")


# Performance metrics.
_report_metrics("XGBoost Cross-Validation Sonuçları (Binary):", average='binary')
print("")
_report_metrics("XGBoost Cross-Validation Sonuçları (Macro):", average='macro')
# Train the pipeline on the full dataset, then save it together with the
# list of selected feature names.
pipe.fit(X_selected, y)
for artifact, filename in ((pipe, 'churn_model.pkl'), (top_15_features, "model_features.pkl")):
    joblib.dump(artifact, filename)