# churn_model.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.pipeline import Pipeline
import joblib
sns.set(style='whitegrid')
# Read the dataset
telco = pd.read_csv('Telco-Customer-Churn.csv')
pd.set_option('display.max_columns', None)
# Convert TotalCharges to numeric and drop rows with missing values
telco.TotalCharges = pd.to_numeric(telco.TotalCharges, errors='coerce')
telco.dropna(inplace=True)
# Drop the customerID column
df2 = telco.iloc[:, 1:].copy()
# Map binary Yes/No columns to True/False
bool_map = {'Yes': True, 'No': False}
binary_columns = ['Churn', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in binary_columns:
    df2[col] = df2[col].map(bool_map)
df2['SeniorCitizen'] = df2['SeniorCitizen'].map({1: True, 0: False})
df2['gender'] = df2['gender'].map({'Female': True, 'Male': False})
# New engineered features
# Average charge per month (a tenure of 0 is treated as 1 to avoid division by zero)
df2['avg_charge_per_month'] = df2['TotalCharges'] / df2['tenure'].replace(0, 1)
# Ratio of total charges to (monthly charge * tenure)
df2['charge_ratio'] = df2.apply(
    lambda row: row['TotalCharges'] / (row['MonthlyCharges'] * row['tenure'])
    if row['MonthlyCharges'] > 0 and row['tenure'] > 0 else 1, axis=1)
# Tenure category bins
df2['tenure_bin'] = pd.cut(df2['tenure'], bins=[0, 12, 24, df2['tenure'].max()], labels=['0-12', '12-24', '24+'])
# One-hot encoding
multi_cat_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                  'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                  'Contract', 'PaymentMethod', 'tenure_bin']
df_dummies = pd.get_dummies(df2, columns=multi_cat_cols, drop_first=False)
# Long-term contract feature
df_dummies['is_long_term_contract'] = (
    df_dummies.get('Contract_One year', False) | df_dummies.get('Contract_Two year', False)
)
# Target and feature variables
y = df_dummies['Churn'].values
X = df_dummies.drop(columns=['Churn'])
# Fit an XGB model to select the 15 most important features
temp_model = XGBClassifier(random_state=42)
temp_model.fit(X, y)
feature_importances = pd.Series(temp_model.feature_importances_, index=X.columns).sort_values(ascending=False)
top_15_features = feature_importances.head(15).index.tolist()
X_selected = X[top_15_features]
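# Illustration added for clarity (not part of the original script): a quick bar
# plot of the selected feature importances, using the seaborn/matplotlib imports
# above. Writing to 'feature_importances.png' is an assumption; adjust or remove
# as needed.
plt.figure(figsize=(8, 6))
sns.barplot(x=feature_importances.head(15).values, y=feature_importances.head(15).index)
plt.title('Top 15 XGBoost Feature Importances')
plt.xlabel('Importance')
plt.tight_layout()
plt.savefig('feature_importances.png')
plt.close()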
# Pipeline: scaling + SMOTE + XGBoost
pipe = Pipeline([
    ("scaler", MinMaxScaler()),
    ("smote", SMOTE(sampling_strategy=1.0, random_state=42)),
    ("xgb", XGBClassifier(
        n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        subsample=1.0,
        colsample_bytree=0.7,
        scale_pos_weight=1,
        eval_metric='logloss',
        random_state=42
    ))
])
# 5-fold stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred = cross_val_predict(pipe, X_selected, y, cv=cv, method="predict")
y_proba = cross_val_predict(pipe, X_selected, y, cv=cv, method="predict_proba")[:, 1]
# Performance metrics
print("XGBoost Cross-Validation Results (Binary):")
print("--------------------------------------------")
print("Accuracy: {:.4f}".format(accuracy_score(y, y_pred)))
print("Precision: {:.4f}".format(precision_score(y, y_pred)))
print("Recall: {:.4f}".format(recall_score(y, y_pred)))
print("F1 Score: {:.4f}".format(f1_score(y, y_pred)))
print("ROC-AUC: {:.4f}".format(roc_auc_score(y, y_proba)))
print("")
print("XGBoost Cross-Validation Sonuçları (Macro):")
print("--------------------------------------------")
print("Accuracy: {:.4f}".format(accuracy_score(y, y_pred)))
print("Precision: {:.4f}".format(precision_score(y, y_pred, average='macro')))
print("Recall: {:.4f}".format(recall_score(y, y_pred, average='macro')))
print("F1 Score: {:.4f}".format(f1_score(y, y_pred, average='macro')))
print("ROC-AUC: {:.4f}".format(roc_auc_score(y, y_proba)))
# Train the final model on the full dataset and save it
pipe.fit(X_selected, y)
joblib.dump(pipe, 'churn_model.pkl')
joblib.dump(top_15_features, "model_features.pkl")
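# Optional usage sketch (illustration, not part of the original training flow):
# reload the saved artifacts for inference. The variable names below are
# hypothetical, and the example assumes incoming data already contains the
# selected feature columns.
loaded_pipe = joblib.load('churn_model.pkl')
loaded_features = joblib.load('model_features.pkl')
sample = X_selected[loaded_features].head(1)  # one customer row with the selected features
print("Sample churn probability: {:.4f}".format(loaded_pipe.predict_proba(sample)[0, 1]))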