File size: 4,363 Bytes
24d09a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.pipeline import Pipeline
import joblib

sns.set(style='whitegrid')

# Load the dataset
telco = pd.read_csv('Telco-Customer-Churn.csv')
pd.set_option('display.max_columns', None)

# Convert TotalCharges to numeric (unparseable values become NaN via
# errors='coerce'), then drop every row that contains a missing value
telco['TotalCharges'] = pd.to_numeric(telco['TotalCharges'], errors='coerce')
telco.dropna(inplace=True)

# Drop the first column (the customer identifier). An explicit copy makes df2
# an independent frame, so the column assignments below are not writes into a
# slice of `telco`.
df2 = telco.iloc[:, 1:].copy()

# Convert the Yes/No columns to True/False
bool_map = {'Yes': True, 'No': False}
binary_columns = ['Churn', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in binary_columns:
    df2[col] = df2[col].map(bool_map)

# Assign the converted columns back instead of calling replace(inplace=True)
# on a column selection: a chained in-place call is not guaranteed to update
# df2 (under pandas copy-on-write it never does), which would leave these
# columns unconverted.
df2['SeniorCitizen'] = df2['SeniorCitizen'].map({1: True, 0: False})
df2['gender'] = df2['gender'].map({'Female': True, 'Male': False})

# Engineered features
# Average charge per unit of tenure; a tenure of 0 is treated as 1 to avoid
# dividing by zero
df2['avg_charge_per_month'] = df2['TotalCharges'] / df2['tenure'].replace(0, 1)

# Ratio of the actual total to the nominal total (monthly charge * tenure).
# Computed column-wise rather than with a row-wise apply. Rows whose monthly
# charge or tenure is not positive get a neutral ratio of 1.
monthly_charges = df2['MonthlyCharges']
tenure_values = df2['tenure']
has_nominal_total = (monthly_charges > 0) & (tenure_values > 0)
# Substitute 1 in the masked-out denominators so no division by zero occurs;
# those rows are overwritten with 1.0 right after.
nominal_total = (monthly_charges * tenure_values).where(has_nominal_total, 1)
df2['charge_ratio'] = (df2['TotalCharges'] / nominal_total).where(has_nominal_total, 1.0)

# Tenure category. The last bin is open-ended so the edges stay strictly
# increasing even when the largest tenure is 24 or less, and include_lowest
# puts a tenure of exactly 0 into the first bin instead of producing NaN.
df2['tenure_bin'] = pd.cut(
    df2['tenure'],
    bins=[0, 12, 24, np.inf],
    labels=['0-12', '12-24', '24+'],
    include_lowest=True,
)

# One-hot encoding of the multi-valued categorical columns
multi_cat_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                  'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                  'Contract', 'PaymentMethod', 'tenure_bin']
df_dummies = pd.get_dummies(df2, columns=multi_cat_cols, drop_first=False)

# Long-term contract flag: true when either the one-year or the two-year
# contract dummy is set; a missing dummy column counts as False
df_dummies['is_long_term_contract'] = (
    df_dummies.get('Contract_One year', False) | df_dummies.get('Contract_Two year', False)
)

# Split the encoded frame into the target vector and the feature matrix
X = df_dummies.drop('Churn', axis=1)
y = df_dummies['Churn'].values

# Fit a throwaway XGBoost model on all features to rank them, then keep the
# 15 features with the highest importance.
# NOTE(review): this ranking model is fit on the full dataset, so the
# cross-validation that later runs on X_selected has already used every label
# for feature selection — its scores may be optimistic.
temp_model = XGBClassifier(random_state=42)
temp_model.fit(X, y)
feature_importances = pd.Series(
    temp_model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
top_15_features = feature_importances.head(15).index.tolist()
X_selected = X[top_15_features]

# Pipeline: scaling -> SMOTE oversampling -> XGBoost classifier
xgb_params = {
    'n_estimators': 100,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'scale_pos_weight': 1,
    'eval_metric': 'logloss',
    'random_state': 42,
}
pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("smote", SMOTE(sampling_strategy=1.0, random_state=42)),
    ("xgb", XGBClassifier(**xgb_params)),
])

# 5-fold stratified cross-validation. The splitter is shuffled with a fixed
# seed, and the same splitter is passed to both calls: one collects
# out-of-fold class predictions, the other the out-of-fold probability of the
# second class (column 1).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred = cross_val_predict(pipe, X_selected, y, cv=cv, method="predict")
oof_probabilities = cross_val_predict(pipe, X_selected, y, cv=cv, method="predict_proba")
y_proba = oof_probabilities[:, 1]

# Performance metrics: print one report with the default (binary) averaging
# and one with macro averaging. Accuracy and ROC-AUC are the same calls in
# both sections; only precision, recall and F1 receive the averaging option.
report_sections = (
    ("XGBoost Cross-Validation Sonuçları (Binary):", {}),
    ("XGBoost Cross-Validation Sonuçları (Macro):", {'average': 'macro'}),
)
for section_index, (heading, averaging) in enumerate(report_sections):
    if section_index > 0:
        # Blank line between the two sections
        print("")
    print(heading)
    print("--------------------------------------------")
    print("Accuracy: {:.4f}".format(accuracy_score(y, y_pred)))
    print("Precision: {:.4f}".format(precision_score(y, y_pred, **averaging)))
    print("Recall: {:.4f}".format(recall_score(y, y_pred, **averaging)))
    print("F1 Score: {:.4f}".format(f1_score(y, y_pred, **averaging)))
    print("ROC-AUC: {:.4f}".format(roc_auc_score(y, y_proba)))

# Train the pipeline on the full selected-feature matrix, then persist both
# the fitted pipeline and the list of feature names it was trained on
pipe.fit(X_selected, y)
for artifact, target_path in ((pipe, 'churn_model.pkl'), (top_15_features, "model_features.pkl")):
    joblib.dump(artifact, target_path)