In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [3]:
# Load the IBM Telco Customer Churn dataset (CSV expected in the working directory).
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
In [4]:
# Preview the first five rows to eyeball column names and value formats.
df.head()
Out[4]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
In [5]:
# Inspect dtypes and non-null counts; note TotalCharges loads as object (strings).
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.1+ MB
In [6]:
# Count missing values per column (blank strings are NOT counted as NaN here).
df.isna().sum()
Out[6]:
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
No NA values are reported at this point — but note that `TotalCharges` contains blank strings, which pandas does not count as NaN until the column is converted to numeric (handled below).
In [7]:
# Drop the identifier column — a unique ID carries no predictive signal.
# Reassign instead of inplace=True: no performance benefit to inplace, and
# explicit assignment avoids hidden-state mutation surprises on re-runs.
df = df.drop(columns='customerID')
In [8]:
# Convert 'TotalCharges' to numeric (it loads as object because some rows are
# empty strings). errors='coerce' turns non-convertible values into NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Impute the resulting NaNs with the median to preserve the distribution and
# avoid dropping rows. Assign the result back rather than calling
# fillna(inplace=True) on the column: inplace fillna on a column selection is
# chained assignment — it raises a FutureWarning on pandas 2.x and stops
# updating the frame under pandas 3.0 copy-on-write.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
In [9]:
# Re-check dtypes: TotalCharges should now be float64 after the conversion.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 7043 non-null object 1 SeniorCitizen 7043 non-null int64 2 Partner 7043 non-null object 3 Dependents 7043 non-null object 4 tenure 7043 non-null int64 5 PhoneService 7043 non-null object 6 MultipleLines 7043 non-null object 7 InternetService 7043 non-null object 8 OnlineSecurity 7043 non-null object 9 OnlineBackup 7043 non-null object 10 DeviceProtection 7043 non-null object 11 TechSupport 7043 non-null object 12 StreamingTV 7043 non-null object 13 StreamingMovies 7043 non-null object 14 Contract 7043 non-null object 15 PaperlessBilling 7043 non-null object 16 PaymentMethod 7043 non-null object 17 MonthlyCharges 7043 non-null float64 18 TotalCharges 7043 non-null float64 19 Churn 7043 non-null object dtypes: float64(2), int64(2), object(16) memory usage: 1.1+ MB
In [10]:
# Collapse 'No internet service' / 'No phone service' into plain 'No' so the
# add-on service columns become clean Yes/No binaries. One reassigning replace
# instead of two inplace calls (inplace is discouraged and blocks chaining).
df = df.replace({'No internet service': 'No', 'No phone service': 'No'})

# Encode Yes/No columns as 1/0 with a vectorized comparison — same semantics
# as the previous per-row apply (anything that is not 'Yes' becomes 0), but
# faster and clearer.
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity',
               'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
               'StreamingMovies', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    df[col] = (df[col] == 'Yes').astype(int)
df['gender'] = (df['gender'] == 'Male').astype(int)  # Male=1, Female=0

# One-hot encode the remaining multi-category features.
# Reason: one-hot encoding creates binary columns per category, avoiding false
# ordinality; drop_first avoids the dummy-variable trap (perfect collinearity).
categorical_cols = ['InternetService', 'Contract', 'PaymentMethod']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
In [11]:
# Spot-check the encoded frame: all features should now be numeric/boolean.
df.head()
Out[11]:
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | OnlineSecurity | OnlineBackup | DeviceProtection | ... | MonthlyCharges | TotalCharges | Churn | InternetService_Fiber optic | InternetService_No | Contract_One year | Contract_Two year | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 29.85 | 29.85 | 0 | False | False | False | False | False | True | False |
| 1 | 1 | 0 | 0 | 0 | 34 | 1 | 0 | 1 | 0 | 1 | ... | 56.95 | 1889.50 | 0 | False | False | True | False | False | False | True |
| 2 | 1 | 0 | 0 | 0 | 2 | 1 | 0 | 1 | 1 | 0 | ... | 53.85 | 108.15 | 1 | False | False | False | False | False | False | True |
| 3 | 1 | 0 | 0 | 0 | 45 | 0 | 0 | 1 | 0 | 1 | ... | 42.30 | 1840.75 | 0 | False | False | True | False | False | False | False |
| 4 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | ... | 70.70 | 151.65 | 1 | True | False | False | False | False | True | False |
5 rows × 24 columns
In [12]:
# Final dtype check before modelling — no object columns should remain.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 7043 non-null int64 1 SeniorCitizen 7043 non-null int64 2 Partner 7043 non-null int64 3 Dependents 7043 non-null int64 4 tenure 7043 non-null int64 5 PhoneService 7043 non-null int64 6 MultipleLines 7043 non-null int64 7 OnlineSecurity 7043 non-null int64 8 OnlineBackup 7043 non-null int64 9 DeviceProtection 7043 non-null int64 10 TechSupport 7043 non-null int64 11 StreamingTV 7043 non-null int64 12 StreamingMovies 7043 non-null int64 13 PaperlessBilling 7043 non-null int64 14 MonthlyCharges 7043 non-null float64 15 TotalCharges 7043 non-null float64 16 Churn 7043 non-null int64 17 InternetService_Fiber optic 7043 non-null bool 18 InternetService_No 7043 non-null bool 19 Contract_One year 7043 non-null bool 20 Contract_Two year 7043 non-null bool 21 PaymentMethod_Credit card (automatic) 7043 non-null bool 22 PaymentMethod_Electronic check 7043 non-null bool 23 PaymentMethod_Mailed check 7043 non-null bool dtypes: bool(7), float64(2), int64(15) memory usage: 983.7 KB
In [13]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the target vector.
X = df.drop(columns='Churn')
y = df['Churn']
In [14]:
# 80/20 train/test split; random_state=42 pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Reason: stratify=y ensures that the train/test sets have roughly the same proportion of churners as the original dataset.
Model selection
In [18]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
In [19]:
# Logistic Regression with GridSearch.
# class_weight='balanced' re-weights samples to compensate for the minority
# churn class.
param_grid_log = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear']}
grid_log = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced'), param_grid_log, cv=5)
grid_log.fit(X_train, y_train)

# Decision Tree with GridSearch.
# random_state=42 added: split tie-breaking in DecisionTreeClassifier is
# stochastic, so without a seed the tuned tree (and its CV score) was not
# reproducible across runs — every other model here is already seeded.
param_grid_tree = {'max_depth': [2, 4, 6, 8, 10], 'min_samples_split': [2, 5, 10]}
grid_tree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_tree, cv=5)
grid_tree.fit(X_train, y_train)

# Random Forest (fixed hyperparameters, no grid search).
forest = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
forest.fit(X_train, y_train)

# SVM: scaling lives inside the Pipeline so the scaler is fit only on each CV
# training fold during the grid search (prevents leakage from validation folds).
pipe_svc = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(class_weight='balanced'))
])
param_grid_svc = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf']
}
grid_svc = GridSearchCV(pipe_svc, param_grid_svc, cv=5)
grid_svc.fit(X_train, y_train)
Out[19]:
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('svc', SVC(class_weight='balanced'))]),
param_grid={'svc__C': [0.1, 1, 10],
'svc__kernel': ['linear', 'rbf']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('svc', SVC(class_weight='balanced'))]),
param_grid={'svc__C': [0.1, 1, 10],
'svc__kernel': ['linear', 'rbf']})Pipeline(steps=[('scaler', StandardScaler()),
('svc', SVC(C=1, class_weight='balanced'))])StandardScaler()
SVC(C=1, class_weight='balanced')
In [20]:
# Compare the tuned candidates on 5-fold cross-validated accuracy,
# using the training data only (the test set stays untouched until the end).
models = {
    "Logistic Regression": grid_log.best_estimator_,
    "Decision Tree": grid_tree.best_estimator_,
    "Random Forest": forest,
    "SVM": grid_svc.best_estimator_
}
for name, candidate in models.items():
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5)
    print(f"{name} CV Accuracy: {fold_scores.mean():.4f}")
Logistic Regression CV Accuracy: 0.7556 Decision Tree CV Accuracy: 0.7843 Random Forest CV Accuracy: 0.7868 SVM CV Accuracy: 0.7481
In [21]:
# Confusion matrix on the held-out test set for the best CV model (Random Forest).
y_pred = forest.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

# Explicit Axes interface rather than the pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Churn', 'Churn'],
            yticklabels=['No Churn', 'Churn'],
            ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Random Forest')
plt.show()

# Per-class precision/recall/F1 alongside the matrix.
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.83 0.90 0.87 1035
1 0.65 0.49 0.56 374
accuracy 0.79 1409
macro avg 0.74 0.70 0.71 1409
weighted avg 0.78 0.79 0.78 1409
In [22]:
# Rank features by the forest's impurity-based importance scores.
importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': forest.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
print("\nTop 10 Important Features (Random Forest):")
print(importances.head(10))
Top 10 Important Features (Random Forest):
Feature Importance
15 TotalCharges 0.171017
4 tenure 0.168982
14 MonthlyCharges 0.164893
19 Contract_Two year 0.059942
16 InternetService_Fiber optic 0.053535
21 PaymentMethod_Electronic check 0.037796
18 Contract_One year 0.029666
13 PaperlessBilling 0.028259
0 gender 0.026159
7 OnlineSecurity 0.025893
In [24]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Create a sorted dataframe of feature importances
importances = pd.DataFrame({
'Feature': X.columns,
'Importance': forest.feature_importances_
}).sort_values(by='Importance', ascending=False)
# Print top features
print("\nTop 10 Important Features (Random Forest):")
print(importances.head(10))
# Plot top 10 features
plt.figure(figsize=(8,5))
sns.barplot(
x='Importance',
y='Feature',
data=importances.head(10),
palette='viridis'
)
plt.title("🔍 Top 10 Feature Importances (Random Forest)", fontsize=13, weight='bold')
plt.xlabel("Feature Importance Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()
Top 10 Important Features (Random Forest):
Feature Importance
15 TotalCharges 0.171017
4 tenure 0.168982
14 MonthlyCharges 0.164893
19 Contract_Two year 0.059942
16 InternetService_Fiber optic 0.053535
21 PaymentMethod_Electronic check 0.037796
18 Contract_One year 0.029666
13 PaperlessBilling 0.028259
0 gender 0.026159
7 OnlineSecurity 0.025893