In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [3]:
# Load dataset
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
In [4]:
df.head()
Out[4]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
In [6]:
df.isna().sum()
Out[6]:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

no na values

In [7]:
df.drop('customerID',axis=1,inplace=True)
In [8]:
# Convert 'TotalCharges' to numeric (it's often loaded as object due to empty strings)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Reason: 'coerce' turns non-convertible values (like empty strings) into NaN.

# Handle missing 'TotalCharges' values (if any, usually few)
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
# Reason: Impute with median to preserve distribution and prevent data loss.
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 
 17  MonthlyCharges    7043 non-null   float64
 18  TotalCharges      7043 non-null   float64
 19  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(16)
memory usage: 1.1+ MB
In [10]:
# Convert 'No internet service' to 'No' for consistency in some features
df.replace('No internet service', 'No', inplace=True)
df.replace('No phone service', 'No', inplace=True)

# Convert 'Yes'/'No' and 'Male'/'Female' to 1/0
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity',
                   'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                   'StreamingMovies', 'PaperlessBilling', 'Churn']
for col in binary_cols:
        df[col] = df[col].apply(lambda x: 1 if x == 'Yes' else 0)
df['gender'] = df['gender'].apply(lambda x: 1 if x == 'Male' else 0) # Male=1, Female=0

# One-hot encode other categorical features
categorical_cols = ['InternetService', 'Contract', 'PaymentMethod']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
# Reason: One-hot encoding creates binary columns for categories, avoiding false ordinality.
In [11]:
df.head()
Out[11]:
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines OnlineSecurity OnlineBackup DeviceProtection ... MonthlyCharges TotalCharges Churn InternetService_Fiber optic InternetService_No Contract_One year Contract_Two year PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check
0 0 0 1 0 1 0 0 0 1 0 ... 29.85 29.85 0 False False False False False True False
1 1 0 0 0 34 1 0 1 0 1 ... 56.95 1889.50 0 False False True False False False True
2 1 0 0 0 2 1 0 1 1 0 ... 53.85 108.15 1 False False False False False False True
3 1 0 0 0 45 0 0 1 0 1 ... 42.30 1840.75 0 False False True False False False False
4 0 0 0 0 2 1 0 0 0 0 ... 70.70 151.65 1 True False False False False True False

5 rows × 24 columns

In [12]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 7043 non-null   int64  
 1   SeniorCitizen                          7043 non-null   int64  
 2   Partner                                7043 non-null   int64  
 3   Dependents                             7043 non-null   int64  
 4   tenure                                 7043 non-null   int64  
 5   PhoneService                           7043 non-null   int64  
 6   MultipleLines                          7043 non-null   int64  
 7   OnlineSecurity                         7043 non-null   int64  
 8   OnlineBackup                           7043 non-null   int64  
 9   DeviceProtection                       7043 non-null   int64  
 10  TechSupport                            7043 non-null   int64  
 11  StreamingTV                            7043 non-null   int64  
 12  StreamingMovies                        7043 non-null   int64  
 13  PaperlessBilling                       7043 non-null   int64  
 14  MonthlyCharges                         7043 non-null   float64
 15  TotalCharges                           7043 non-null   float64
 16  Churn                                  7043 non-null   int64  
 17  InternetService_Fiber optic            7043 non-null   bool   
 18  InternetService_No                     7043 non-null   bool   
 19  Contract_One year                      7043 non-null   bool   
 20  Contract_Two year                      7043 non-null   bool   
 21  PaymentMethod_Credit card (automatic)  7043 non-null   bool   
 22  PaymentMethod_Electronic check         7043 non-null   bool   
 23  PaymentMethod_Mailed check             7043 non-null   bool   
dtypes: bool(7), float64(2), int64(15)
memory usage: 983.7 KB
In [13]:
from sklearn.model_selection import train_test_split

X = df.drop('Churn', axis=1)
y = df['Churn']
In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Reason: stratify=y ensures that the train/test sets have roughly the same proportion of churners as the original dataset.

model selection¶

In [18]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
In [19]:
# Logistic Regression with GridSearch
param_grid_log = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear']}
grid_log = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced'), param_grid_log, cv=5)
grid_log.fit(X_train, y_train)

# Decision Tree with GridSearch
param_grid_tree = {'max_depth': [2, 4, 6, 8, 10], 'min_samples_split': [2, 5, 10]}
grid_tree = GridSearchCV(DecisionTreeClassifier(), param_grid_tree, cv=5)
grid_tree.fit(X_train, y_train)

# Random Forest
forest = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
forest.fit(X_train, y_train)

# Support Vector Machine with pipeline and GridSearch
pipe_svc = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(class_weight='balanced'))
])
param_grid_svc = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf']
}
grid_svc = GridSearchCV(pipe_svc, param_grid_svc, cv=5)
grid_svc.fit(X_train, y_train)
Out[19]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('svc', SVC(class_weight='balanced'))]),
             param_grid={'svc__C': [0.1, 1, 10],
                         'svc__kernel': ['linear', 'rbf']})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('svc', SVC(class_weight='balanced'))]),
             param_grid={'svc__C': [0.1, 1, 10],
                         'svc__kernel': ['linear', 'rbf']})
Pipeline(steps=[('scaler', StandardScaler()),
                ('svc', SVC(C=1, class_weight='balanced'))])
StandardScaler()
SVC(C=1, class_weight='balanced')
In [20]:
# Evaluation
models = {
    "Logistic Regression": grid_log.best_estimator_,
    "Decision Tree": grid_tree.best_estimator_,
    "Random Forest": forest,
    "SVM": grid_svc.best_estimator_
}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"{name} CV Accuracy: {scores.mean():.4f}")
Logistic Regression CV Accuracy: 0.7556
Decision Tree CV Accuracy: 0.7843
Random Forest CV Accuracy: 0.7868
SVM CV Accuracy: 0.7481
In [21]:
# Best Model Confusion Matrix (Random Forest here)
y_pred = forest.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No Churn', 'Churn'], yticklabels=['No Churn', 'Churn'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Random Forest')
plt.show()

print(classification_report(y_test, y_pred))
No description has been provided for this image
              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1035
           1       0.65      0.49      0.56       374

    accuracy                           0.79      1409
   macro avg       0.74      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409

In [22]:
# Feature Importance
importances = pd.DataFrame({'Feature': X.columns, 'Importance': forest.feature_importances_})
importances = importances.sort_values(by='Importance', ascending=False)
print("\nTop 10 Important Features (Random Forest):")
print(importances.head(10))
Top 10 Important Features (Random Forest):
                           Feature  Importance
15                    TotalCharges    0.171017
4                           tenure    0.168982
14                  MonthlyCharges    0.164893
19               Contract_Two year    0.059942
16     InternetService_Fiber optic    0.053535
21  PaymentMethod_Electronic check    0.037796
18               Contract_One year    0.029666
13                PaperlessBilling    0.028259
0                           gender    0.026159
7                   OnlineSecurity    0.025893
In [24]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Create a sorted dataframe of feature importances
importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': forest.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Print top features
print("\nTop 10 Important Features (Random Forest):")
print(importances.head(10))

# Plot top 10 features
plt.figure(figsize=(8,5))
sns.barplot(
    x='Importance', 
    y='Feature', 
    data=importances.head(10),
    palette='viridis'
)
plt.title("🔍 Top 10 Feature Importances (Random Forest)", fontsize=13, weight='bold')
plt.xlabel("Feature Importance Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()
Top 10 Important Features (Random Forest):
                           Feature  Importance
15                    TotalCharges    0.171017
4                           tenure    0.168982
14                  MonthlyCharges    0.164893
19               Contract_Two year    0.059942
16     InternetService_Fiber optic    0.053535
21  PaymentMethod_Electronic check    0.037796
18               Contract_One year    0.029666
13                PaperlessBilling    0.028259
0                           gender    0.026159
7                   OnlineSecurity    0.025893
No description has been provided for this image