In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [3]:
# Load the IBM Telco Customer Churn dataset (CSV expected in the working directory).
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
In [4]:
# Preview the first five rows to eyeball column names and value formats.
df.head()
Out[4]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
In [5]:
# Inspect dtypes and non-null counts; note TotalCharges loads as object (strings).
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.1+ MB
In [6]:
# Count missing values per column (blank strings are NOT counted as NaN here).
df.isna().sum()
Out[6]:
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
No NA values are reported at this point — but note that `TotalCharges` contains blank strings, which pandas does not count as NaN until the column is converted to numeric (handled below).
In [7]:
# Drop the identifier column — a unique ID carries no predictive signal.
# Reassign instead of inplace=True: no performance benefit to inplace, and
# explicit assignment avoids hidden-state mutation surprises on re-runs.
df = df.drop(columns='customerID')
In [8]:
# Convert 'TotalCharges' to numeric (it loads as object because some rows are
# empty strings). errors='coerce' turns non-convertible values into NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Impute the resulting NaNs with the median to preserve the distribution and
# avoid dropping rows. Assign the result back rather than calling
# fillna(inplace=True) on the column: inplace fillna on a column selection is
# chained assignment — it raises a FutureWarning on pandas 2.x and stops
# updating the frame under pandas 3.0 copy-on-write.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
In [9]:
# Re-check dtypes: TotalCharges should now be float64 after the conversion.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 7043 non-null object 1 SeniorCitizen 7043 non-null int64 2 Partner 7043 non-null object 3 Dependents 7043 non-null object 4 tenure 7043 non-null int64 5 PhoneService 7043 non-null object 6 MultipleLines 7043 non-null object 7 InternetService 7043 non-null object 8 OnlineSecurity 7043 non-null object 9 OnlineBackup 7043 non-null object 10 DeviceProtection 7043 non-null object 11 TechSupport 7043 non-null object 12 StreamingTV 7043 non-null object 13 StreamingMovies 7043 non-null object 14 Contract 7043 non-null object 15 PaperlessBilling 7043 non-null object 16 PaymentMethod 7043 non-null object 17 MonthlyCharges 7043 non-null float64 18 TotalCharges 7043 non-null float64 19 Churn 7043 non-null object dtypes: float64(2), int64(2), object(16) memory usage: 1.1+ MB
In [10]:
# Collapse 'No internet service' / 'No phone service' into plain 'No' so the
# add-on service columns become clean Yes/No binaries. One reassigning replace
# instead of two inplace calls (inplace is discouraged and blocks chaining).
df = df.replace({'No internet service': 'No', 'No phone service': 'No'})

# Encode Yes/No columns as 1/0 with a vectorized comparison — same semantics
# as the previous per-row apply (anything that is not 'Yes' becomes 0), but
# faster and clearer.
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity',
               'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
               'StreamingMovies', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    df[col] = (df[col] == 'Yes').astype(int)
df['gender'] = (df['gender'] == 'Male').astype(int)  # Male=1, Female=0

# One-hot encode the remaining multi-category features.
# Reason: one-hot encoding creates binary columns per category, avoiding false
# ordinality; drop_first avoids the dummy-variable trap (perfect collinearity).
categorical_cols = ['InternetService', 'Contract', 'PaymentMethod']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
In [11]:
# Spot-check the encoded frame: all features should now be numeric/boolean.
df.head()
Out[11]:
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | OnlineSecurity | OnlineBackup | DeviceProtection | ... | MonthlyCharges | TotalCharges | Churn | InternetService_Fiber optic | InternetService_No | Contract_One year | Contract_Two year | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 29.85 | 29.85 | 0 | False | False | False | False | False | True | False |
| 1 | 1 | 0 | 0 | 0 | 34 | 1 | 0 | 1 | 0 | 1 | ... | 56.95 | 1889.50 | 0 | False | False | True | False | False | False | True |
| 2 | 1 | 0 | 0 | 0 | 2 | 1 | 0 | 1 | 1 | 0 | ... | 53.85 | 108.15 | 1 | False | False | False | False | False | False | True |
| 3 | 1 | 0 | 0 | 0 | 45 | 0 | 0 | 1 | 0 | 1 | ... | 42.30 | 1840.75 | 0 | False | False | True | False | False | False | False |
| 4 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | ... | 70.70 | 151.65 | 1 | True | False | False | False | False | True | False |
5 rows × 24 columns
In [12]:
# Final dtype check before modelling — no object columns should remain.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 7043 non-null int64 1 SeniorCitizen 7043 non-null int64 2 Partner 7043 non-null int64 3 Dependents 7043 non-null int64 4 tenure 7043 non-null int64 5 PhoneService 7043 non-null int64 6 MultipleLines 7043 non-null int64 7 OnlineSecurity 7043 non-null int64 8 OnlineBackup 7043 non-null int64 9 DeviceProtection 7043 non-null int64 10 TechSupport 7043 non-null int64 11 StreamingTV 7043 non-null int64 12 StreamingMovies 7043 non-null int64 13 PaperlessBilling 7043 non-null int64 14 MonthlyCharges 7043 non-null float64 15 TotalCharges 7043 non-null float64 16 Churn 7043 non-null int64 17 InternetService_Fiber optic 7043 non-null bool 18 InternetService_No 7043 non-null bool 19 Contract_One year 7043 non-null bool 20 Contract_Two year 7043 non-null bool 21 PaymentMethod_Credit card (automatic) 7043 non-null bool 22 PaymentMethod_Electronic check 7043 non-null bool 23 PaymentMethod_Mailed check 7043 non-null bool dtypes: bool(7), float64(2), int64(15) memory usage: 983.7 KB
In [13]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the target vector.
X = df.drop(columns='Churn')
y = df['Churn']
In [14]:
# 80/20 train/test split; random_state=42 pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Reason: stratify=y ensures that the train/test sets have roughly the same proportion of churners as the original dataset.
Model selection
In [18]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
In [19]:
# Logistic Regression with GridSearch.
# class_weight='balanced' re-weights samples to compensate for the minority
# churn class.
param_grid_log = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear']}
grid_log = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced'), param_grid_log, cv=5)
grid_log.fit(X_train, y_train)

# Decision Tree with GridSearch.
# random_state=42 added: split tie-breaking in DecisionTreeClassifier is
# stochastic, so without a seed the tuned tree (and its CV score) was not
# reproducible across runs — every other model here is already seeded.
param_grid_tree = {'max_depth': [2, 4, 6, 8, 10], 'min_samples_split': [2, 5, 10]}
grid_tree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_tree, cv=5)
grid_tree.fit(X_train, y_train)

# Random Forest (fixed hyperparameters, no grid search).
forest = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
forest.fit(X_train, y_train)

# SVM: scaling lives inside the Pipeline so the scaler is fit only on each CV
# training fold during the grid search (prevents leakage from validation folds).
pipe_svc = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(class_weight='balanced'))
])
param_grid_svc = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf']
}
grid_svc = GridSearchCV(pipe_svc, param_grid_svc, cv=5)
grid_svc.fit(X_train, y_train)
Out[19]:
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('svc', SVC(class_weight='balanced'))]),
param_grid={'svc__C': [0.1, 1, 10],
'svc__kernel': ['linear', 'rbf']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('svc', SVC(class_weight='balanced'))]),
param_grid={'svc__C': [0.1, 1, 10],
'svc__kernel': ['linear', 'rbf']})Pipeline(steps=[('scaler', StandardScaler()),
('svc', SVC(C=1, class_weight='balanced'))])StandardScaler()
SVC(C=1, class_weight='balanced')
In [20]:
# Compare the tuned candidates on 5-fold cross-validated accuracy,
# using the training data only (the test set stays untouched until the end).
models = {
    "Logistic Regression": grid_log.best_estimator_,
    "Decision Tree": grid_tree.best_estimator_,
    "Random Forest": forest,
    "SVM": grid_svc.best_estimator_
}
for name, candidate in models.items():
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5)
    print(f"{name} CV Accuracy: {fold_scores.mean():.4f}")
Logistic Regression CV Accuracy: 0.7556 Decision Tree CV Accuracy: 0.7843 Random Forest CV Accuracy: 0.7868 SVM CV Accuracy: 0.7481
In [21]:
# Confusion matrix on the held-out test set for the best CV model (Random Forest).
y_pred = forest.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

# Explicit Axes interface rather than the pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Churn', 'Churn'],
            yticklabels=['No Churn', 'Churn'],
            ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Random Forest')
plt.show()

# Per-class precision/recall/F1 alongside the matrix.
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.83 0.90 0.87 1035
1 0.65 0.49 0.56 374
accuracy 0.79 1409
macro avg 0.74 0.70 0.71 1409
weighted avg 0.78 0.79 0.78 1409
In [22]:
# Rank features by the forest's impurity-based importance scores.
importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': forest.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
print("\nTop 10 Important Features (Random Forest):")
print(importances.head(10))
Top 10 Important Features (Random Forest):
Feature Importance
15 TotalCharges 0.171017
4 tenure 0.168982
14 MonthlyCharges 0.164893
19 Contract_Two year 0.059942
16 InternetService_Fiber optic 0.053535
21 PaymentMethod_Electronic check 0.037796
18 Contract_One year 0.029666
13 PaperlessBilling 0.028259
0 gender 0.026159
7 OnlineSecurity 0.025893
In [24]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Create a sorted dataframe of feature importances
importances = pd.DataFrame({
'Feature': X.columns,
'Importance': forest.feature_importances_
}).sort_values(by='Importance', ascending=False)
# Print top features
print("\nTop 10 Important Features (Random Forest):")
print(importances.head(10))
# Plot top 10 features
plt.figure(figsize=(8,5))
sns.barplot(
x='Importance',
y='Feature',
data=importances.head(10),
palette='viridis'
)
plt.title("🔍 Top 10 Feature Importances (Random Forest)", fontsize=13, weight='bold')
plt.xlabel("Feature Importance Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()
Top 10 Important Features (Random Forest):
Feature Importance
15 TotalCharges 0.171017
4 tenure 0.168982
14 MonthlyCharges 0.164893
19 Contract_Two year 0.059942
16 InternetService_Fiber optic 0.053535
21 PaymentMethod_Electronic check 0.037796
18 Contract_One year 0.029666
13 PaperlessBilling 0.028259
0 gender 0.026159
7 OnlineSecurity 0.025893