import pandas as pd
import numpy as np
#Visualization
from matplotlib import pyplot as plt
import seaborn as sns
#Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
#Model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
# Load the dataset from CSV, using the 'id' column as the DataFrame index.
df = pd.read_csv('classificationdata.csv', index_col='id')
# Preview the first rows (displayed as the notebook cell output).
df.head()
state_code | tenure | contract_length | promotions_offered | remaining_term | last_nps_rating | area_code | international_plan | voice_mail_plan | number_vmail_messages | ... | total_eve_calls | total_eve_charge | total_night_minutes | total_night_calls | total_night_charge | total_intl_minutes | total_intl_calls | total_intl_charge | number_customer_service_calls | churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||||||||||
0 | HI | 156 | 14.0 | Yes | 1.0 | 6.0 | area_code_510 | no | no | 0 | ... | 108 | 19.138302 | 208.349932 | 130 | 9.190181 | 8.015688 | 7 | 2.248902 | 7 | no |
1 | MI | 216 | 8.0 | No | 14.0 | 9.0 | area_code_408 | no | no | 3 | ... | 71 | 15.474436 | 228.902063 | 85 | 10.277852 | 9.683971 | 8 | 2.609739 | 3 | no |
2 | NH | 18 | 20.0 | No | 12.0 | 1.0 | area_code_408 | no | no | 1 | ... | 55 | 22.547297 | 202.353527 | 127 | 8.898488 | 14.039450 | 8 | 3.845776 | 2 | no |
3 | MN | 174 | 9.0 | No | 12.0 | 6.0 | area_code_415 | no | no | 2 | ... | 105 | 16.666506 | 214.487530 | 105 | 9.740333 | 13.031063 | 4 | 3.525823 | 1 | no |
4 | TX | 68 | 19.0 | No | 22.0 | 5.0 | area_code_415 | no | no | 1 | ... | 88 | 20.408969 | 190.047534 | 113 | 8.813303 | 6.760950 | 4 | 1.828652 | 0 | no |
5 rows × 24 columns
# Build a one-line summary of the dataset size from df.shape (rows, columns).
# NOTE(review): the literal "y" in the message looks like a leftover Spanish
# "and" — confirm before changing the displayed text.
f'The dataset contains {df.shape[0]} rows y {df.shape[1]} columns'
'The dataset contains 17243 rows y 24 columns'
The columns in the dataset are:
# Display the column labels of the DataFrame.
df.columns
Index(['state_code', 'tenure', 'contract_length', 'promotions_offered', 'remaining_term', 'last_nps_rating', 'area_code', 'international_plan', 'voice_mail_plan', 'number_vmail_messages', 'total_day_minutes', 'total_day_calls', 'total_day_charge', 'total_eve_minutes', 'total_eve_calls', 'total_eve_charge', 'total_night_minutes', 'total_night_calls', 'total_night_charge', 'total_intl_minutes', 'total_intl_calls', 'total_intl_charge', 'number_customer_service_calls', 'churn'], dtype='object')
And there are 6 categorical and 18 numerical features:
# Print the per-column dtype and non-null count summary.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 17243 entries, 0 to 17242 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 state_code 17243 non-null object 1 tenure 17243 non-null int64 2 contract_length 17196 non-null float64 3 promotions_offered 17196 non-null object 4 remaining_term 17196 non-null float64 5 last_nps_rating 17196 non-null float64 6 area_code 17231 non-null object 7 international_plan 17243 non-null object 8 voice_mail_plan 17213 non-null object 9 number_vmail_messages 17243 non-null int64 10 total_day_minutes 17243 non-null float64 11 total_day_calls 17243 non-null int64 12 total_day_charge 17243 non-null float64 13 total_eve_minutes 17225 non-null float64 14 total_eve_calls 17243 non-null int64 15 total_eve_charge 17243 non-null float64 16 total_night_minutes 17243 non-null float64 17 total_night_calls 17243 non-null int64 18 total_night_charge 17243 non-null float64 19 total_intl_minutes 17243 non-null float64 20 total_intl_calls 17243 non-null int64 21 total_intl_charge 17243 non-null float64 22 number_customer_service_calls 17243 non-null int64 23 churn 17196 non-null object dtypes: float64(11), int64(7), object(6) memory usage: 3.3+ MB
The dataset also contains null values:
# Count the missing (null) values in each column.
df.isnull().sum()
state_code 0 tenure 0 contract_length 47 promotions_offered 47 remaining_term 47 last_nps_rating 47 area_code 12 international_plan 0 voice_mail_plan 30 number_vmail_messages 0 total_day_minutes 0 total_day_calls 0 total_day_charge 0 total_eve_minutes 18 total_eve_calls 0 total_eve_charge 0 total_night_minutes 0 total_night_calls 0 total_night_charge 0 total_intl_minutes 0 total_intl_calls 0 total_intl_charge 0 number_customer_service_calls 0 churn 47 dtype: int64
The histograms below show the distribution of each numerical feature, coloured by churn:
# Draw one histogram per numeric column, coloured by churn, on an 8x3 grid.
plt.figure(figsize=(20, 30))
numeric_columns = df.select_dtypes(exclude='object').columns
for position, column in enumerate(numeric_columns, start=1):
    # Subplot slots are 1-based, hence enumerate(start=1).
    axis = plt.subplot(8, 3, position)
    sns.histplot(data=df, x=column, edgecolor=None, hue='churn', ax=axis)
    # Drop the default "Count" label to reduce clutter between panels.
    axis.set_ylabel('')
plt.tight_layout()
We can look more closely at `remaining_term` and `last_nps_rating` using violin plots grouped by churn:
# Violin plots of the two features of interest, grouped by churn.
#
# The subplot slot comes from each column's position in this explicit list,
# not from its index among all numeric columns. The index-based version only
# worked while these columns happened to sit at indices 2 and 3; it left half
# of the figure empty and would raise ValueError if the column order changed.
violin_columns = ['remaining_term', 'last_nps_rating']

# One row of panels; the height is halved so each panel keeps the size it had
# in the bottom row of the former 2x2 layout.
plt.figure(figsize=(15, 6))
for position, column in enumerate(violin_columns, start=1):
    plt.subplot(1, len(violin_columns), position)
    sns.violinplot(data=df, y=column, x='churn')
    plt.title(column)
plt.tight_layout()
# Count plot for every categorical (object-dtype) column on an 8x2 grid.
plt.figure(figsize=(20, 35))
categorical_columns = df.select_dtypes(include='object').columns
for position, column in enumerate(categorical_columns, start=1):
    plt.subplot(8, 2, position)
    if column == 'churn':
        # The target itself is plotted without hue: splitting churn by churn
        # carries no information.
        sns.countplot(data=df, x=column)
    else:
        sns.countplot(data=df, x=column, hue='churn')
        if column == 'state_code':
            # Many state codes share the axis; rotate the labels so they stay
            # readable. This is applied after the single draw so the rotation
            # is not lost (previously 'state_code' fell through to the generic
            # branch too and was plotted twice on the same axes).
            plt.xticks(rotation=90)