Spaces:
Sleeping
Sleeping
Uploaded files
Browse files- WA_Fn-UseC_-Telco-Customer-Churn.csv +0 -0
- dashboard.py +145 -0
- func.py +235 -0
- requirements.txt +8 -0
WA_Fn-UseC_-Telco-Customer-Churn.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
dashboard.py
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
import seaborn as sns
|
5 |
+
import plotly.express as px
|
6 |
+
import plotly.graph_objects as go
|
7 |
+
from plotly.subplots import make_subplots
|
8 |
+
import warnings
|
9 |
+
import streamlit as st
|
10 |
+
warnings.filterwarnings('ignore')
|
11 |
+
from sklearn.preprocessing import StandardScaler
|
12 |
+
from sklearn.preprocessing import LabelEncoder
|
13 |
+
from sklearn.tree import DecisionTreeClassifier
|
14 |
+
from sklearn.ensemble import RandomForestClassifier
|
15 |
+
from sklearn.naive_bayes import GaussianNB
|
16 |
+
from sklearn.neighbors import KNeighborsClassifier
|
17 |
+
from sklearn.svm import SVC
|
18 |
+
from sklearn.neural_network import MLPClassifier
|
19 |
+
from sklearn.ensemble import AdaBoostClassifier
|
20 |
+
from sklearn.ensemble import GradientBoostingClassifier
|
21 |
+
from sklearn.ensemble import ExtraTreesClassifier
|
22 |
+
from sklearn.linear_model import LogisticRegression
|
23 |
+
from sklearn.model_selection import train_test_split
|
24 |
+
from sklearn.metrics import accuracy_score
|
25 |
+
from xgboost import XGBClassifier
|
26 |
+
from sklearn import metrics
|
27 |
+
from sklearn.metrics import roc_curve
|
28 |
+
from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report
|
29 |
+
import func as fc
|
30 |
+
from io import StringIO
|
31 |
+
# Page-wide configuration and the two top-level tabs of the dashboard.
st.set_page_config(layout='wide')
tab1, tab2 = st.tabs(['Data','ML'])

# Choices offered by the two select boxes; both lists are defined in func.py.
optionList = fc.OPTION_LIST
modelList = fc.MODEL_SELECTOR

# "Data" tab: upload a CSV, then show the preprocessed table and every chart.
with tab1:
    option = st.selectbox('Select the plot you want to visualize',optionList)
    uploaded_dataframe = st.file_uploader("Choose a file")
    # NOTE(review): `option` only gates rendering; all thirteen charts are
    # drawn whichever entry is selected.
    if uploaded_dataframe is not None and option is not None:
        # take_input returns the thirteen figures followed by the cleaned dataframe.
        *figures, processed_df = fc.take_input(uploaded_dataframe)
        st.dataframe(processed_df)
        # Three rows of three charts, then a final row of four.
        chart_rows = (figures[0:3], figures[3:6], figures[6:9], figures[9:13])
        for row_figures in chart_rows:
            with st.container():
                columns = st.columns(len(row_figures))
                for column, figure in zip(columns, row_figures):
                    with column:
                        st.plotly_chart(figure, use_container_width=True)
|
80 |
+
|
81 |
+
|
82 |
+
#removing the secondary tab
|
83 |
+
#with tab2:
|
84 |
+
# st.plotly_chart(figure,use_container_width=True)
|
85 |
+
# "ML" tab: train the selected model on an uploaded CSV, report its test
# metrics, then collect one customer's attributes and predict their churn.
with tab2:
    modeloption = st.selectbox('Select an ML Model',modelList)
    uploaded_dataframe = st.file_uploader("Choose a file", key=2)
    # train_test_split rejects a float test_size of 0.0 or 1.0, and a slider
    # without an explicit value starts at its minimum. Keep the range strictly
    # inside (0, 1) and start at a conventional 20% hold-out.
    test_size_slider = st.slider('Enter the test size: ', 0.05, 0.95, 0.2)
    random_state_input = st.number_input('Select a random seed',0,1000)
    if uploaded_dataframe is not None:
        acc_score, classification_rep, output_df, original_df = fc.standardize_dataframe(
            uploaded_dataframe, modeloption, test_size_slider, random_state_input)
        st.dataframe(output_df)
        st.metric(label='Accuracy Score of '+modeloption,value=str(acc_score))
        st.markdown('```bash \t \n'+classification_rep+'```')

        st.write('Enter some information to predict the churn:')
        yes_no = ['Yes','No']
        internet_addon = ['Yes','No','No internet service']
        # Dict values are evaluated top to bottom, so the widgets appear on the
        # page in this order; the keys are the dataset's column names.
        customer = {
            'customerID': 1,
            'gender': st.selectbox('Select the gender:',['Female','Male']),
            'SeniorCitizen': st.selectbox('Is the customer a senior citizen?',yes_no),
            'Partner': st.selectbox('Does the customer have a partner?',yes_no),
            'Dependents': st.selectbox('Does the customer have dependents?',yes_no),
            'tenure': st.number_input('What is the customer tenure?',0,100),
            'PhoneService': st.selectbox('Does the customer have phone service?',yes_no),
            'MultipleLines': st.selectbox('Does the customer have multiple lines?',['Yes','No','No phone service']),
            'InternetService': st.selectbox('Does the customer have internet service?',['No','DSL','Fiber optic']),
            'OnlineSecurity': st.selectbox('Does the customer have online security?',internet_addon),
            'OnlineBackup': st.selectbox('Does the customer have online backup?',internet_addon),
            'DeviceProtection': st.selectbox('Does the customer have device protection?',internet_addon),
            'TechSupport': st.selectbox('Does the customer have tech support?',internet_addon),
            'StreamingTV': st.selectbox('Does the customer have streaming TV?',internet_addon),
            'StreamingMovies': st.selectbox('Does the customer have streaming movies?',internet_addon),
            'Contract': st.selectbox('Does the customer have a contract?',['Month-to-month','One year','Two year']),
            'PaperlessBilling': st.selectbox('Does the customer have paperless billing?',yes_no),
            'PaymentMethod': st.selectbox('What is the payment method of the customer?',['Electronic check','Mailed check','Bank transfer (automatic)','Credit card (automatic)']),
            'MonthlyCharges': st.number_input('What are the monthly charges of the customer?'),
            'TotalCharges': st.number_input('What are the total charges of the customer?'),
        }
        if st.button('Predict Churn'):
            # One-row dataframe with the same columns as the uploaded dataset
            # (minus Churn); func.py encodes it and feeds it to a voting classifier.
            feature_vector = pd.DataFrame({name: [value] for name, value in customer.items()})
            response = fc.standardize_feature_vector(
                feature_vector, original_df, test_size_slider, random_state_input)
            st.metric(label='Prediction Response',value=response)
|
func.py
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from plotly.subplots import make_subplots
|
4 |
+
import plotly.graph_objects as go
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
import plotly.express as px
|
7 |
+
from sklearn.preprocessing import StandardScaler
|
8 |
+
from sklearn.preprocessing import LabelEncoder
|
9 |
+
from sklearn.model_selection import train_test_split
|
10 |
+
from sklearn.metrics import classification_report,accuracy_score
|
11 |
+
from sklearn.neighbors import KNeighborsClassifier
|
12 |
+
from sklearn.ensemble import VotingClassifier
|
13 |
+
from sklearn.ensemble import AdaBoostClassifier
|
14 |
+
from sklearn.ensemble import GradientBoostingClassifier
|
15 |
+
from sklearn.svm import SVC
|
16 |
+
from sklearn.tree import DecisionTreeClassifier
|
17 |
+
from sklearn.ensemble import RandomForestClassifier
|
18 |
+
from sklearn.linear_model import LogisticRegression
|
19 |
+
# Titles offered by the plot selector on the dashboard's "Data" tab.
OPTION_LIST = [
    'Gender and Churn Distribution',
    'Customer Contract Distribution',
    'Payment Method Distribution',
    'Payment Method Distribution Churn',
    'Churn Distribution w.r.t Internet Service and Gender',
    'Dependents Distribution Churn',
    'Churn Distribution w.r.t Partners',
    'Churn Distribution w.r.t Senior Citizens',
    'Churn Distribution w.r.t Online Security',
    'Churn Distribution w.r.t Paperless Billing',
    'Churn Distribution w.r.t Tech Support',
    'Churn Distribution w.r.t Phone Service',
    'Tenure vs. Churn',
]

# Model names accepted by standardize_dataframe's `option` argument.
MODEL_SELECTOR = [
    'KNN',
    'SVC',
    'RF',
    'LR',
    'DT',
    'Adaboost',
    'Gradient Boosting',
    'Voting Classifier',
]

# Numeric columns that are standardized before model fitting.
num_cols = [
    "tenure",
    'MonthlyCharges',
    'TotalCharges',
]
|
27 |
+
# Module-level StandardScaler instance. The training functions below call
# fit_transform on it for the training split's numeric columns (num_cols) and
# transform for the test split, so it is re-fitted on every training run.
scaler= StandardScaler()
|
28 |
+
def preprocess(df):
    """Clean a raw Telco churn dataframe for plotting and modelling.

    The input dataframe is left untouched; a cleaned copy is returned with:

    * the ``customerID`` column dropped,
    * ``TotalCharges`` converted to numbers (unparseable entries become NaN),
    * rows whose ``tenure`` is 0 removed,
    * remaining missing ``TotalCharges`` replaced by the column mean,
    * ``SeniorCitizen`` mapped from 0/1 to "No"/"Yes".

    Args:
        df: dataframe containing at least the ``customerID``, ``tenure``,
            ``TotalCharges`` and ``SeniorCitizen`` columns.

    Returns:
        The cleaned dataframe (original row index preserved).
    """
    df = df.drop(['customerID'], axis=1)
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    # Keep everything except zero-tenure rows; copy so later assignments do
    # not write through to a slice of the intermediate frame.
    df = df[df['tenure'] != 0].copy()
    # fillna returns a new object; the result has to be assigned back or the
    # missing values are never actually imputed.
    df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())
    df["SeniorCitizen"] = df["SeniorCitizen"].map({0: "No", 1: "Yes"})
    return df
|
38 |
+
def object_to_int(dataframe_series):
    """Label-encode a text column; return any other column unchanged.

    Intended for use with ``DataFrame.apply`` so that every categorical
    column is turned into integer codes before model fitting.

    Args:
        dataframe_series: one column of the dataframe.

    Returns:
        The integer codes produced by ``LabelEncoder().fit_transform`` for a
        text column, otherwise the input series itself.
    """
    # Comparing the dtype with 'object' alone misses pandas' dedicated string
    # dtype, which would leave text columns unencoded; accept both.
    is_text = (pd.api.types.is_object_dtype(dataframe_series)
               or pd.api.types.is_string_dtype(dataframe_series))
    if is_text:
        return LabelEncoder().fit_transform(dataframe_series)
    return dataframe_series
|
42 |
+
def evaluate_voter(test_feature_vector, df, test_size, random_state):
    """Train a soft-voting ensemble on ``df`` and classify one customer.

    ``df`` is preprocessed and label-encoded, split with ``train_test_split``
    (stratified on ``Churn``), and the numeric columns of the training split
    are standardized. A soft ``VotingClassifier`` built from gradient boosting,
    logistic regression and AdaBoost is fitted on that split and used to
    predict the first row of ``test_feature_vector``.

    Args:
        test_feature_vector: dataframe holding the customer to classify, already
            encoded as integers and containing the same feature columns as the
            training data, with the numeric columns still unscaled.
        df: raw dataset (including ``customerID`` and ``Churn``) to train on.
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.

    Returns:
        A sentence stating whether the customer is likely to stop or to
        continue using the services.
    """
    df = preprocess(df)
    df = df.apply(object_to_int)
    X = df.drop(columns=['Churn'])
    y = df['Churn'].values
    # Only the training part is needed here; the split is kept so the model is
    # fitted on the same rows as the evaluation run with identical settings.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    X_train = X_train.copy()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

    clf1 = GradientBoostingClassifier()
    clf2 = LogisticRegression()
    clf3 = AdaBoostClassifier()
    eclf1 = VotingClassifier(estimators=[('gbc', clf1), ('lr', clf2), ('abc', clf3)], voting='soft')
    eclf1.fit(X_train, y_train)

    # The model was fitted on standardized numeric columns, so the incoming
    # vector must go through the same fitted scaler (and the same column
    # order) before prediction; otherwise raw magnitudes distort the result.
    sample = test_feature_vector[list(X_train.columns)].copy()
    sample[num_cols] = scaler.transform(sample[num_cols])
    predicted_y = eclf1.predict(sample)
    if predicted_y[0] == 1:
        return 'Customer is likely to stop using the telecom services'
    return 'Customer is likely to continue using the telecom services'
|
65 |
+
|
66 |
+
|
67 |
+
def standardize_feature_vector(df, original_df, test_size, random_state):
    """Encode a one-row customer dataframe and predict its churn.

    The categorical answers are converted to the integer codes that
    ``LabelEncoder`` assigns on the training data (alphabetical order of the
    category names), ``customerID`` is dropped and ``TotalCharges`` is made
    numeric. The encoded row is then passed to ``evaluate_voter``.

    Args:
        df: dataframe with ``customerID`` plus the dataset's feature columns,
            holding the human-readable category values.
        original_df: raw training dataset forwarded to ``evaluate_voter``.
        test_size: forwarded to ``evaluate_voter``.
        random_state: forwarded to ``evaluate_voter``.

    Returns:
        The prediction sentence returned by ``evaluate_voter``.

    Raises:
        ValueError: if a categorical column holds a value with no known code.
    """
    yes_no = {"No": 0, "Yes": 1}
    internet_addon = {'No': 0, 'No internet service': 1, 'Yes': 2}
    # Codes are listed by hand because a LabelEncoder fitted on a single row
    # would map every value to 0.
    category_codes = {
        'SeniorCitizen': yes_no,
        'gender': {'Female': 0, 'Male': 1},
        'Partner': yes_no,
        'Dependents': yes_no,
        'PhoneService': yes_no,
        'MultipleLines': {"No": 0, "No phone service": 1, "Yes": 2},
        'InternetService': {'DSL': 0, 'Fiber optic': 1, 'No': 2},
        'OnlineSecurity': internet_addon,
        'OnlineBackup': internet_addon,
        'DeviceProtection': internet_addon,
        'TechSupport': internet_addon,
        'StreamingTV': internet_addon,
        'StreamingMovies': internet_addon,
        'Contract': {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
        'PaperlessBilling': yes_no,
        'PaymentMethod': {'Bank transfer (automatic)': 0, 'Credit card (automatic)': 1,
                          'Electronic check': 2, 'Mailed check': 3},
    }

    df = df.drop(['customerID'], axis=1)
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    for column, codes in category_codes.items():
        encoded = df[column].map(codes)
        # Series.map yields NaN for unknown keys; fail here with a clear
        # message instead of handing NaN to the classifier.
        if encoded.isna().any():
            raise ValueError(
                f"Unrecognised value in column '{column}': {df[column].tolist()}")
        df[column] = encoded
    return evaluate_voter(df, original_df, test_size, random_state)
|
93 |
+
|
94 |
+
|
95 |
+
def standardize_dataframe(filepath, option, test_size, random_state):
    """Train the selected classifier on a churn CSV and report test metrics.

    The CSV is read, preprocessed and label-encoded; the data is split with
    ``train_test_split`` (stratified on ``Churn``); the numeric columns are
    standardized using statistics from the training split only; the chosen
    model is fitted and evaluated on the test split.

    Args:
        filepath: anything ``pd.read_csv`` accepts (the dashboard passes the
            uploaded file object).
        option: one of the names in ``MODEL_SELECTOR``.
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.

    Returns:
        A 4-tuple ``(accuracy, classification_report_text, encoded_df, raw_df)``
        where ``encoded_df`` is the preprocessed, label-encoded dataframe and
        ``raw_df`` is the dataframe exactly as read from the CSV.

    Raises:
        ValueError: if ``option`` is not a known model name.
    """
    # Factories rather than instances so only the requested model is built.
    model_factories = {
        'KNN': lambda: KNeighborsClassifier(n_neighbors=11),
        'SVC': lambda: SVC(random_state=1),
        # max_features="auto" is no longer accepted by current scikit-learn;
        # "sqrt" is what "auto" meant for classifiers.
        'RF': lambda: RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1,
                                             random_state=50, max_features="sqrt",
                                             max_leaf_nodes=30),
        'LR': LogisticRegression,
        'DT': DecisionTreeClassifier,
        'Adaboost': AdaBoostClassifier,
        'Gradient Boosting': GradientBoostingClassifier,
        'Voting Classifier': lambda: VotingClassifier(
            estimators=[('gbc', GradientBoostingClassifier()),
                        ('lr', LogisticRegression()),
                        ('abc', AdaBoostClassifier())],
            voting='soft'),
    }
    if option not in model_factories:
        raise ValueError(f"Unknown model option: {option!r}")

    df = pd.read_csv(filepath)
    df_new = preprocess(df)
    # Label-encode every text column.
    df_new = df_new.apply(object_to_int)

    X = df_new.drop(columns=['Churn'])
    y = df_new['Churn'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)

    # Fit the scaler on the training split only, then apply it to both splits.
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    model = model_factories[option]()
    model.fit(X_train, y_train)
    predicted_y = model.predict(X_test)
    return (accuracy_score(y_test, predicted_y),
            classification_report(y_test, predicted_y),
            df_new,
            df)
|
156 |
+
|
157 |
+
|
158 |
+
def _churn_histogram(df, color, title, **px_kwargs):
    """Return a 700x500 histogram of ``Churn`` coloured by the ``color`` column."""
    fig = px.histogram(df, x="Churn", color=color, title=title, **px_kwargs)
    fig.update_layout(width=700, height=500, bargap=0.1)
    return fig


def visualize(df):
    """Build the thirteen exploratory churn charts for a preprocessed dataframe.

    Args:
        df: preprocessed dataframe containing the ``gender``, ``Churn``,
            ``Contract``, ``PaymentMethod``, ``InternetService``,
            ``Dependents``, ``Partner``, ``SeniorCitizen``, ``OnlineSecurity``,
            ``PaperlessBilling``, ``TechSupport``, ``PhoneService`` and
            ``tenure`` columns.

    Returns:
        A 13-tuple of plotly figures ``(fig1, ..., fig13)`` in the same order
        as the titles in ``OPTION_LIST``.
    """
    # fig1: gender and churn donuts. Labels come from the same value_counts()
    # result as the values so each slice is always named correctly.
    gender_counts = df['gender'].value_counts()
    churn_counts = df['Churn'].value_counts()
    fig1 = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
    fig1.add_trace(go.Pie(labels=gender_counts.index.tolist(), values=gender_counts.tolist(),
                          name="Gender"), 1, 1)
    fig1.add_trace(go.Pie(labels=churn_counts.index.tolist(), values=churn_counts.tolist(),
                          name="Churn"), 1, 2)
    # `hole` turns the pies into donuts.
    fig1.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)
    fig1.update_layout(
        title_text="Gender and Churn Distributions",
        # Captions placed in the centre of each donut.
        annotations=[dict(text='Gender', x=0.16, y=0.5, font_size=20, showarrow=False),
                     dict(text='Churn', x=0.84, y=0.5, font_size=20, showarrow=False)])

    fig2 = _churn_histogram(df, "Contract", "<b>Customer contract distribution</b>",
                            barmode="group")

    # fig3: payment-method donut. unique() and value_counts() order their
    # results differently, so labels and values must share one source.
    payment_counts = df['PaymentMethod'].value_counts()
    fig3 = go.Figure(data=[go.Pie(labels=payment_counts.index.tolist(),
                                  values=payment_counts.tolist(), hole=.3)])
    fig3.update_layout(title_text="<b>Payment Method Distribution</b>")

    fig4 = _churn_histogram(df, "PaymentMethod",
                            "<b>Customer Payment Method distribution w.r.t. Churn</b>")

    # fig5: customers per (churn, gender) pair, one bar series per internet
    # service, counted from the supplied data.
    pairs = [('No', 'Female'), ('No', 'Male'), ('Yes', 'Female'), ('Yes', 'Male')]
    churn_axis = ['Churn:' + churn for churn, _ in pairs]
    gender_axis = [gender for _, gender in pairs]
    group_sizes = df.groupby(['InternetService', 'Churn', 'gender']).size().to_dict()
    fig5 = go.Figure()
    for service, trace_name in (('DSL', 'DSL'), ('Fiber optic', 'Fiber optic'), ('No', 'No Internet')):
        fig5.add_trace(go.Bar(
            x=[churn_axis, gender_axis],
            y=[int(group_sizes.get((service, churn, gender), 0)) for churn, gender in pairs],
            name=trace_name,
        ))
    fig5.update_layout(title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>")

    fig6 = _churn_histogram(df, "Dependents", "<b>Dependents distribution</b>",
                            barmode="group",
                            color_discrete_map={"Yes": "#FF97FF", "No": "#AB63FA"})
    fig7 = _churn_histogram(df, "Partner", "<b>Churn distribution w.r.t. Partners</b>",
                            barmode="group",
                            color_discrete_map={"Yes": '#FFA15A', "No": '#00CC96'})
    fig8 = _churn_histogram(df, "SeniorCitizen", "<b>Churn distribution w.r.t. Senior Citizen</b>",
                            color_discrete_map={"Yes": '#00CC96', "No": '#B6E880'})
    fig9 = _churn_histogram(df, "OnlineSecurity", "<b>Churn distribution w.r.t Online Security</b>",
                            barmode="group",
                            color_discrete_map={"Yes": "#FF97FF", "No": "#AB63FA"})
    fig10 = _churn_histogram(df, "PaperlessBilling", "<b>Churn distribution w.r.t. Paperless Billing</b>",
                             color_discrete_map={"Yes": '#FFA15A', "No": '#00CC96'})
    fig11 = _churn_histogram(df, "TechSupport", "<b>Churn distribution w.r.t. Tech Support</b>",
                             barmode="group")
    fig12 = _churn_histogram(df, "PhoneService", "<b>Churn Distribution w.r.t. Phone Service</b>",
                             color_discrete_map={"Yes": '#00CC96', "No": '#B6E880'})

    # fig13: tenure distribution per churn class.
    fig13 = px.box(df, x='Churn', y='tenure')
    fig13.update_yaxes(title_text='Tenure (Months)', row=1, col=1)
    fig13.update_xaxes(title_text='Churn', row=1, col=1)
    fig13.update_layout(autosize=True, width=750, height=600,
                        title_font=dict(size=25, family='Courier'),
                        title='<b>Tenure vs Churn</b>',
                        )
    return fig1,fig2,fig3,fig4,fig5,fig6,fig7,fig8,fig9,fig10,fig11,fig12,fig13
|
230 |
+
|
231 |
+
def take_input(filepath):
    """Load a churn CSV and return its charts together with the cleaned data.

    Args:
        filepath: anything ``pd.read_csv`` accepts.

    Returns:
        A 14-tuple: the thirteen figures produced by ``visualize`` followed by
        the dataframe produced by ``preprocess``.
    """
    cleaned = preprocess(pd.read_csv(filepath))
    figures = visualize(cleaned)
    return (*figures, cleaned)
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
scikit-learn
|
2 |
+
pandas
|
3 |
+
numpy
|
4 |
+
plotly
|
5 |
+
streamlit
|
6 |
+
matplotlib
|
7 |
+
seaborn
|
8 |
+
xgboost
|