Spaces:

ahishamm
/

Updated_BI_Project

Sleeping

App Files Files Community

ahishamm committed on May 13, 2023

Commit

5fbe234

1 Parent(s): 84115b3

Uploaded files

Browse files

Files changed (4) hide show

WA_Fn-UseC_-Telco-Customer-Churn.csv +0 -0
dashboard.py +145 -0
func.py +235 -0
requirements.txt +8 -0

WA_Fn-UseC_-Telco-Customer-Churn.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

dashboard.py ADDED Viewed

	@@ -0,0 +1,145 @@

+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import warnings
+import streamlit as st
+warnings.filterwarnings('ignore')
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import LabelEncoder
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.neural_network import MLPClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.ensemble import ExtraTreesClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+from xgboost import XGBClassifier
+from sklearn import metrics
+from sklearn.metrics import roc_curve
+from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report
+import func as fc
+from io import StringIO
+st.set_page_config(layout='wide')
+tab1, tab2 = st.tabs(['Data','ML'])
+#loading the options list from the functions file func.py
+optionList = fc.OPTION_LIST
+modelList = fc.MODEL_SELECTOR
+#option to upload the dataframe
+with tab1:
+    option = st.selectbox('Select the plot you want to visualize',optionList)
+    uploaded_dataframe = st.file_uploader("Choose a file")
+    #print(type(uploaded_dataframe))
+if uploaded_dataframe is not None:
+    if option is not None :
+        fig1,fig2,fig3,fig4,fig5,fig6,fig7,fig8,fig9,fig10,fig11,fig12,fig13, processed_df = fc.take_input(uploaded_dataframe)
+        with tab1:
+            st.dataframe(processed_df)
+            with st.container():
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    st.plotly_chart(fig1, use_container_width=True)
+                with col2:
+                    st.plotly_chart(fig2,use_container_width=True)
+                with col3:
+                    st.plotly_chart(fig3,use_container_width=True)
+            with st.container():
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    st.plotly_chart(fig4, use_container_width=True)
+                with col2:
+                    st.plotly_chart(fig5,use_container_width=True)
+                with col3:
+                    st.plotly_chart(fig6,use_container_width=True)
+            with st.container():
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    st.plotly_chart(fig7, use_container_width=True)
+                with col2:
+                    st.plotly_chart(fig8,use_container_width=True)
+                with col3:
+                    st.plotly_chart(fig9,use_container_width=True)
+            with st.container():
+                col1, col2, col3, col4 = st.columns(4)
+                with col1:
+                    st.plotly_chart(fig10, use_container_width=True)
+                with col2:
+                    st.plotly_chart(fig11,use_container_width=True)
+                with col3:
+                    st.plotly_chart(fig12,use_container_width=True)
+                with col4:
+                    st.plotly_chart(fig13,use_container_width=True)
+        #removing the secondary tab
+        #with tab2:
+        #    st.plotly_chart(figure,use_container_width=True)
+with tab2:
+    modeloption = st.selectbox('Select an ML Model',modelList)
+    uploaded_dataframe = st.file_uploader("Choose a file", key=2)
+    test_size_slider = st.slider('Enter the test size: ',0.0,1.0)
+    random_state_input = st.number_input('Select a random seed',0,1000)
+    #print(test_size_slider)
+    if uploaded_dataframe is not None:
+        #Add a slider later the test_size, and a input box for the random state
+        #print(uploaded_dataframe)
+        acc_score, classification_rep, output_df,original_df = fc.standardize_dataframe(uploaded_dataframe,modeloption,test_size_slider,random_state_input)
+        st.dataframe(output_df)
+        #st.write('Accuracy Score of '+modeloption+' is: '+str(acc_score))
+        st.metric(label='Accuracy Score of '+modeloption,value=str(acc_score))
+        st.markdown('```bash \t \n'+classification_rep+'```')
+        #print(acc_score,'\n',classification_rep)
+        st.write('Enter some information to predict the churn:')
+        pr_1 = st.selectbox('Select the gender:',['Female','Male'])
+        pr_2 = st.selectbox('Is the customer a senior citizen?',['Yes','No'])
+        pr_3 = st.selectbox('Does the customer have a partner?',['Yes','No'])
+        pr_4 = st.selectbox('Does the customer have dependents?',['Yes','No'])
+        pr_5 = st.number_input('What is the customer tenure?',0,100)
+        pr_6 = st.selectbox('Does the customer have phone service?',['Yes','No'])
+        pr_7 = st.selectbox('Does the customer have multiple lines?',['Yes','No','No phone service'])
+        pr_8 = st.selectbox('Does the customer have internet service?',['No','DSL','Fiber optic'])
+        pr_9 = st.selectbox('Does the customer have online security?',['Yes','No','No internet service'])
+        pr_10 = st.selectbox('Does the customer have online backup?',['Yes','No','No internet service'])
+        pr_11 = st.selectbox('Does the customer have device protection?',['Yes','No','No internet service'])
+        pr_12 = st.selectbox('Does the customer have tech support?',['Yes','No','No internet service'])
+        pr_13 = st.selectbox('Does the customer have streaming TV?',['Yes','No','No internet service'])
+        pr_14 = st.selectbox('Does the customer have streaming movies?',['Yes','No','No internet service'])
+        pr_15 = st.selectbox('Does the customer have a contract?',['Month-to-month','One year','Two year'])
+        pr_16 = st.selectbox('Does the customer have paperless billing?',['Yes','No'])
+        pr_17 = st.selectbox('What is the payment method of the customer?',['Electronic check','Mailed check','Bank transfer (automatic)','Credit card (automatic)'])
+        pr_18 = st.number_input('What are the monthly charges of the customer?')
+        pr_19 = st.number_input('What are the total charges of the customer?')
+        if st.button('Predict Churn'):
+            #convert the inputs to a vector and pass it to a voting classifier algorithm
+            feature_vector = pd.DataFrame({'customerID':[1],
+                                           'gender':[pr_1],
+                                          'SeniorCitizen':[pr_2],
+                                          'Partner':[pr_3],
+                                          'Dependents':[pr_4],
+                                          'tenure':[pr_5],
+                                          'PhoneService':[pr_6],
+                                          'MultipleLines':[pr_7],
+                                          'InternetService':[pr_8],
+                                          'OnlineSecurity':[pr_9],
+                                          'OnlineBackup':[pr_10],
+                                          'DeviceProtection':[pr_11],
+                                          'TechSupport':[pr_12],
+                                          'StreamingTV':[pr_13],
+                                          'StreamingMovies':[pr_14],
+                                          'Contract':[pr_15],
+                                          'PaperlessBilling':[pr_16],
+                                          'PaymentMethod':[pr_17],
+                                          'MonthlyCharges':[pr_18],
+                                          'TotalCharges':[pr_19]})
+            #passing the feature vector to be processed and predict a churn output
+            #print(feature_vector)
+            response = fc.standardize_feature_vector(feature_vector,original_df,test_size_slider,random_state_input)
+            st.metric(label='Prediction Response',value=response)

func.py ADDED Viewed

	@@ -0,0 +1,235 @@

+import pandas as pd
+import numpy as np
+from plotly.subplots import make_subplots
+import plotly.graph_objects as go
+import matplotlib.pyplot as plt
+import plotly.express as px
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report,accuracy_score
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import VotingClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+OPTION_LIST = ['Gender and Churn Distribution','Customer Contract Distribution','Payment Method Distribution','Payment Method Distribution Churn',
+              'Churn Distribution w.r.t Internet Service and Gender','Dependents Distribution Churn',
+              'Churn Distribution w.r.t Partners','Churn Distribution w.r.t Senior Citizens',
+              'Churn Distribution w.r.t Online Security','Churn Distribution w.r.t Paperless Billing',
+              'Churn Distribution w.r.t Tech Support','Churn Distribution w.r.t Phone Service',
+              'Tenure vs. Churn']
+MODEL_SELECTOR = ['KNN','SVC','RF','LR','DT','Adaboost','Gradient Boosting','Voting Classifier']
+num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
+scaler= StandardScaler()
+def preprocess(df):
+    df = df.drop(['customerID'], axis = 1)
+    df['TotalCharges'] = pd.to_numeric(df.TotalCharges, errors='coerce')
+    df[np.isnan(df['TotalCharges'])]
+    df[df['tenure'] == 0].index
+    df.drop(labels=df[df['tenure'] == 0].index, axis=0, inplace=True)
+    df[df['tenure'] == 0].index
+    df.fillna(df["TotalCharges"].mean())
+    df["SeniorCitizen"]= df["SeniorCitizen"].map({0: "No", 1: "Yes"})
+    return df
+def object_to_int(dataframe_series):
+    if dataframe_series.dtype=='object':
+        dataframe_series = LabelEncoder().fit_transform(dataframe_series)
+    return dataframe_series
+def evaluate_voter(test_feature_vector, df,test_size,random_state):
+    print(df)
+    df = preprocess(df)
+    df = df.apply(lambda x: object_to_int(x))
+    X = df.drop(columns = ['Churn'])
+    y = df['Churn'].values
+    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = test_size, random_state = random_state, stratify=y)
+    df_std = pd.DataFrame(StandardScaler().fit_transform(df[num_cols].astype('float64')),columns=num_cols)
+    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
+    X_test[num_cols] = scaler.transform(X_test[num_cols])
+    clf1 = GradientBoostingClassifier()
+    clf2 = LogisticRegression()
+    clf3 = AdaBoostClassifier()
+    eclf1 = VotingClassifier(estimators=[('gbc', clf1), ('lr', clf2), ('abc', clf3)], voting='soft')
+    eclf1.fit(X_train, y_train)
+    #feeding the feature vector as a test input
+    predicted_y = eclf1.predict(test_feature_vector)
+    if predicted_y[0] == 1:
+        #print('The customer is likely to stop using the services')
+        return 'Customer is likely to stop using the telecom services'
+    else:
+        #print('The customer is likely to continue using the services')
+        return 'Customer is likely to continue using the telecom services'
+def standardize_feature_vector(df,original_df, test_size,random_state):
+    df = df.drop(['customerID'], axis = 1)
+    df['TotalCharges'] = pd.to_numeric(df.TotalCharges, errors='coerce')
+    #Manual label encoding is the only solution here...
+    df["SeniorCitizen"]= df["SeniorCitizen"].map({"No": 0, "Yes": 1})
+    df['gender'] = df['gender'].map({'Female':0,'Male':1})
+    df['Partner'] = df['Partner'].map({"No":0,"Yes":1})
+    df['Dependents'] = df['Dependents'].map({"No":0,"Yes":1})
+    df['PhoneService'] = df['PhoneService'].map({"No":0,"Yes":1})
+    df['MultipleLines'] = df['MultipleLines'].map({"No phone service":1,"No":0,"Yes":2})
+    df['InternetService'] = df['InternetService'].map({'DSL':0,'Fiber optic':1,'No':2})
+    df['OnlineSecurity'] = df['OnlineSecurity'].map({'No':0,'Yes':2,'No internet service':1})
+    df['OnlineBackup'] = df['OnlineBackup'].map({'No':0,'Yes':2,'No internet service':1})
+    df['DeviceProtection'] = df['DeviceProtection'].map({'No':0,'Yes':2,'No internet service':1})
+    df['TechSupport'] = df['TechSupport'].map({'No':0,'Yes':2,'No internet service':1})
+    df['StreamingTV'] = df['StreamingTV'].map({'No':0,'Yes':2,'No internet service':1})
+    df['StreamingMovies'] = df['StreamingMovies'].map({'No':0,'Yes':2,'No internet service':1})
+    df['Contract'] = df['Contract'].map({'Month-to-month':0,'One year':1,'Two year':2})
+    df['PaperlessBilling'] = df['PaperlessBilling'].map({"No":0,"Yes":1})
+    df['PaymentMethod'] = df['PaymentMethod'].map({'Electronic check':2, 'Mailed check':3,'Bank transfer (automatic)':0,'Credit card (automatic)':1})
+    #Churn -> No:0, Yes:1
+    numpy_vector = df.to_numpy()
+    print(df)
+    print(numpy_vector)
+    #passing the vector as a test vector to a trained voting classifier
+    return evaluate_voter(df,original_df,test_size,random_state)
+def standardize_dataframe(filepath,option,test_size,random_state):
+    df = pd.read_csv(filepath)
+    #print(df)
+    df_new = preprocess(df)
+    #print(df)
+    #label encoding the dataframe
+    df_new = df_new.apply(lambda x: object_to_int(x))
+    #inputs and target selection
+    X = df_new.drop(columns = ['Churn'])
+    y = df_new['Churn'].values
+    #train test split (Allowing the user to choose the optimal train/test split percentage)
+    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = test_size, random_state = random_state, stratify=y)
+    #Standardizing the variables
+    df_std = pd.DataFrame(StandardScaler().fit_transform(df_new[num_cols].astype('float64')),columns=num_cols)
+    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
+    X_test[num_cols] = scaler.transform(X_test[num_cols])
+    if option == 'KNN':
+        knn_model = KNeighborsClassifier(n_neighbors = 11)
+        knn_model.fit(X_train,y_train)
+        predicted_y = knn_model.predict(X_test)
+        return accuracy_score(predicted_y,y_test), classification_report(y_test, predicted_y),df_new,df
+    elif option == 'SVC':
+        svc_model = SVC(random_state = 1)
+        svc_model.fit(X_train,y_train)
+        predicted_y = svc_model.predict(X_test)
+        return accuracy_score(predicted_y,y_test), classification_report(y_test,predicted_y),df_new,df
+    elif option == 'RF':
+        model_rf = RandomForestClassifier(n_estimators=500 , oob_score = True, n_jobs = -1,
+                                  random_state =50, max_features = "auto",
+                                  max_leaf_nodes = 30)
+        model_rf.fit(X_train, y_train)
+        predicted_y = model_rf.predict(X_test)
+        return accuracy_score(y_test, predicted_y), classification_report(y_test,predicted_y),df_new,df
+    elif option == 'LR':
+        lr_model = LogisticRegression()
+        lr_model.fit(X_train,y_train)
+        predicted_y = lr_model.predict(X_test)
+        return accuracy_score(predicted_y,y_test), classification_report(y_test,predicted_y),df_new,df
+    elif option == 'DT':
+        dt_model = DecisionTreeClassifier()
+        dt_model.fit(X_train,y_train)
+        predicted_y = dt_model.predict(X_test)
+        return accuracy_score(predicted_y,y_test), classification_report(y_test,predicted_y),df_new,df
+    elif option == 'Adaboost':
+        a_model = AdaBoostClassifier()
+        a_model.fit(X_train,y_train)
+        predicted_y = a_model.predict(X_test)
+        return accuracy_score(predicted_y,y_test), classification_report(y_test,predicted_y),df_new,df
+    elif option == 'Gradient Boosting':
+        gb = GradientBoostingClassifier()
+        gb.fit(X_train, y_train)
+        predicted_y = gb.predict(X_test)
+        return accuracy_score(predicted_y,y_test), classification_report(y_test,predicted_y),df_new,df
+    elif option == 'Voting Classifier':
+        clf1 = GradientBoostingClassifier()
+        clf2 = LogisticRegression()
+        clf3 = AdaBoostClassifier()
+        eclf1 = VotingClassifier(estimators=[('gbc', clf1), ('lr', clf2), ('abc', clf3)], voting='soft')
+        eclf1.fit(X_train, y_train)
+        predicted_y = eclf1.predict(X_test)
+        return accuracy_score(predicted_y,y_test), classification_report(y_test,predicted_y),df_new,df
+def visualize(df):
+    g_labels = ['Male', 'Female']
+    c_labels = ['No', 'Yes']
+    # Create subplots: use 'domain' type for Pie subplot
+    fig1 = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
+    fig1.add_trace(go.Pie(labels=g_labels, values=df['gender'].value_counts(), name="Gender"),
+                  1, 1)
+    fig1.add_trace(go.Pie(labels=c_labels, values=df['Churn'].value_counts(), name="Churn"),
+                  1, 2)
+    # Use `hole` to create a donut-like pie chart
+    fig1.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)
+    fig1.update_layout(
+        title_text="Gender and Churn Distributions",
+        # Add annotations in the center of the donut pies.
+        annotations=[dict(text='Gender', x=0.16, y=0.5, font_size=20, showarrow=False),
+                     dict(text='Churn', x=0.84, y=0.5, font_size=20, showarrow=False)])
+    fig2 = px.histogram(df, x="Churn", color="Contract", barmode="group", title="<b>Customer contract distribution<b>")
+    fig2.update_layout(width=700, height=500, bargap=0.1)
+    labels = df['PaymentMethod'].unique()
+    values = df['PaymentMethod'].value_counts()
+    fig3 = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
+    fig3.update_layout(title_text="<b>Payment Method Distribution</b>")
+    fig4 = px.histogram(df, x="Churn", color="PaymentMethod", title="<b>Customer Payment Method distribution w.r.t. Churn</b>")
+    fig4.update_layout(width=700, height=500, bargap=0.1)
+    fig5 = go.Figure()
+    fig5.add_trace(go.Bar(
+      x = [['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
+           ["Female", "Male", "Female", "Male"]],
+      y = [965, 992, 219, 240],
+      name = 'DSL',
+    ))
+    fig5.add_trace(go.Bar(
+      x = [['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
+           ["Female", "Male", "Female", "Male"]],
+      y = [889, 910, 664, 633],
+      name = 'Fiber optic',
+    ))
+    fig5.add_trace(go.Bar(
+      x = [['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
+           ["Female", "Male", "Female", "Male"]],
+      y = [690, 717, 56, 57],
+      name = 'No Internet',
+    ))
+    fig5.update_layout(title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>")
+    color_map = {"Yes": "#FF97FF", "No": "#AB63FA"}
+    fig6 = px.histogram(df, x="Churn", color="Dependents", barmode="group", title="<b>Dependents distribution</b>", color_discrete_map=color_map)
+    fig6.update_layout(width=700, height=500, bargap=0.1)
+    color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
+    fig7 = px.histogram(df, x="Churn", color="Partner", barmode="group", title="<b>Churn distribution w.r.t. Partners</b>", color_discrete_map=color_map)
+    fig7.update_layout(width=700, height=500, bargap=0.1)
+    color_map = {"Yes": '#00CC96', "No": '#B6E880'}
+    fig8 = px.histogram(df, x="Churn", color="SeniorCitizen", title="<b>Churn distribution w.r.t. Senior Citizen</b>", color_discrete_map=color_map)
+    fig8.update_layout(width=700, height=500, bargap=0.1)
+    color_map = {"Yes": "#FF97FF", "No": "#AB63FA"}
+    fig9 = px.histogram(df, x="Churn", color="OnlineSecurity", barmode="group", title="<b>Churn distribution w.r.t Online Security</b>", color_discrete_map=color_map)
+    fig9.update_layout(width=700, height=500, bargap=0.1)
+    color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
+    fig10 = px.histogram(df, x="Churn", color="PaperlessBilling",  title="<b>Churn distribution w.r.t. Paperless Billing</b>", color_discrete_map=color_map)
+    fig10.update_layout(width=700, height=500, bargap=0.1)
+    fig11 = px.histogram(df, x="Churn", color="TechSupport",barmode="group",  title="<b>Churn distribution w.r.t. Tech Support</b>")
+    fig11.update_layout(width=700, height=500, bargap=0.1)
+    color_map = {"Yes": '#00CC96', "No": '#B6E880'}
+    fig12 = px.histogram(df, x="Churn", color="PhoneService", title="<b>Churn Distribution w.r.t. Phone Service</b>", color_discrete_map=color_map)
+    fig12.update_layout(width=700, height=500, bargap=0.1)
+    fig13 = px.box(df, x='Churn', y = 'tenure')
+    fig13.update_yaxes(title_text='Tenure (Months)', row=1, col=1)
+    fig13.update_xaxes(title_text='Churn', row=1, col=1)
+    fig13.update_layout(autosize=True, width=750, height=600,
+        title_font=dict(size=25, family='Courier'),
+        title='<b>Tenure vs Churn</b>',
+    )
+    return fig1,fig2,fig3,fig4,fig5,fig6,fig7,fig8,fig9,fig10,fig11,fig12,fig13
+def take_input(filepath):
+    df = pd.read_csv(filepath)
+    processed_df = preprocess(df)
+    fig1,fig2,fig3,fig4,fig5,fig6,fig7,fig8,fig9,fig10,fig11,fig12,fig13 = visualize(processed_df)
+    return fig1,fig2,fig3,fig4,fig5,fig6,fig7,fig8,fig9,fig10,fig11,fig12,fig13, processed_df

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+scikit-learn
+pandas
+numpy
+plotly
+streamlit
+matplotlib
+seaborn
+xgboost