Spaces:
Build error
Build error
| import pandas as pd | |
| import numpy as np | |
| from plotly.subplots import make_subplots | |
| import plotly.graph_objects as go | |
| import matplotlib.pyplot as plt | |
| import plotly.express as px | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import classification_report,accuracy_score | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn.ensemble import VotingClassifier | |
| from sklearn.ensemble import AdaBoostClassifier | |
| from sklearn.ensemble import GradientBoostingClassifier | |
| from sklearn.svm import SVC | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.linear_model import LogisticRegression | |
# Titles of the available charts. The list has thirteen entries, which
# appears to line up one-to-one with the thirteen figures returned by
# `visualize` (fig1..fig13) -- confirm against the UI code that consumes it.
OPTION_LIST = [
    'Gender and Churn Distribution',
    'Customer Contract Distribution',
    'Payment Method Distribution',
    'Payment Method Distribution Churn',
    'Churn Distribution w.r.t Internet Service and Gender',
    'Dependents Distribution Churn',
    'Churn Distribution w.r.t Partners',
    'Churn Distribution w.r.t Senior Citizens',
    'Churn Distribution w.r.t Online Security',
    'Churn Distribution w.r.t Paperless Billing',
    'Churn Distribution w.r.t Tech Support',
    'Churn Distribution w.r.t Phone Service',
    'Tenure vs. Churn',
]

# Model keys understood by `standardize_dataframe` (its `option` argument).
MODEL_SELECTOR = [
    'KNN',
    'SVC',
    'RF',
    'LR',
    'DT',
    'Adaboost',
    'Gradient Boosting',
    'Voting Classifier',
]

# Continuous columns that get standardised before model fitting.
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']

# Module-level scaler shared by the training helpers; each of them re-fits it
# on its own training split, so its state reflects the most recent call.
scaler = StandardScaler()
def preprocess(df):
    """Clean a raw churn dataframe for plotting and modelling.

    Steps, in order:
      1. drop the ``customerID`` column;
      2. coerce ``TotalCharges`` to numeric (unparseable entries become NaN);
      3. discard every row whose ``tenure`` is 0;
      4. replace any remaining missing ``TotalCharges`` with the column mean;
      5. recode ``SeniorCitizen`` from 0/1 to "No"/"Yes".

    Args:
        df: DataFrame containing at least the ``customerID``, ``tenure``,
            ``TotalCharges`` and ``SeniorCitizen`` columns.

    Returns:
        A new DataFrame (original row index preserved); ``df`` itself is
        left untouched.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    cleaned = df.drop(columns=['customerID'])
    cleaned['TotalCharges'] = pd.to_numeric(cleaned['TotalCharges'], errors='coerce')

    # Keep only customers with a non-zero tenure; copy so the assignments
    # below operate on an independent frame rather than a view.
    cleaned = cleaned.loc[cleaned['tenure'] != 0].copy()

    # The previous code called ``df.fillna(...)`` and threw the result away,
    # so NaNs were never actually filled. Assign the result back, and limit
    # the fill to the column the mean was computed from.
    mean_total = cleaned['TotalCharges'].mean()
    cleaned['TotalCharges'] = cleaned['TotalCharges'].fillna(mean_total)

    cleaned['SeniorCitizen'] = cleaned['SeniorCitizen'].map({0: "No", 1: "Yes"})
    return cleaned
def object_to_int(dataframe_series):
    """Label-encode a column if, and only if, its dtype is ``object``.

    Intended for use with ``DataFrame.apply``: non-object columns are handed
    back unchanged, object columns are replaced by the integer codes produced
    by a freshly fitted ``LabelEncoder``.

    Args:
        dataframe_series: a pandas Series (one dataframe column).

    Returns:
        The same Series object when no encoding is needed, otherwise the
        array returned by ``LabelEncoder().fit_transform``.
    """
    # Guard clause: anything that is not an object column passes through.
    if dataframe_series.dtype != 'object':
        return dataframe_series
    return LabelEncoder().fit_transform(dataframe_series)
def evaluate_voter(test_feature_vector, df, test_size, random_state):
    """Train the soft-voting ensemble on ``df`` and classify one customer.

    ``df`` is cleaned with ``preprocess``, its object columns are
    label-encoded, and a stratified train/test split is taken. The numeric
    columns in ``num_cols`` are standardised with the module-level ``scaler``
    (fitted on the training split), and a soft-voting ensemble of gradient
    boosting, logistic regression and AdaBoost is fitted on the result.

    Args:
        test_feature_vector: the already numerically encoded customer(s) to
            classify. A DataFrame is matched to the training columns by
            name; any other array-like is assumed to be in training column
            order.
        df: raw training data, in the form accepted by ``preprocess`` and
            containing a ``Churn`` column.
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.

    Returns:
        A sentence stating whether the first row of ``test_feature_vector``
        is predicted to churn.

    Raises:
        KeyError: if a DataFrame feature vector lacks a training column.
    """
    encoded = preprocess(df).apply(object_to_int)
    X = encoded.drop(columns=['Churn'])
    y = encoded['Churn'].values

    # Only the training half is needed here; the split is kept so the model
    # is fitted on the same rows as before.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    X_train = X_train.copy()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

    ensemble = VotingClassifier(
        estimators=[
            ('gbc', GradientBoostingClassifier()),
            ('lr', LogisticRegression()),
            ('abc', AdaBoostClassifier()),
        ],
        voting='soft',
    )
    ensemble.fit(X_train, y_train)

    # Bring the query into the same feature space as the training data.
    # Previously it was predicted with raw (unscaled) numeric columns even
    # though the model had only ever seen standardised ones.
    if isinstance(test_feature_vector, pd.DataFrame):
        features = test_feature_vector[list(X_train.columns)].copy()
    else:
        features = pd.DataFrame(test_feature_vector, columns=X_train.columns)
    features[num_cols] = scaler.transform(features[num_cols])

    predicted_y = ensemble.predict(features)
    if predicted_y[0] == 1:
        return 'Customer is likely to stop using the telecom services'
    return 'Customer is likely to continue using the telecom services'
def standardize_feature_vector(df, original_df, test_size, random_state):
    """Numerically encode a raw customer frame and classify it.

    The categorical columns are encoded by hand (a ``LabelEncoder`` cannot be
    fitted on a single row). The codes below follow sorted label order so
    that they agree with what ``LabelEncoder`` assigns to the training data.

    Args:
        df: DataFrame holding the customer(s) to classify, with raw string
            categories and a ``customerID`` column.
        original_df: raw training data, passed through to ``evaluate_voter``.
        test_size: passed through to ``evaluate_voter``.
        random_state: passed through to ``evaluate_voter``.

    Returns:
        The verdict string produced by ``evaluate_voter``.

    Raises:
        KeyError: if an expected column is missing from ``df``.
        ValueError: if a categorical column holds a value that has no code.
    """
    yes_no = {"No": 0, "Yes": 1}
    # Internet add-ons share one three-way encoding.
    addon = {'No': 0, 'Yes': 2, 'No internet service': 1}
    category_codes = {
        'SeniorCitizen': yes_no,
        'gender': {'Female': 0, 'Male': 1},
        'Partner': yes_no,
        'Dependents': yes_no,
        'PhoneService': yes_no,
        'MultipleLines': {"No phone service": 1, "No": 0, "Yes": 2},
        'InternetService': {'DSL': 0, 'Fiber optic': 1, 'No': 2},
        'OnlineSecurity': addon,
        'OnlineBackup': addon,
        'DeviceProtection': addon,
        'TechSupport': addon,
        'StreamingTV': addon,
        'StreamingMovies': addon,
        'Contract': {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
        'PaperlessBilling': yes_no,
        'PaymentMethod': {
            'Electronic check': 2,
            'Mailed check': 3,
            'Bank transfer (automatic)': 0,
            'Credit card (automatic)': 1,
        },
    }

    encoded = df.drop(columns=['customerID'])
    encoded['TotalCharges'] = pd.to_numeric(encoded['TotalCharges'], errors='coerce')

    for column, codes in category_codes.items():
        values = encoded[column]
        # ``Series.map`` would silently turn unknown labels into NaN and the
        # failure would only surface later inside the classifier; report the
        # offending column and values here instead.
        unknown = values[~values.isin(list(codes))]
        if not unknown.empty:
            raise ValueError(
                'Unexpected value(s) {!r} in column {!r}; expected one of {!r}'.format(
                    unknown.unique().tolist(), column, list(codes)))
        encoded[column] = values.map(codes)

    # Hand the encoded frame to the trained voting classifier as its query.
    return evaluate_voter(encoded, original_df, test_size, random_state)
def standardize_dataframe(filepath, option, test_size, random_state):
    """Train the selected model on a churn CSV and report test performance.

    The CSV is cleaned with ``preprocess``, object columns are label-encoded,
    a stratified train/test split is taken, and the numeric columns in
    ``num_cols`` are standardised with the module-level ``scaler`` (fitted on
    the training split only). The model named by ``option`` is then fitted
    and scored on the held-out split.

    Args:
        filepath: path (or file-like object) readable by ``pd.read_csv``.
        option: one of the keys in ``MODEL_SELECTOR`` ('KNN', 'SVC', 'RF',
            'LR', 'DT', 'Adaboost', 'Gradient Boosting', 'Voting Classifier').
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.

    Returns:
        A 4-tuple ``(accuracy, classification_report_text, encoded_df,
        raw_df)`` where ``encoded_df`` is the cleaned, label-encoded frame
        and ``raw_df`` is the CSV exactly as read.

    Raises:
        ValueError: if ``option`` is not a supported model key. (Previously
            the function fell off the end and returned ``None``.)
    """
    # Factories rather than instances, so only the requested model is built.
    model_factories = {
        'KNN': lambda: KNeighborsClassifier(n_neighbors=11),
        'SVC': lambda: SVC(random_state=1),
        # "auto" meant sqrt(n_features) for classifiers and is no longer
        # accepted by current scikit-learn; "sqrt" is the equivalent spelling.
        'RF': lambda: RandomForestClassifier(
            n_estimators=500, oob_score=True, n_jobs=-1,
            random_state=50, max_features="sqrt",
            max_leaf_nodes=30),
        'LR': lambda: LogisticRegression(),
        'DT': lambda: DecisionTreeClassifier(),
        'Adaboost': lambda: AdaBoostClassifier(),
        'Gradient Boosting': lambda: GradientBoostingClassifier(),
        'Voting Classifier': lambda: VotingClassifier(
            estimators=[
                ('gbc', GradientBoostingClassifier()),
                ('lr', LogisticRegression()),
                ('abc', AdaBoostClassifier()),
            ],
            voting='soft'),
    }
    if option not in model_factories:
        raise ValueError(
            'Unknown model option {!r}; expected one of {!r}'.format(
                option, list(model_factories)))

    df = pd.read_csv(filepath)
    df_new = preprocess(df).apply(object_to_int)

    # Inputs and target.
    X = df_new.drop(columns=['Churn'])
    y = df_new['Churn'].values

    # The caller chooses the train/test proportion.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)

    # Standardise the numeric columns; copies avoid writing into views of X.
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    model = model_factories[option]()
    model.fit(X_train, y_train)
    predicted_y = model.predict(X_test)
    return (
        accuracy_score(y_test, predicted_y),
        classification_report(y_test, predicted_y),
        df_new,
        df,
    )
def _churn_histogram(df, color, title, barmode=None, color_map=None):
    """Return a 700x500 histogram of ``Churn`` coloured by column ``color``."""
    options = {'x': "Churn", 'color': color, 'title': title}
    # Only forward the optional settings that were asked for, so plotly's own
    # defaults apply otherwise.
    if barmode is not None:
        options['barmode'] = barmode
    if color_map is not None:
        options['color_discrete_map'] = color_map
    fig = px.histogram(df, **options)
    fig.update_layout(width=700, height=500, bargap=0.1)
    return fig


def visualize(df):
    """Build the thirteen exploratory churn figures.

    Args:
        df: DataFrame with string-valued ``Churn``, ``gender``, ``Contract``,
            ``PaymentMethod``, ``InternetService``, ``Dependents``,
            ``Partner``, ``SeniorCitizen``, ``OnlineSecurity``,
            ``PaperlessBilling``, ``TechSupport`` and ``PhoneService``
            columns plus a numeric ``tenure`` column (the output of
            ``preprocess`` is what ``take_input`` passes in).

    Returns:
        A tuple ``(fig1, ..., fig13)`` of plotly figures: gender/churn
        donuts, contract histogram, payment-method pie, payment-method
        histogram, internet-service-by-gender bars, then histograms for
        dependents, partner, senior citizen, online security, paperless
        billing, tech support and phone service, and finally a tenure box
        plot.
    """
    # fig1: gender and churn donuts. Labels are taken from the same
    # value_counts() result as the values, so each slice is guaranteed to
    # carry its own label (hard-coded labels only matched by coincidence of
    # the frequency order).
    gender_counts = df['gender'].value_counts()
    churn_counts = df['Churn'].value_counts()
    fig1 = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
    fig1.add_trace(
        go.Pie(labels=gender_counts.index.tolist(), values=gender_counts.tolist(), name="Gender"),
        1, 1)
    fig1.add_trace(
        go.Pie(labels=churn_counts.index.tolist(), values=churn_counts.tolist(), name="Churn"),
        1, 2)
    # `hole` turns the pies into donuts.
    fig1.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)
    fig1.update_layout(
        title_text="Gender and Churn Distributions",
        # Captions in the centre of each donut.
        annotations=[dict(text='Gender', x=0.16, y=0.5, font_size=20, showarrow=False),
                     dict(text='Churn', x=0.84, y=0.5, font_size=20, showarrow=False)])

    # fig2: the title's closing tag used to be a second "<b>".
    fig2 = _churn_histogram(df, "Contract", "<b>Customer contract distribution</b>",
                            barmode="group")

    # fig3: labels previously came from unique() (order of appearance) while
    # values came from value_counts() (order of frequency).
    payment_counts = df['PaymentMethod'].value_counts()
    fig3 = go.Figure(data=[go.Pie(labels=payment_counts.index.tolist(),
                                  values=payment_counts.tolist(), hole=.3)])
    fig3.update_layout(title_text="<b>Payment Method Distribution</b>")

    fig4 = _churn_histogram(df, "PaymentMethod",
                            "<b>Customer Payment Method distribution w.r.t. Churn</b>")

    # fig5: customers per (churn, gender) group for each internet service,
    # counted from `df` instead of being fixed numbers.
    groups = [('No', 'Female'), ('No', 'Male'), ('Yes', 'Female'), ('Yes', 'Male')]
    group_axis = [['Churn:' + churn for churn, _ in groups],
                  [gender for _, gender in groups]]
    fig5 = go.Figure()
    for service, trace_name in (('DSL', 'DSL'),
                                ('Fiber optic', 'Fiber optic'),
                                ('No', 'No Internet')):
        in_service = df['InternetService'] == service
        heights = [
            int((in_service & (df['Churn'] == churn) & (df['gender'] == gender)).sum())
            for churn, gender in groups
        ]
        fig5.add_trace(go.Bar(x=group_axis, y=heights, name=trace_name))
    fig5.update_layout(title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>")

    pink_purple = {"Yes": "#FF97FF", "No": "#AB63FA"}
    orange_green = {"Yes": '#FFA15A', "No": '#00CC96'}
    green_lime = {"Yes": '#00CC96', "No": '#B6E880'}

    fig6 = _churn_histogram(df, "Dependents", "<b>Dependents distribution</b>",
                            barmode="group", color_map=pink_purple)
    fig7 = _churn_histogram(df, "Partner", "<b>Churn distribution w.r.t. Partners</b>",
                            barmode="group", color_map=orange_green)
    fig8 = _churn_histogram(df, "SeniorCitizen", "<b>Churn distribution w.r.t. Senior Citizen</b>",
                            color_map=green_lime)
    fig9 = _churn_histogram(df, "OnlineSecurity", "<b>Churn distribution w.r.t Online Security</b>",
                            barmode="group", color_map=pink_purple)
    fig10 = _churn_histogram(df, "PaperlessBilling", "<b>Churn distribution w.r.t. Paperless Billing</b>",
                             color_map=orange_green)
    fig11 = _churn_histogram(df, "TechSupport", "<b>Churn distribution w.r.t. Tech Support</b>",
                             barmode="group")
    fig12 = _churn_histogram(df, "PhoneService", "<b>Churn Distribution w.r.t. Phone Service</b>",
                             color_map=green_lime)

    fig13 = px.box(df, x='Churn', y='tenure')
    fig13.update_yaxes(title_text='Tenure (Months)', row=1, col=1)
    fig13.update_xaxes(title_text='Churn', row=1, col=1)
    fig13.update_layout(autosize=True, width=750, height=600,
                        title_font=dict(size=25, family='Courier'),
                        title='<b>Tenure vs Churn</b>',
                        )
    return fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8, fig9, fig10, fig11, fig12, fig13
def take_input(filepath):
    """Load a churn CSV, clean it and build every chart for it.

    Args:
        filepath: path (or file-like object) readable by ``pd.read_csv``.

    Returns:
        The figures returned by ``visualize`` for the cleaned data, in the
        same order, followed by the cleaned DataFrame as the final element.
    """
    cleaned = preprocess(pd.read_csv(filepath))
    figures = visualize(cleaned)
    # Flatten into one tuple: all figures first, then the dataframe.
    return (*figures, cleaned)