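"""Interactive Streamlit app for exploring K-Nearest Neighbors: generate a
synthetic 2-D dataset, visualize the classifier's decision boundary, and plot
the bias-variance tradeoff as K varies."""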
import numpy as np
import matplotlib.pyplot as plt
import streamlit as st
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from mlxtend.evaluate import bias_variance_decomp

# Use a dark background style for plots
plt.style.use('dark_background')

# Function to generate custom data
def generate_data(n_classes, n_samples, pattern='Linear'):
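    """Generate a synthetic 2-D dataset with `n_classes` classes and
    `n_samples` points per class, arranged in the requested `pattern`.

    Returns (X, y): an (n_classes*n_samples, 2) feature array and the
    matching integer class labels.
    """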
    X = np.zeros((n_classes*n_samples, 2))
    y = np.zeros(n_classes*n_samples, dtype='uint8')
    for j in range(n_classes):
        ix = range(n_samples*j, n_samples*(j+1))
        if pattern == 'Spiral':
            r = np.linspace(0.0, 1, n_samples) # radius
            t = np.linspace(j*4, (j+1)*4, n_samples) + np.random.randn(n_samples)*0.2 # theta
            X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
        elif pattern == 'Linear':
            X[ix] = np.random.rand(n_samples, 2) * [j * 2, 1] + np.random.randn(n_samples, 2) * 0.2
        elif pattern == 'Concentric Circle':
            t = np.linspace(0, 2*np.pi, n_samples)
            r = j/float(n_classes) + np.random.randn(n_samples)*0.1
            X[ix] = np.c_[r*np.cos(t), r*np.sin(t)]
        elif pattern == 'Blob':
            t = np.linspace(0, 2*np.pi, n_samples)
            r = 0.8 + np.random.randn(n_samples)*0.1
            X[ix] = np.c_[r*np.cos(t), r*np.sin(t)] + np.random.randn(n_samples, 2)*0.2
        elif pattern == 'Crescent':
            # Sweep the radius out and back so each class traces a crescent arc;
            # sizing the second half as n_samples - half_samples keeps the
            # shapes consistent even when n_samples is odd
            half_samples = n_samples // 2
            theta = np.linspace(j * np.pi, (j + 2) * np.pi, n_samples)
            r = np.linspace(1.0, 2.5, half_samples)
            r = np.concatenate((r, np.linspace(2.5, 1.0, n_samples - half_samples)))
            X[ix] = np.c_[r*np.sin(theta), r*np.cos(theta)]
        elif pattern in ('Normal', 'Random'):
            # Gaussian cluster per class, shifted by a random class-dependent
            # offset; the two patterns used identical formulas, and the old
            # 'Normal' branch shadowed the loop variable j and returned early
            X[ix] = np.random.randn(n_samples, 2)*0.5 + np.random.randn(2)*j*2
        else:
            raise ValueError('Invalid pattern: {}'.format(pattern))
        y[ix] = j
    return X, y

# Function to plot decision boundary and calculate model evaluation metrics
def keffect(k):
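    """Fit a KNN classifier with `k` neighbors on freshly generated data and
    plot its decision boundary.

    Returns (fig, [accuracy, mse, bias, variance]). Relies on the Streamlit
    sidebar globals: pattern, num_classes, num_data_points, fig_width,
    fig_height, and selected_alpha.
    """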
    X, y = generate_data(num_classes, num_data_points, pattern=pattern)
    
    knn = KNeighborsClassifier(n_neighbors=k)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
    knn.fit(X_train,y_train)
    y_pred = knn.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
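    # bias_variance_decomp refits the model on num_rounds bootstrap draws of
    # the training set and averages loss, bias, and variance over the test set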
    mse, bias, var = bias_variance_decomp(knn, X_train, y_train, X_test, y_test, loss='mse', num_rounds=200, random_seed=1)
    
    # Create a meshgrid for decision boundary plotting
    a = np.arange(start=X_train[:,0].min()-1, stop=X_train[:,0].max()+1, step=0.01)
    b = np.arange(start=X_train[:,1].min()-1, stop=X_train[:,1].max()+1, step=0.01)
    XX, YY = np.meshgrid(a, b)
    input_array = np.array([XX.ravel(), YY.ravel()]).T
    labels = knn.predict(input_array)
    
    # Plot decision boundary
    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
    ax.set_facecolor('#FFF')
    ax.contourf(XX, YY, labels.reshape(XX.shape), alpha=selected_alpha, cmap='Set1')
    ax.scatter(X[:,0], X[:,1], c=y, cmap='Set1', edgecolors='black')
    ax.set_title('K-Nearest Neighbors (K = {})'.format(k), color='white')
    ax.set_xlabel('Feature 1', color='white')
    ax.set_ylabel('Feature 2', color='white')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')

    # Hide all four axis spines
    for spine in ax.spines.values():
        spine.set_visible(False)

    result = [accuracy, mse, bias, var]
    return fig, result

# Function to plot bias-variance tradeoff
def plot_bias_variance_tradeoff(start_value, end_value):
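    """Estimate MSE, bias, and variance for each K from start_value to
    end_value (inclusive) via mlxtend's bias_variance_decomp, and plot all
    three curves against K.
    """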
    X, y = generate_data(num_classes, num_data_points, pattern=pattern)

    # Hold out a test set so bias/variance are not estimated on the same data
    # the model is trained on (mirrors the split used in keffect)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

    ks = range(start_value, end_value + 1)  # include the selected end of the range
    mse, bias, var = [], [], []
    for k in ks:
        knn = KNeighborsClassifier(n_neighbors=k)
        mse_k, bias_k, var_k = bias_variance_decomp(knn, X_train, y_train, X_test, y_test, loss='mse', num_rounds=200, random_seed=1)
        mse.append(mse_k)
        bias.append(bias_k)
        var.append(var_k)

    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
    ax.plot(ks, mse, label='MSE', color='crimson')
    ax.plot(ks, bias, label='Bias', color='magenta')
    ax.plot(ks, var, label='Variance', color='cyan')
    ax.legend()
    ax.set_title('Bias-Variance Tradeoff', color='white')
    ax.set_xlabel('Number of Neighbors (K)', color='white')
    ax.set_ylabel('Error', color='white')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    ax.set_xticks(list(range(start_value, end_value, 5)) + [end_value])
    ax.set_facecolor('#000')

    return fig

# Create a streamlit app to interact with the functions
st.set_page_config(page_title='K-Nearest Neighbors', layout='wide')
st.title('K-Nearest Neighbors')

with st.sidebar:
    # Set up Streamlit sidebar
    st.sidebar.header("Plot Settings")
    [fig_width, fig_height] = [st.sidebar.slider(label, 1, 20, default) for label, default in [("Figure Width", 10), ("Figure Height", 6)]]
    selected_alpha = st.sidebar.slider('Select the transparency of the decision boundary', min_value=0.0, max_value=1.0, value=0.5, step=0.1)

    st.write("---")
    st.sidebar.header("Data Settings")
    pattern = st.selectbox('Select a pattern', ['Linear', 'Concentric Circle', 'Spiral', 'Blob', 'Crescent', 'Normal', 'Random'])
    num_classes = st.slider('Select the number of classes', min_value=2, max_value=10, value=2, step=1)
    num_data_points = st.slider('Select the number of data points', min_value=20, max_value=200, value=40, step=20)

    st.write("---")
    st.sidebar.header("Select the number of neighbors (K)")
    selected_k = st.slider(label="", min_value=1, max_value=50, value=3, step=1)
    
    st.write("---")
    st.sidebar.header("Select a range for bias-variance tradeoff")
    range_slider = st.slider(
        label="",
        min_value=1,
        max_value=50,
        value=(1, 20),
        step=1
    )
    start_value, end_value = range_slider

    st.write("---")

if st.button('Get Decision Boundary'):
    # Cap K at the number of points per class to avoid asking for more
    # neighbors than the training data can supply
    fig, result = keffect(min(selected_k, num_data_points))
    st.pyplot(fig)

    st.write('Model evaluation metrics')
    accuracy, mse, bias, var = result
    st.write('Accuracy:', round(accuracy, 3))
    st.write('MSE:', round(mse, 3))
    st.write('Bias:', round(bias, 3))
    st.write('Variance:', round(var, 3))

if st.button('Get Bias-Variance Tradeoff'):
    fig2 = plot_bias_variance_tradeoff(min(start_value, num_data_points), min(end_value, num_data_points))
    st.pyplot(fig2)
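
# To run locally (assuming this file is saved as app.py):
#   pip install streamlit scikit-learn mlxtend matplotlib
#   streamlit run app.py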