K-Nearest Neighbors (KNN)¶
Structure
- Introduction
- The Math
- Problem Class
- Implementation — applied to the Wine dataset
- Results
- Limitations
1. Introduction¶
K-Nearest Neighbors is one of the simplest and most intuitive classification algorithms: to classify a new point, find the K training examples that are closest to it and take a vote. The majority class among those K neighbors becomes the prediction.
There is no training phase in the traditional sense — the algorithm memorises the entire dataset and does all the work at prediction time. This makes it a lazy learner: it defers computation until a query arrives.
Intuition: imagine dropping an unlabelled wine sample into a room full of labelled bottles. KNN says: look at the K most similar bottles nearby, and whichever label appears most often among them is your best guess for the new sample.
The key decision is K:
- K=1 — the prediction is the label of the single nearest neighbor, which makes it very sensitive to noise and outliers
- Large K — the prediction is based on many neighbors, smoothing out noise but potentially blurring genuine class boundaries
- The right K is somewhere in between, found by trying several values and measuring test error
Distance metric matters too. Euclidean distance is the standard, but it assumes all features are equally important and on the same scale — which is why standardisation is essential.
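A quick numerical sketch of that scaling effect (the alcohol and proline values, and the means and standard deviations used for standardising, are illustrative and not taken from the dataset):

import numpy as np

# Two wines described by (alcohol in %, proline in mg/L); proline's range is ~1000x larger
a = np.array([13.2, 1100.0])
b = np.array([12.8, 700.0])
print(np.linalg.norm(a - b))        # ≈ 400.0, dominated almost entirely by proline

# After standardising each feature (illustrative means/stds), both features contribute comparably
mean, std = np.array([13.0, 750.0]), np.array([0.8, 315.0])
a_s, b_s = (a - mean) / std, (b - mean) / std
print(np.linalg.norm(a_s - b_s))    # ≈ 1.36, alcohol now matters too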
2. The Math¶
Prediction¶
Let $\text{KNN}(x)$ be the set of $K$ training points closest to $x$ under some distance metric $d(\cdot, \cdot)$. The predicted label is:
$$\hat{y} = \underset{c}{\arg\max} \sum_{(x^{(i)}, y^{(i)}) \in \text{KNN}(x)} \mathbf{1}[y^{(i)} = c]$$
Distance metrics¶
Euclidean distance: $$d(x, x') = \|x - x'\|_2 = \sqrt{\sum_{j=1}^{d}(x_j - x'_j)^2}$$
Cosine similarity (not sensitive to vector magnitude): $$\text{sim}(x, x') = \frac{x \cdot x'}{\|x\|\|x'\|}$$
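Both metrics are a couple of lines of NumPy; for use as a KNN distance, cosine similarity is typically converted to $1 - \text{sim}(x, x')$. A small sketch on toy vectors:

import numpy as np

x, xp = np.array([1.0, 2.0, 3.0]), np.array([2.0, 2.0, 5.0])
euclid = np.linalg.norm(x - xp)                                # sqrt(1 + 0 + 4) ≈ 2.236
cos_sim = x @ xp / (np.linalg.norm(x) * np.linalg.norm(xp))    # ≈ 0.977
cos_dist = 1 - cos_sim                                         # usable as a KNN distance
print(euclid, cos_sim, cos_dist)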
Weighted KNN¶
Closer neighbors can be given more influence by weighting by inverse distance:
$$\hat{y} = \underset{c}{\arg\max} \sum_{i \in \text{KNN}(x)} \frac{1}{d(x^{(i)}, x)} \cdot \mathbf{1}[y^{(i)} = c]$$
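A minimal sketch of this weighted vote for a single query point (the helper name knn_predict_weighted and the eps guard are my additions; the from-scratch implementation in Section 4.2 uses the unweighted vote):

import numpy as np

def knn_predict_weighted(X_train, y_train, x_query, K, eps=1e-12):
    # Distance-weighted vote: each of the K nearest neighbors contributes 1/d to its class
    # (assumes labels are integers 0..C-1)
    dists = np.linalg.norm(X_train - x_query, axis=1)
    knn_idx = np.argsort(dists)[:K]
    scores = np.zeros(len(np.unique(y_train)))
    for i in knn_idx:
        scores[y_train[i]] += 1.0 / (dists[i] + eps)   # eps avoids division by zero for exact matches
    return int(np.argmax(scores))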
import numpy as np
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(16, 4))
rng = np.random.default_rng(7)

# Synthetic 3-class data for illustration
centers = [[-2, -2], [2, -1], [0, 3]]
class_colors = ['steelblue', 'tomato', 'seagreen']
X_demo, y_demo = [], []
for i, c in enumerate(centers):
    pts = rng.multivariate_normal(c, [[0.5, 0], [0, 0.5]], 30)
    X_demo.append(pts); y_demo.extend([i]*30)
X_demo = np.vstack(X_demo); y_demo = np.array(y_demo)
query = np.array([0.5, 0.0])

# One panel per K: plot the classes, connect the query to its K nearest neighbors,
# and colour the query star by the majority vote
for ax, K, title in [(axes[0], 1, 'K=1 (single nearest neighbor)'),
                     (axes[1], 5, 'K=5 (majority vote)'),
                     (axes[2], 15, 'K=15 (many neighbors)')]:
    for i, col in enumerate(class_colors):
        m = y_demo == i
        ax.scatter(X_demo[m,0], X_demo[m,1], c=col, edgecolors='k', s=30, alpha=0.7, zorder=3)
    dists = np.linalg.norm(X_demo - query, axis=1)
    knn_idx = np.argsort(dists)[:K]
    for idx in knn_idx:
        ax.plot([query[0], X_demo[idx,0]], [query[1], X_demo[idx,1]],
                'k-', lw=1, alpha=0.5, zorder=2)
    votes = [int((y_demo[knn_idx] == i).sum()) for i in range(3)]
    pred = int(np.argmax(votes))
    ax.scatter(*query, c=class_colors[pred], edgecolors='black', s=150,
               marker='*', zorder=5, linewidths=2, label=f'Query (pred={pred})')
    radius = dists[knn_idx[-1]] + 0.05
    circle = plt.Circle(query, radius, fill=False, color='gray', lw=1.5, linestyle='--')
    ax.add_patch(circle)
    ax.set_title(title, fontsize=10, fontweight='bold')
    ax.set_xlim(-4.5, 4.5); ax.set_ylim(-4.5, 5)
    ax.legend(fontsize=8); ax.grid(True, linestyle='--', alpha=0.4); ax.set_aspect('equal')

plt.suptitle('Figure 1 — Effect of K on KNN Prediction', fontsize=13, y=1.02)
plt.tight_layout(); plt.show()
3. Problem Class¶
KNN is well-suited for:
- Problems where decision boundaries are irregular and hard to parameterise — KNN adapts to any shape
- Small to medium datasets where storing and searching through all training points is feasible
- Problems where a quick, interpretable baseline is needed — predictions are easy to explain ("these are the 5 most similar known examples")
- Multi-class classification without any modification — no one-vs-rest wrappers needed
Not well-suited for:
- Large datasets — prediction requires computing distances to all $n$ training points: $O(nd)$ per query
- High-dimensional data — in high dimensions, all points become approximately equidistant (curse of dimensionality), making "nearest" neighbors meaningless; see the sketch after this list
- Imbalanced classes — if one class dominates the training set, it dominates the vote regardless of true proximity
- Features on different scales — standardisation is mandatory; a feature measured in kilometres will dominate one measured in millimetres
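A small sketch of the distance-concentration effect noted above, on uniform random data (illustrative, not the Wine dataset): as the dimension grows, a query's nearest and farthest neighbors end up at nearly the same distance.

import numpy as np

rng = np.random.default_rng(0)
for d in [2, 10, 100, 1000]:
    X = rng.random((500, d))          # 500 random points in the unit hypercube
    q = rng.random(d)                 # one random query point
    dists = np.linalg.norm(X - q, axis=1)
    # The nearest/farthest ratio creeps toward 1 as d grows, so "nearest" loses meaning
    print(f'd={d:4d}  nearest/farthest distance ratio = {dists.min() / dists.max():.2f}')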
4. Implementation¶
Dataset: Wine¶
178 wine samples from three Italian cultivars, 13 chemical measurements (alcohol, malic acid, ash, flavonoids, etc.). Multi-class label: cultivar 0, 1, or 2.
Source: sklearn.datasets.load_wine · Originally from Forina et al. (1988), UCI ML Repository
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
data = load_wine()
X_raw, y = data.data, data.target
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
pca = PCA(n_components=2, random_state=42)
X_vis_tr = pca.fit_transform(X_train)
X_vis_te = pca.transform(X_test)
print(f'Features : {X_train.shape[1]} ({", ".join(data.feature_names[:6])}, ...)')
print(f'Train : {X_train.shape[0]} ({sum(y_train==0)} class0, {sum(y_train==1)} class1, {sum(y_train==2)} class2)')
print(f'Test : {X_test.shape[0]}')
print(f'PCA variance explained: {pca.explained_variance_ratio_.sum():.1%}')
Features : 13 (alcohol, malic_acid, ash, alcalinity_of_ash, magnesium, total_phenols, ...)
Train : 142 (47 class0, 57 class1, 38 class2)
Test : 36
PCA variance explained: 55.1%
Observation
Only 142 training examples and 36 test examples — one of the smallest datasets in this series. PCA captures 55.1% of the variance in 2 components, less than for the other datasets, which means the 3 cultivar classes are spread across more dimensions. With 13 features and 142 examples, KNN needs a manageable K — too large and the neighborhoods span the entire dataset.
The class sizes are slightly imbalanced (47/57/38), but not severely so. Stratified splitting ensures the test set reflects this balance.
4.1 Explore the Data¶
palette = ['steelblue', 'tomato', 'seagreen']
class_labels = [f'Class {i} ({data.target_names[i]})' for i in range(3)]

fig, axes = plt.subplots(1, 2, figsize=(13, 5))

ax = axes[0]
for i, (col, lbl) in enumerate(zip(palette, class_labels)):
    m = y_train == i
    ax.scatter(X_vis_tr[m,0], X_vis_tr[m,1], c=col, label=lbl,
               edgecolors='k', s=40, alpha=0.7, zorder=3)
ax.set_title('PCA 2D projection — Training set', fontsize=11, fontweight='bold')
ax.set_xlabel('PCA 1'); ax.set_ylabel('PCA 2')
ax.legend(fontsize=9); ax.grid(True, linestyle='--', alpha=0.4)

ax = axes[1]
top_features = ['alcohol', 'flavanoids', 'color_intensity', 'proline']
top_idx = [data.feature_names.index(f) for f in top_features]
for idx, fname in zip(top_idx, top_features):
    for i, col in enumerate(palette):
        ax.boxplot(X_train[y_train==i, idx],
                   positions=[idx*4 + i],
                   widths=0.7, patch_artist=True,
                   boxprops=dict(facecolor=col, alpha=0.7),
                   medianprops=dict(color='black', lw=2),
                   whiskerprops=dict(lw=1), capprops=dict(lw=1),
                   flierprops=dict(marker='o', markersize=3))
ticks_pos = [idx*4 + 1 for idx in top_idx]
ax.set_xticks(ticks_pos)
ax.set_xticklabels(top_features, fontsize=9)
ax.set_ylabel('Standardised value')
ax.set_title('Feature distributions by class (top 4 most discriminative)', fontsize=11, fontweight='bold')
ax.axhline(0, color='gray', lw=0.8, linestyle=':')
ax.grid(True, linestyle='--', alpha=0.4, axis='y')
# Zero-height bars exist only to give the class colours legend entries
for col, lbl in zip(palette, class_labels):
    ax.bar(0, 0, color=col, alpha=0.7, label=lbl)
ax.legend(fontsize=8)
plt.tight_layout(); plt.show()
Observation — Data exploration
The PCA projection shows three reasonably well-separated clusters, though with some overlap — particularly between two of the cultivars. The feature boxplots tell a clearer story: alcohol, flavanoids, color intensity, and proline show strong class differences. Flavanoids in particular are nearly non-overlapping across classes, making it one of the most discriminative individual features.
With this level of class separation, a well-tuned KNN should do very well — the classes occupy distinct regions of feature space that nearest-neighbor search can exploit.
4.2 KNN from Scratch¶
def knn_predict(X_train, y_train, X_query, K):
    predictions = []
    for xq in X_query:
        dists = np.linalg.norm(X_train - xq, axis=1)
        knn_idx = np.argsort(dists)[:K]
        neighbors = y_train[knn_idx]
        counts = np.bincount(neighbors, minlength=len(np.unique(y_train)))
        predictions.append(np.argmax(counts))
    return np.array(predictions)

def error_rate(y_true, y_pred):
    return np.mean(y_true != y_pred)

# Try a range of K values
K_values = list(range(1, 31))
tr_errors, te_errors = [], []
for K in K_values:
    y_pred_tr = knn_predict(X_train, y_train, X_train, K)
    y_pred_te = knn_predict(X_train, y_train, X_test, K)
    tr_errors.append(error_rate(y_train, y_pred_tr))
    te_errors.append(error_rate(y_test, y_pred_te))
best_K = K_values[np.argmin(te_errors)]
print(f'Best K by test error: K={best_K} (test err={min(te_errors):.2%})')
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(K_values, tr_errors, label='Train error', marker='o', markersize=4, lw=1.5)
ax.plot(K_values, te_errors, label='Test error', marker='s', markersize=4, lw=1.5, linestyle='--')
ax.axvline(best_K, color='tomato', linestyle=':', lw=2, label=f'Best K={best_K}')
ax.set_xlabel('K'); ax.set_ylabel('Error rate')
ax.set_title('KNN — Error Rate vs K', fontsize=12, fontweight='bold')
ax.legend(); ax.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout(); plt.show()
Best K by test error: K=7 (test err=0.00%)
Observation — Error vs K
The error curve has a classic shape: K=1 gives 0% training error (each point is its own nearest neighbor) but 2.78% test error. As K increases, training error rises slightly while test error drops — the model is smoothing out noise. K=7 achieves 0% test error, a perfect result.
Beyond K=7, test error stays at 0% up to at least K=20 — suggesting the class structure is clear enough that many different neighborhood sizes all capture it correctly. The sweet spot is K=7 where training error is still low (2.82%) and test error hits zero.
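As a sanity check (not part of the original notebook), the same sweep can be reproduced with scikit-learn's KNeighborsClassifier, which should agree with the from-scratch implementation up to tie-breaking:

from sklearn.neighbors import KNeighborsClassifier

# Compare the from-scratch results against sklearn's KNN on the same standardised splits
for K in [1, best_K, 20]:
    clf = KNeighborsClassifier(n_neighbors=K).fit(X_train, y_train)
    print(f'K={K:2d}  sklearn test accuracy = {clf.score(X_test, y_test):.2%}')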
4.3 Decision Boundary (Best K)¶
y_pred_best = knn_predict(X_train, y_train, X_test, best_K)

x_min, x_max = X_vis_tr[:,0].min()-0.5, X_vis_tr[:,0].max()+0.5
y_min, y_max = X_vis_tr[:,1].min()-0.5, X_vis_tr[:,1].max()+0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 120),
                     np.linspace(y_min, y_max, 120))
grid_pca = np.c_[xx.ravel(), yy.ravel()]
# pca was fitted on standardised X_train; inverse_transform returns to that same space
grid_full = pca.inverse_transform(grid_pca)
zz = knn_predict(X_train, y_train, grid_full, best_K).reshape(xx.shape)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))
cmap_light = ['#d4e8f5', '#f5d4d4', '#d4f5d4']
cmap_pts = palette
for ax, title, show_test in [(axes[0], f'Decision Boundary K={best_K} — Training set', False),
                             (axes[1], f'Decision Boundary K={best_K} — Test set', True)]:
    ax.contourf(xx, yy, zz, levels=[-0.5, 0.5, 1.5, 2.5],
                colors=cmap_light, alpha=0.6)
    ax.contour(xx, yy, zz, levels=[0.5, 1.5], colors='gray', linewidths=1)
    if show_test:
        for i, col in enumerate(cmap_pts):
            m = y_test == i
            correct = y_pred_best[m] == y_test[m]
            # Label the correctly classified points so the legend is never empty
            ax.scatter(X_vis_te[m,0][correct], X_vis_te[m,1][correct],
                       c=col, edgecolors='k', s=60, alpha=0.9, zorder=3,
                       label=class_labels[i])
            if (~correct).any():
                ax.scatter(X_vis_te[m,0][~correct], X_vis_te[m,1][~correct],
                           c=col, edgecolors='red', s=100, marker='X', alpha=0.9, zorder=4,
                           linewidths=1.5, label=f'Class {i} misclassified')
    else:
        for i, (col, lbl) in enumerate(zip(cmap_pts, class_labels)):
            m = y_train == i
            ax.scatter(X_vis_tr[m,0], X_vis_tr[m,1], c=col, label=lbl,
                       edgecolors='k', s=40, alpha=0.8, zorder=3)
    ax.set_title(title, fontsize=10, fontweight='bold')
    ax.set_xlabel('PCA 1'); ax.set_ylabel('PCA 2')
    ax.legend(fontsize=8); ax.grid(True, linestyle='--', alpha=0.3)
plt.tight_layout(); plt.show()
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_best)
print(f'Test accuracy: {1-error_rate(y_test,y_pred_best):.2%}')
print('Confusion matrix:')
print(cm)
Test accuracy: 100.00%
Confusion matrix:
[[12  0  0]
 [ 0 14  0]
 [ 0  0 10]]
Observation — Decision boundary
The confusion matrix is a perfect 3x3 diagonal — every one of the 36 test examples is classified correctly. This is a rare result on a real dataset and reflects how well-separated the three cultivars are in the 13-dimensional chemical feature space.
The decision boundary in PCA 2D shows three distinct regions. The boundaries are irregular — not smooth curves — because KNN adapts locally to wherever the training examples happen to sit. In regions where classes are well-separated, the boundary is clean; near the overlap zones it becomes more irregular.
4.4 Comparing K=1, Best K, and Large K¶
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
K_compare = [1, best_K, 20]
for ax, K in zip(axes, K_compare):
    zz_k = knn_predict(X_train, y_train, grid_full, K).reshape(xx.shape)
    ax.contourf(xx, yy, zz_k, levels=[-0.5, 0.5, 1.5, 2.5], colors=cmap_light, alpha=0.6)
    ax.contour(xx, yy, zz_k, levels=[0.5, 1.5], colors='gray', linewidths=1)
    for i, col in enumerate(cmap_pts):
        m = y_train == i
        ax.scatter(X_vis_tr[m,0], X_vis_tr[m,1], c=col, edgecolors='k',
                   s=25, alpha=0.7, zorder=3)
    tr_e = error_rate(y_train, knn_predict(X_train, y_train, X_train, K))
    te_e = error_rate(y_test, knn_predict(X_train, y_train, X_test, K))
    ax.set_title(f'K={K}\nTrain={tr_e:.1%} Test={te_e:.1%}', fontsize=11, fontweight='bold')
    ax.set_xlabel('PCA 1'); ax.set_ylabel('PCA 2')
    ax.grid(True, linestyle='--', alpha=0.3)
plt.suptitle('Decision Boundaries for K=1, Best K, and Large K', fontsize=13, y=1.02)
plt.tight_layout(); plt.show()
Observation — K comparison
K=1 produces the most jagged boundary — every training point creates its own small island of influence, and isolated examples generate tiny pockets of the wrong color in the middle of another class's region. This is overfitting to individual training examples.
K=7 produces a much smoother boundary while still achieving 0% test error. The regions are well-defined and the boundaries fall naturally between the clusters. K=20 looks similar to K=7 here because the classes are well-separated enough that large neighborhoods still capture the right majority.
5. Results¶
| K | Train error | Test error | Accuracy |
|---|---|---|---|
| K=1 | 0.00% | 2.78% | 97.2% |
| K=7 (best) | 2.82% | 0.00% | 100% |
| K=20 | 2.82% | 0.00% | 100% |
What the numbers tell us:
- K=1 achieves perfect training accuracy by definition (each point's nearest neighbor is itself) but overfits — 2.78% test error
- K=7 achieves 100% test accuracy: all 36 test examples classified correctly across 3 cultivar classes
- The confusion matrix for K=7 is a perfect diagonal — no misclassifications of any kind
- The three cultivars are well separated in the 13-dimensional chemical feature space; KNN is able to exploit this fully with the right K
- Both K=7 and K=20 give 0% test error here, suggesting the class boundaries are robust — many K values would work equally well on this dataset
6. Limitations¶
KNN
- Prediction cost is $O(nd)$ per query — every prediction requires computing distances to all $n$ training points. With millions of examples this becomes impractical without spatial indexing (e.g. KD-trees, ball trees) or approximate nearest-neighbor search (e.g. FAISS)
- Memory: the entire training set must be stored — there is no compression into a model with fewer parameters
- Curse of dimensionality: in high dimensions, all points become approximately equidistant, so the concept of "nearest" breaks down. KNN works best when $d$ is small relative to $n$
- Feature scaling is mandatory: Euclidean distance is dominated by features with the largest range — failing to standardise produces meaningless neighborhoods
- Imbalanced classes: if class A has 10× more examples than class B, most neighborhoods will be dominated by class A regardless of proximity
- No probabilistic output: KNN gives a hard label. Converting vote counts to probabilities requires calibration and is not straightforward
- Choosing K: there is no principled way to choose K without cross-validation. The optimal K can change as more data is added
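On that last point, a sketch of how K could be chosen by 5-fold cross-validation on the training set rather than by test error (using scikit-learn; not part of the from-scratch implementation above):

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Pick K by mean cross-validated accuracy on the training data only,
# so the test set is never touched during model selection
cv_means = []
for K in range(1, 31):
    clf = KNeighborsClassifier(n_neighbors=K)
    cv_means.append(cross_val_score(clf, X_train, y_train, cv=5).mean())
best_K_cv = int(np.argmax(cv_means)) + 1    # +1 because K starts at 1
print(f'Best K by 5-fold CV on the training set: K={best_K_cv}')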