Kernel Perceptron¶


Structure

  1. Introduction
  2. The Math
  3. Problem Class
  4. Implementation — applied to the Banknote Authentication dataset
  5. Results
  6. Limitations

1. Introduction¶

A linear classifier can only draw straight decision boundaries. When two classes are arranged in a pattern that no straight line can separate — a ring inside a ring, or two interleaved crescents — the linear perceptron is doomed to fail regardless of how many passes it makes through the data.

The kernel trick solves this without ever explicitly computing a higher-dimensional feature space. The key observation is that the perceptron's weight vector can always be written as a weighted sum of the training examples it misclassified. That means the prediction rule only ever needs dot products between data points — and a kernel function $K(x, x')$ can replace that dot product with a richer similarity measure that implicitly corresponds to a much higher (even infinite) dimensional space.

The RBF (Radial Basis Function) kernel with bandwidth $\sigma$:

$$K(x, x') = \exp\!\left(-\frac{\|x - x'\|^2}{2\sigma^2}\right)$$

measures similarity based on distance. Points that are close together get a similarity near 1; points that are far apart get a similarity near 0. This lets the kernel perceptron draw smooth, curved decision boundaries that can wrap around clusters in ways a straight line never could.

Intuition: instead of explicitly mapping each point into a high-dimensional space, you tell the algorithm how similar any two points are — and let it figure out the boundary from those similarities alone.
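That similarity-only view is easy to verify numerically — a minimal sketch (bandwidth 1 assumed, matching the implementation later in this notebook):

```python
import numpy as np

def rbf(x, xp, bandwidth=1.0):
    """RBF similarity: exactly 1 for identical points, decaying toward 0 with distance."""
    return np.exp(-0.5 * np.sum((x - xp) ** 2) / bandwidth ** 2)

anchor = np.array([0.0, 0.0])
print(rbf(anchor, np.array([0.0, 0.0])))  # identical point -> 1.0
print(rbf(anchor, np.array([0.1, 0.0])))  # nearby point    -> ~0.995
print(rbf(anchor, np.array([5.0, 0.0])))  # distant point   -> ~3.7e-06
```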

2. The Math¶

Kernel Perceptron representation¶

The weight vector of a perceptron can be written as:

$$\theta = \sum_{j=1}^{n} \alpha_j y^{(j)} x^{(j)}$$

where $\alpha_j$ counts how many times example $j$ was misclassified. The prediction becomes:

$$h(x) = \text{sign}\!\left(\sum_{j=1}^{n} \alpha_j y^{(j)} (x^{(j)} \cdot x)\right)$$

Kernel substitution¶

Replace the dot product $x^{(j)} \cdot x$ with a kernel function $K(x^{(j)}, x)$:

$$h(x) = \text{sign}\!\left(\sum_{j=1}^{n} \alpha_j y^{(j)} K(x^{(j)}, x)\right)$$

Update rule¶

When example $i$ is misclassified ($y^{(i)} \sum_j \alpha_j y^{(j)} K(x^{(j)}, x^{(i)}) \leq 0$):

$$\alpha_i \leftarrow \alpha_i + 1$$

No explicit weight vector is ever computed — only the $\alpha$ counts.
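To see the update rule in action, a self-contained toy sketch (separate from the banknote experiment below): XOR-labelled points defeat any linear perceptron, yet the $\alpha$-count updates with an RBF kernel (bandwidth 1) fit them in a couple of passes.

```python
import numpy as np

# XOR labels: not linearly separable in 2D
X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y = np.array([-1., 1., 1., -1.])

# Precompute the Gram matrix K[i, j] = K(x_i, x_j), bandwidth 1
K = np.exp(-0.5 * ((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=2))

alpha = np.zeros(len(X))
for _ in range(20):                        # a few passes is plenty here
    for i in range(len(X)):
        if y[i] * np.sum(alpha * y * K[i]) <= 0:
            alpha[i] += 1                  # the only state the model keeps

pred = np.sign((alpha * y) @ K)
print(pred)  # [-1.  1.  1. -1.] — matches y
```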

RBF Kernel¶

$$K(x, x') = \exp\!\left(-\frac{\|x - x'\|^2}{2\sigma^2}\right)$$

The bandwidth $\sigma$ controls how quickly similarity decays with distance. A smaller bandwidth makes the kernel more localised — each training point influences a smaller region.
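For a feel of the numbers, a quick sketch of the similarity of two points one unit apart, using the same bandwidths swept in Section 4.5:

```python
import numpy as np

# K(x, x') for two points at distance 1, as bandwidth varies:
# smaller bandwidth -> faster decay -> more localised influence.
for bw in [0.1, 0.5, 1.0, 5.0]:
    k = np.exp(-0.5 * 1.0 / bw ** 2)
    print(f'bandwidth={bw:<4}  K at distance 1 = {k:.4f}')
# prints 0.0000, 0.1353, 0.6065, 0.9802
```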

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# ── Diagram 1: linear vs non-linear separation ────────────────────────────────
ax = axes[0]
rng = np.random.default_rng(0)
theta = np.linspace(0, 2*np.pi, 50)
r_in, r_out = 1.0, 2.2
Xp = np.column_stack([r_in*np.cos(theta) + rng.normal(0,.15,50),
                       r_in*np.sin(theta) + rng.normal(0,.15,50)])
Xn = np.column_stack([r_out*np.cos(theta) + rng.normal(0,.15,50),
                       r_out*np.sin(theta) + rng.normal(0,.15,50)])
ax.scatter(Xp[:,0], Xp[:,1], c='steelblue', edgecolors='k', s=30, label='y=+1')
ax.scatter(Xn[:,0], Xn[:,1], c='tomato',    edgecolors='k', s=30, label='y=-1')
ax.plot([-3,3],[0,0],'k--',lw=1.5,alpha=0.5,label='Any linear boundary')
ax.set_title('Linear boundary cannot separate circles', fontsize=10, fontweight='bold')
ax.legend(fontsize=8); ax.set_aspect('equal'); ax.grid(True,linestyle='--',alpha=0.4)

# ── Diagram 2: RBF kernel similarity heatmap ─────────────────────────────────
ax = axes[1]
x1 = np.linspace(-3, 3, 100)
x2 = np.linspace(-3, 3, 100)
X1, X2 = np.meshgrid(x1, x2)
anchor = np.array([0.0, 0.0])
dist2 = (X1 - anchor[0])**2 + (X2 - anchor[1])**2
K = np.exp(-0.5 * dist2)
c = ax.contourf(X1, X2, K, levels=20, cmap='Blues')
plt.colorbar(c, ax=ax)
ax.scatter(*anchor, c='red', s=100, zorder=5, label="Anchor x'")
ax.set_title("RBF kernel K(x,x') - Similarity decays with distance", fontsize=10, fontweight='bold')
ax.set_xlabel('x1'); ax.set_ylabel('x2')
ax.legend(fontsize=8); ax.grid(True,linestyle='--',alpha=0.3)

# ── Diagram 3: kernel decision boundary concept ───────────────────────────────
ax = axes[2]
ax.scatter(Xp[:,0], Xp[:,1], c='steelblue', edgecolors='k', s=30, label='y=+1')
ax.scatter(Xn[:,0], Xn[:,1], c='tomato',    edgecolors='k', s=30, label='y=-1')
circle = plt.Circle((0,0), 1.6, fill=False, color='black', lw=2, linestyle='-', label='Kernel boundary')
ax.add_patch(circle)
ax.set_title('Kernel perceptron: curved boundaries', fontsize=10, fontweight='bold')
ax.legend(fontsize=8); ax.set_aspect('equal'); ax.grid(True,linestyle='--',alpha=0.4)

plt.suptitle('Figure 1 — Kernel Perceptron Concepts', fontsize=13, y=1.02)
plt.tight_layout(); plt.show()

3. Problem Class¶

Kernel classifiers are well-suited for:

  • Data that is not linearly separable — the key use case for kernels
  • Problems where the number of features is small but the feature interactions are complex
  • Datasets where a similarity-based notion of class membership makes intuitive sense
  • Smaller datasets — kernel methods scale as $O(n^2)$ in memory and $O(n^2 d)$ in computation, so they become impractical for very large $n$

RBF kernel specifically:

  • Works well when decision boundaries are smooth and localised
  • Sensitive to feature scale — features must be standardised or the kernel distances are dominated by the largest-scale feature

Not well-suited for:

  • Large datasets — storing and computing the $n \times n$ kernel matrix becomes prohibitive
  • Very high-dimensional data where Euclidean distance loses meaning (curse of dimensionality)
  • Problems where interpretability matters — there is no simple weight vector to inspect

4. Implementation¶

Dataset: Banknote Authentication¶

1372 banknotes (genuine and forged), 4 features extracted from wavelet transforms of banknote images: variance, skewness, curtosis, and entropy. Binary label: genuine (+1) or forged (-1).

Source: UCI Machine Learning Repository — Banknote Authentication · available via sklearn.datasets.fetch_openml

In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

data = fetch_openml(name='banknote-authentication', version=1, as_frame=False, parser='auto')
X_raw = data.data.astype(float)
y_raw = data.target.astype(float)
y = np.where(y_raw == 1, 1, -1)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test  = scaler.transform(X_test_raw)

pca = PCA(n_components=2, random_state=42)
X_vis = pca.fit_transform(X_train)

print(f'Features  : {X_train.shape[1]}  (variance, skewness, curtosis, entropy)')
print(f'Train     : {X_train.shape[0]}  ({(y_train==1).sum()} genuine, {(y_train==-1).sum()} forged)')
print(f'Test      : {X_test.shape[0]}')
print(f'PCA variance explained: {pca.explained_variance_ratio_.sum():.1%}')
Features  : 4  (variance, skewness, curtosis, entropy)
Train     : 1097  (609 genuine, 488 forged)
Test      : 275
PCA variance explained: 86.9%

Observation

86.9% of variance captured in just 2 PCA components from 4 features — the data has very strong structure. The wavelet-based features (variance, skewness, curtosis, entropy of pixel intensities) are clearly highly informative. With 1097 training examples and only 4 features, this is a comfortable regime for a kernel method — large enough to train reliably, small enough that the O(n²) kernel matrix is tractable.

4.1 Explore the Data¶

In [3]:
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
feature_names = ['variance', 'skewness', 'curtosis', 'entropy']

ax = axes[0]
colors = {1: 'steelblue', -1: 'tomato'}
labels = {1: 'Genuine (+1)', -1: 'Forged (-1)'}
for lbl in [1, -1]:
    m = y_train == lbl
    ax.scatter(X_vis[m,0], X_vis[m,1], c=colors[lbl], label=labels[lbl],
               edgecolors='k', s=30, alpha=0.6, zorder=3)
ax.set_title('PCA 2D projection of training set', fontsize=11, fontweight='bold')
ax.set_xlabel('PCA 1'); ax.set_ylabel('PCA 2')
ax.legend(); ax.grid(True, linestyle='--', alpha=0.4)

ax = axes[1]
width = 0.35
x = np.arange(len(feature_names))
means_p = [X_train[y_train==1, i].mean() for i in range(4)]
means_n = [X_train[y_train==-1, i].mean() for i in range(4)]
ax.bar(x - width/2, means_p, width, label='Genuine (+1)', color='steelblue', alpha=0.8, edgecolor='k')
ax.bar(x + width/2, means_n, width, label='Forged (-1)',  color='tomato',    alpha=0.8, edgecolor='k')
ax.set_xticks(x); ax.set_xticklabels(feature_names)
ax.set_ylabel('Mean (standardised)')
ax.set_title('Mean feature value by class', fontsize=11, fontweight='bold')
ax.legend(); ax.grid(True, linestyle='--', alpha=0.4, axis='y')
ax.axhline(0, color='black', lw=0.8)

plt.tight_layout(); plt.show()

Observation — Data exploration

The PCA projection shows two largely separated clusters with a region of overlap that a straight line cannot fully resolve. The mean feature values by class show clear differences, especially in variance and skewness — genuine banknotes tend to have more regular pixel intensity distributions than forged ones, which shows up as class-level differences in skewness and curtosis.

4.2 Linear Perceptron Baseline¶

In [4]:
def linear_perceptron(X, y, X_te, y_te, T=20):
    n, d = X.shape
    theta, theta0 = np.zeros(d), 0.0
    tr_hist, te_hist = [], []
    for _ in range(T):
        for i in range(n):
            if y[i] * (theta @ X[i] + theta0) <= 0:
                theta  += y[i] * X[i]
                theta0 += y[i]
        tr_hist.append(np.mean(np.sign(X    @ theta + theta0) != y))
        te_hist.append(np.mean(np.sign(X_te @ theta + theta0) != y_te))
    return theta, theta0, tr_hist, te_hist

theta_lin, theta0_lin, tr_lin, te_lin = linear_perceptron(
    X_train, y_train, X_test, y_test, T=20)

print(f'Linear Perceptron — train err: {tr_lin[-1]:.2%}   test err: {te_lin[-1]:.2%}')
Linear Perceptron — train err: 0.64%   test err: 1.45%

Observation — Linear perceptron

The linear perceptron already does well — 1.45% test error on a real-world authentication task is respectable. But the 0.64% training error shows it still makes mistakes even on the data it was trained on, meaning the data is not perfectly linearly separable in the 4-dimensional feature space: some forged and genuine notes overlap in the linear sense. This is exactly where a kernel should help.

4.3 Kernel Perceptron (RBF)¶

In [5]:
def rbf_kernel_matrix(X, Y, bandwidth=1.0):
    diff = X[:, np.newaxis, :] - Y[np.newaxis, :, :]
    return np.exp(-0.5 / bandwidth**2 * np.sum(diff**2, axis=2))

def kernel_perceptron(X, y, X_te, y_te, T=10, bandwidth=1.0):
    n = X.shape[0]
    alpha = np.zeros(n)
    K_train = rbf_kernel_matrix(X, X, bandwidth)
    K_test  = rbf_kernel_matrix(X_te, X, bandwidth)
    tr_hist, te_hist = [], []

    for _ in range(T):
        for i in range(n):
            score = np.sum(alpha * y * K_train[i])
            if y[i] * score <= 0:
                alpha[i] += 1
        scores_tr = (alpha * y) @ K_train.T
        scores_te = (alpha * y) @ K_test.T
        tr_hist.append(np.mean(np.sign(scores_tr) != y))
        te_hist.append(np.mean(np.sign(scores_te) != y_te))

    return alpha, tr_hist, te_hist

alpha_kp, tr_kp, te_kp = kernel_perceptron(
    X_train, y_train, X_test, y_test, T=10, bandwidth=1.0)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(tr_lin, label='Linear Perceptron — Train', marker='o', linestyle='--', alpha=0.7)
ax.plot(te_lin, label='Linear Perceptron — Test',  marker='s', linestyle='--', alpha=0.7)
ax.plot(tr_kp,  label='Kernel Perceptron — Train', marker='o', lw=2)
ax.plot(te_kp,  label='Kernel Perceptron — Test',  marker='s', lw=2)
ax.set_xlabel('Epoch'); ax.set_ylabel('Error rate')
ax.set_title('Linear vs Kernel Perceptron — Error per Epoch', fontsize=12, fontweight='bold')
ax.legend(fontsize=9); ax.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout(); plt.show()

print(f'Kernel Perceptron  — train err: {tr_kp[-1]:.2%}   test err: {te_kp[-1]:.2%}')
print(f'Support vectors (alpha > 0): {(alpha_kp > 0).sum()} / {len(alpha_kp)}')
Kernel Perceptron  — train err: 0.00%   test err: 0.73%
Support vectors (alpha > 0): 25 / 1097

Observation — Kernel perceptron

The kernel perceptron achieves 0% training error and 0.73% test error with bandwidth=1.0 — roughly halving the linear perceptron's 1.45% test error. More striking is how sparse the solution is: only 25 out of 1097 training examples have non-zero alpha. These are the support vectors — the examples that were misclassified at least once during training. The other 1072 examples contribute nothing to the final boundary.

This sparsity is the kernel perceptron's practical advantage. Prediction requires computing kernel similarities to only 25 points, not all 1097.
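The pruning step is straightforward — a self-contained sketch on synthetic blob data (not the banknote set), reusing the same kernel and update rule as above, showing that dropping zero-α examples leaves the scores unchanged:

```python
import numpy as np

def rbf_kernel_matrix(X, Y, bandwidth=1.0):
    diff = X[:, np.newaxis, :] - Y[np.newaxis, :, :]
    return np.exp(-0.5 / bandwidth**2 * np.sum(diff**2, axis=2))

rng = np.random.default_rng(0)
# Synthetic stand-in data: two Gaussian blobs, labels -1 and +1
X = np.vstack([rng.normal(-2, 1, (50, 2)), rng.normal(2, 1, (50, 2))])
y = np.concatenate([-np.ones(50), np.ones(50)])

# Train a kernel perceptron exactly as in cell [5]
K = rbf_kernel_matrix(X, X)
alpha = np.zeros(len(X))
for _ in range(10):
    for i in range(len(X)):
        if y[i] * np.sum(alpha * y * K[i]) <= 0:
            alpha[i] += 1

# Prune to the support set: zero-alpha examples contribute nothing to the score
sv = alpha > 0
X_new = rng.normal(0, 2, (5, 2))
scores_full   = (alpha * y) @ rbf_kernel_matrix(X_new, X).T
scores_pruned = (alpha[sv] * y[sv]) @ rbf_kernel_matrix(X_new, X[sv]).T
print(f'{sv.sum()} support vectors of {len(X)}')
print(np.allclose(scores_full, scores_pruned))  # True — same predictions, fewer kernel evaluations
```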

4.4 Decision Boundary (PCA 2D)¶

In [6]:
K_vis_train = rbf_kernel_matrix(X_train, X_train, bandwidth=1.0)

x_min, x_max = X_vis[:,0].min()-0.5, X_vis[:,0].max()+0.5
y_min, y_max = X_vis[:,1].min()-0.5, X_vis[:,1].max()+0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 150),
                      np.linspace(y_min, y_max, 150))
grid_pca = np.c_[xx.ravel(), yy.ravel()]
# pca was fitted on standardised X_train; inverse_transform returns to that same space
grid_full = pca.inverse_transform(grid_pca)

K_grid = rbf_kernel_matrix(grid_full, X_train, bandwidth=1.0)
scores  = (alpha_kp * y_train) @ K_grid.T
zz = scores.reshape(xx.shape)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))

for ax, title in [
    (axes[0], 'Linear Perceptron — Decision Boundary (PCA 2D)'),
    (axes[1], 'Kernel Perceptron — Decision Boundary (PCA 2D)')]:

    for lbl in [1, -1]:
        m = y_train == lbl
        ax.scatter(X_vis[m,0], X_vis[m,1], c=colors[lbl], label=labels[lbl],
                   edgecolors='k', s=25, alpha=0.6, zorder=3)
    ax.set_title(title, fontsize=10, fontweight='bold')
    ax.set_xlabel('PCA 1'); ax.set_ylabel('PCA 2')
    ax.legend(fontsize=8); ax.grid(True, linestyle='--', alpha=0.3)

# linear boundary, mapped into PCA coordinates (accounting for PCA centering)
x_line = np.linspace(x_min, x_max, 200)
theta_2d = pca.components_ @ theta_lin
offset = theta_lin @ pca.mean_ + theta0_lin
if abs(theta_2d[1]) > 1e-10:
    axes[0].plot(x_line, -(theta_2d[0]*x_line + offset)/theta_2d[1], 'k-', lw=2)

# kernel boundary
axes[1].contourf(xx, yy, zz, levels=[-1e9, 0, 1e9],
                 colors=['tomato','steelblue'], alpha=0.15)
axes[1].contour(xx, yy, zz, levels=[0], colors='black', linewidths=2)
sv_mask = alpha_kp > 0
axes[1].scatter(X_vis[sv_mask,0], X_vis[sv_mask,1],
                s=80, facecolors='none', edgecolors='black', lw=1.5,
                label=f'Support vectors ({sv_mask.sum()})', zorder=4)
axes[1].legend(fontsize=8)

plt.tight_layout(); plt.show()

Observation — Decision boundaries

The linear boundary in PCA space is a straight line that splits the two classes reasonably well but misses some points in the overlap region. The kernel boundary is curved and wraps more tightly around the genuine class cluster, capturing the non-linear shape of the true decision region. The circled points are the 25 support vectors — the only training examples that actually define where the boundary sits.

4.5 Effect of Bandwidth¶

In [7]:
bandwidths = [0.1, 0.5, 1.0, 5.0]
bw_results = []
for bw in bandwidths:
    _, tr_h, te_h = kernel_perceptron(X_train, y_train, X_test, y_test, T=10, bandwidth=bw)
    bw_results.append((bw, tr_h[-1], te_h[-1]))
    print(f'bandwidth={bw:<4}  train={tr_h[-1]:.2%}  test={te_h[-1]:.2%}')
bandwidth=0.1   train=0.00%  test=0.36%
bandwidth=0.5   train=0.00%  test=0.00%
bandwidth=1.0   train=0.00%  test=0.73%
bandwidth=5.0   train=0.00%  test=0.00%

Observation — Effect of bandwidth

The bandwidth sweep shows that 0.5 achieves 0% test error on this split. At 0.1 the kernel is very localised — each training point influences only a tiny region — yet test error is still just 0.36%. At 1.0 the test error rises to 0.73%, while at 5.0, where the kernel is much broader, it drops back to 0%.

This is the classic bias-variance tradeoff in kernel methods, although on this dataset the sweep is not cleanly monotonic: the data is easy enough that several bandwidths generalise perfectly. A smaller bandwidth creates a more complex boundary that fits the training data more tightly — sometimes too tightly (overfitting). The fact that train error stays at 0% across all bandwidths confirms the data is kernel-separable at every scale tried; the only question is which bandwidth generalises best.


5. Results¶

Method                       Train error   Test error
Linear Perceptron                  0.64%        1.45%
Kernel Perceptron (bw=1.0)         0.00%        0.73%
Kernel Perceptron (bw=0.5)         0.00%        0.00%

What the numbers tell us:

  • The kernel perceptron at bandwidth=0.5 achieves perfect classification on the test set — a result the linear perceptron cannot match because the data is not perfectly linearly separable
  • At bandwidth=1.0, the kernel perceptron still roughly halves the linear perceptron's test error (1.45% → 0.73%), showing the benefit of the kernel even without tuning
  • Only 25 of 1097 training examples become support vectors — the boundary is defined entirely by these examples; all the others are irrelevant once training is done
  • The bandwidth acts like a regularisation parameter: too small localises the boundary too tightly, too large pushes it toward linear — though on this easy dataset even the extremes (0.1 at 0.36%, 5.0 at 0%) generalise well

6. Limitations¶

Kernel Perceptron

  • Quadratic memory: storing the kernel matrix requires $O(n^2)$ space — 10,000 examples needs 100M entries
  • No convergence guarantee on non-separable data: like the linear perceptron, it may cycle forever if no separating boundary exists in kernel space
  • Bandwidth is a hyperparameter: the RBF bandwidth controls how localised the similarity is and must be tuned — too small and the model memorises the training set, too large and it becomes nearly linear
  • Prediction cost: each new prediction requires computing $n$ kernel evaluations against all training points, which is $O(nd)$ per example — slow at inference time compared to a linear model
  • No direct weight interpretation: unlike the linear perceptron, there is no weight vector to inspect — the model is a black box expressed as weighted similarities to training examples
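To put the quadratic memory cost in concrete numbers, a back-of-envelope sketch for a dense float64 kernel matrix:

```python
# n x n kernel matrix, 8 bytes per float64 entry
for n in [1_372, 10_000, 100_000]:
    gib = n * n * 8 / 2**30
    print(f'n={n:>7,}: {n*n:>15,} entries  ~{gib:.3f} GiB')
```

The full 1372-example banknote set needs roughly 15 MB — comfortable — while 100,000 examples would need around 75 GiB for the kernel matrix alone.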