Linear Regression & Ridge Regression¶
Structure
- Introduction — what the algorithm does and the intuition behind it
- The Math — key equations
- Problem Class — when and where to use it
- Implementation — applied to the Diabetes dataset
- Results — what the numbers tell us
- Limitations — where the algorithm breaks down
1. Introduction¶
Linear regression is the simplest model for predicting a continuous output. Given a set of input features, it fits a straight line (or hyperplane in higher dimensions) through the data that minimises the total prediction error.
Intuition: Imagine plotting patients on a chart where the x-axis is blood pressure and the y-axis is disease severity one year later. Linear regression finds the line through that scatter plot that is as close as possible to all the points at once — then uses it to predict severity for new patients.
The key insight is that "as close as possible" is defined by the squared error between predictions and actual values. Squaring the errors penalises large mistakes more than small ones, and crucially makes the objective smooth enough to solve analytically.
Two ways to find the solution:
- Closed form — compute it in one shot by solving a system of equations. Exact and fast for small datasets, but it requires solving a $d \times d$ matrix equation, which becomes expensive as the number of features $d$ grows
- Stochastic Gradient Descent (SGD) — start with a random guess and nudge it repeatedly in the direction that reduces the error. Slower to converge but scales to millions of features
Ridge Regression adds a regularisation term to the objective that penalises large weights. This prevents the model from fitting noise in the training data and produces a fit that generalises better to unseen examples, at the cost of a small increase in training error.
2. The Math¶
Prediction function¶
Given a feature vector $x \in \mathbb{R}^d$, weights $\theta \in \mathbb{R}^d$, and bias $\theta_0 \in \mathbb{R}$:
$$f(x;\,\theta,\theta_0) = \theta \cdot x + \theta_0$$
Empirical Risk (Mean Squared Error)¶
$$R_n(\theta) = \frac{1}{n}\sum_{t=1}^{n}(y^{(t)} - \theta \cdot x^{(t)})^2$$
Closed Form Solution¶
Setting $\nabla_\theta R_n = 0$ and solving yields:
$$\hat{\theta} = A^{-1}b \quad \text{where} \quad A = \frac{1}{n}\sum_{t=1}^{n} x^{(t)}(x^{(t)})^T, \quad b = \frac{1}{n}\sum_{t=1}^{n} y^{(t)} x^{(t)}$$
This is only possible when $A$ is invertible, and solving the resulting $d \times d$ system costs roughly $O(d^3)$, so it becomes expensive when $d$ is large.
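To make this concrete, here is a minimal sketch (synthetic data and variable names of my own choosing) that builds $A$ and $b$ for a one-feature problem and checks the result against np.polyfit:

```python
import numpy as np

rng = np.random.default_rng(0)
x = np.linspace(0, 10, 50)                     # a single synthetic feature
y = 2.5 * x + 5 + rng.normal(0, 1, 50)         # noisy linear target

# Append a constant column so the bias theta_0 is learned as an extra weight
X = np.column_stack([x, np.ones_like(x)])      # shape (n, 2)

A = X.T @ X / len(X)                           # A = (1/n) * sum of x x^T
b = X.T @ y / len(X)                           # b = (1/n) * sum of y x
theta_hat = np.linalg.solve(A, b)              # solve A theta = b rather than forming A^-1

print('closed form:', np.round(theta_hat, 2))  # ≈ [2.5, 5.0]  (slope, bias)
print('np.polyfit :', np.round(np.polyfit(x, y, 1), 2))
```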
SGD Update¶
Pick a random example $t$, then update:
$$\theta \leftarrow \theta + \eta\,(y^{(t)} - \theta \cdot x^{(t)})\,x^{(t)}$$
where $\eta_k = \frac{1}{1+k}$ is a decaying learning rate. (The per-example loss here is $\tfrac{1}{2}(y^{(t)} - \theta \cdot x^{(t)})^2$; the $\tfrac{1}{2}$ cancels the factor of 2 from differentiation, which is why the update takes this clean form.)
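Continuing the toy example from the previous cell, a minimal sketch of the update loop. The $1/(1+k)$ schedule is scaled down here because the toy feature is not standardised; with the raw schedule the first few steps would overshoot badly.

```python
theta = np.zeros(X.shape[1])                 # weights, bias included as the last entry
rng = np.random.default_rng(1)

for k in range(50_000):                      # 50k single-example updates (~1000 passes)
    eta = 0.01 / (1 + 1e-4 * k)              # decaying learning rate (scaled-down schedule)
    i = rng.integers(len(X))                 # pick a random example
    error = y[i] - theta @ X[i]
    theta += eta * error * X[i]              # theta <- theta + eta (y - theta . x) x

print('SGD        :', np.round(theta, 2))
print('closed form:', np.round(theta_hat, 2))   # the two should end up close
```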
Ridge Regression Objective¶
$$J_{n,\lambda}(\theta,\theta_0) = \frac{1}{n}\sum_{t=1}^{n}\frac{(y^{(t)} - \theta \cdot x^{(t)} - \theta_0)^2}{2} + \frac{\lambda}{2}\|\theta\|^2$$
The SGD update for Ridge becomes:
$$\theta \leftarrow (1 - \eta\lambda)\,\theta + \eta\,(y^{(t)} - \theta \cdot x^{(t)})\,x^{(t)}$$
The $(1 - \eta\lambda)$ term shrinks the weights at every step — this is the regularisation effect.
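A one-step illustration of that shrinkage, again on the toy data above. The values of λ, η, and the starting weights are arbitrary, purely for illustration; note that in practice the bias term is usually excluded from the shrinkage, as in section 4.4 below.

```python
lam, eta = 0.5, 0.05                          # arbitrary values, purely for illustration
theta_cur = np.array([2.0, 4.0])              # some current weights [slope, bias]
xi, yi = X[10], y[10]                         # one training example

plain = theta_cur + eta * (yi - theta_cur @ xi) * xi                     # ordinary SGD step
ridge = (1 - eta * lam) * theta_cur + eta * (yi - theta_cur @ xi) * xi   # ridge SGD step

print('plain update:', np.round(plain, 3))
print('ridge update:', np.round(ridge, 3))    # same step, but the old weights were first shrunk by (1 - eta*lam)
```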
import numpy as np
import matplotlib.pyplot as plt
rng = np.random.default_rng(42)
x_demo = np.linspace(0, 10, 40)
y_demo = 2.5 * x_demo + 5 + rng.normal(0, 4, 40)
fig, axes = plt.subplots(1, 3, figsize=(16, 4))
# ── Diagram 1: What regression fits ──────────────────────────────────────────
ax = axes[0]
theta_demo = np.polyfit(x_demo, y_demo, 1)
y_fit = np.polyval(theta_demo, x_demo)
ax.scatter(x_demo, y_demo, c='steelblue', edgecolors='k', s=40, zorder=3, label='Data points')
ax.plot(x_demo, y_fit, 'k-', lw=2, label=r'$f(x) = \theta \cdot x + \theta_0$')
for xi, yi, yfi in zip(x_demo[::4], y_demo[::4], y_fit[::4]):
    ax.plot([xi, xi], [yi, yfi], 'tomato', lw=1.2, alpha=0.7)
ax.plot([], [], 'tomato', lw=1.5, label='Residuals')
ax.set_title('Fitting a line — minimising residuals', fontsize=11, fontweight='bold')
ax.set_xlabel('x'); ax.set_ylabel('y')
ax.legend(fontsize=9); ax.grid(True, linestyle='--', alpha=0.4)
# ── Diagram 2: MSE loss surface (2D slice) ────────────────────────────────────
ax = axes[1]
thetas = np.linspace(-1, 6, 200)
mse = [(np.mean((y_demo - t * x_demo) ** 2)) for t in thetas]
ax.plot(thetas, mse, 'k-', lw=2)
opt_t = thetas[np.argmin(mse)]
ax.axvline(opt_t, color='tomato', linestyle='--', lw=1.5, label=f'Minimum at θ≈{opt_t:.1f}')
ax.scatter([opt_t], [min(mse)], c='tomato', s=80, zorder=5)
ax.set_title('MSE as a function of theta (bowl-shaped = one global minimum)', fontsize=11, fontweight='bold')
ax.set_xlabel(r'$\theta$'); ax.set_ylabel('MSE')
ax.legend(fontsize=9); ax.grid(True, linestyle='--', alpha=0.4)
# ── Diagram 3: Ridge shrinkage effect ────────────────────────────────────────
ax = axes[2]
lambdas = np.logspace(-3, 2, 100)
ridge_weights = []
for lam in lambdas:
    A = np.dot(x_demo, x_demo) / len(x_demo) + lam
    b = np.dot(y_demo, x_demo) / len(x_demo)
    ridge_weights.append(b / A)
ax.semilogx(lambdas, ridge_weights, 'k-', lw=2)
ax.axhline(theta_demo[0], color='steelblue', linestyle='--', lw=1.5, label='OLS estimate (λ=0)')
ax.axhline(0, color='gray', linestyle=':', lw=1)
ax.set_title('Ridge: weight shrinks as λ increases', fontsize=11, fontweight='bold')
ax.set_xlabel(r'$\lambda$ (regularisation strength, log scale)')
ax.set_ylabel(r'$\hat{\theta}$')
ax.legend(fontsize=9); ax.grid(True, linestyle='--', alpha=0.4)
plt.suptitle('Figure 1 — Linear Regression Concepts', fontsize=13, y=1.02)
plt.tight_layout(); plt.show()
3. Problem Class¶
Linear regression is well-suited for:
- Predicting a continuous output — any real-valued target (price, temperature, disease severity)
- Problems where the relationship between inputs and output is approximately linear
- Interpretable models — each weight $\theta_i$ directly tells you how much a one-unit increase in feature $i$ changes the prediction
- Large datasets where you need a fast baseline before trying complex models
- Ridge regression is preferred when features are correlated or when the number of features is large relative to the number of examples
Typical domains:
- Healthcare: predicting disease progression, dosage effects
- Finance: forecasting prices, estimating risk
- Engineering: modelling physical systems
- Economics: demand forecasting
Not well-suited for:
- Categorical targets — use logistic regression or a classifier instead
- Highly non-linear relationships — the model will underfit regardless of data size
- Outlier-heavy data — squared error penalises large residuals heavily, so outliers pull the line toward them
- Features that interact multiplicatively — the model has no way to represent $x_1 \cdot x_2$ effects without feature engineering
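The last point can be addressed with explicit feature engineering: adding interaction (or polynomial) columns lets an otherwise linear model represent multiplicative effects. A hedged sketch using scikit-learn's PolynomialFeatures, not used elsewhere in this notebook:

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Toy design matrix with two features x1, x2
X_toy = np.array([[1.0, 2.0],
                  [3.0, 4.0]])

# degree=2 with interaction_only=True appends the x1*x2 column;
# the linear model can then weight that product like any other feature
expander = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
print(expander.fit_transform(X_toy))
# [[ 1.  2.  2.]
#  [ 3.  4. 12.]]
```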
4. Implementation¶
Dataset: Diabetes¶
442 patients, 10 baseline features (age, sex, BMI, blood pressure, and six blood serum measurements), all standardised. Target: a quantitative measure of disease progression one year after baseline (continuous, range ~25–346).
Source: sklearn.datasets.load_diabetes · Originally from Efron et al. (2004), Annals of Statistics
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
# ── Load & prepare ────────────────────────────────────────────────────────────
data = load_diabetes()
X_raw, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)
print(f'Features : {X_raw.shape[1]} ({", ".join(data.feature_names)})')
print(f'Train : {X_train.shape[0]} examples')
print(f'Test : {X_test.shape[0]} examples')
print(f'Target : min={y.min():.0f} max={y.max():.0f} mean={y.mean():.1f} std={y.std():.1f}')
# ── Shared metrics ────────────────────────────────────────────────────────────
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def rmse(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))

def r2(y_true, y_pred):
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)
    return 1 - ss_res / ss_tot
Features : 10 (age, sex, bmi, bp, s1, s2, s3, s4, s5, s6)
Train    : 353 examples
Test     : 89 examples
Target   : min=25 max=346 mean=152.1 std=77.0
Observation
The target ranges from 25 to 346 with a mean of 152 and a standard deviation of 77. Against that spread, an RMSE in the 50s is still a substantial error: a prediction off by 50 units is roughly two thirds of a standard deviation from the true value. For context, predicting the mean for every patient (the simplest possible baseline) would give an RMSE equal to the standard deviation: 77. Anything below that is a real improvement.
The features are already on a comparable scale (standardised by sklearn), so no additional preprocessing is needed before fitting.
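A quick sanity check of that baseline claim, reusing the rmse helper and the target array y loaded above:

```python
# Predicting the overall mean for every patient gives an RMSE equal to the target's
# standard deviation, which is the baseline number quoted throughout this notebook.
baseline_rmse = rmse(y, np.full_like(y, y.mean()))
print(f'Mean-prediction baseline RMSE: {baseline_rmse:.1f}')
```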
4.1 Explore the Data¶
fig, axes = plt.subplots(2, 5, figsize=(16, 6))
axes = axes.flatten()
for i, name in enumerate(data.feature_names):
    ax = axes[i]
    ax.scatter(X_raw[:, i], y, alpha=0.4, s=15, c='steelblue', edgecolors='none')
    m, b = np.polyfit(X_raw[:, i], y, 1)
    x_line = np.linspace(X_raw[:, i].min(), X_raw[:, i].max(), 100)
    ax.plot(x_line, m * x_line + b, 'tomato', lw=1.5)
    corr = np.corrcoef(X_raw[:, i], y)[0, 1]
    ax.set_title(f'{name} (r={corr:.2f})', fontsize=9, fontweight='bold')
    ax.set_xlabel(name, fontsize=8); ax.set_ylabel('Progression', fontsize=8)
    ax.tick_params(labelsize=7)
    ax.grid(True, linestyle='--', alpha=0.3)
plt.suptitle('Feature vs Target — Individual Linear Relationships', fontsize=12, y=1.01)
plt.tight_layout(); plt.show()
Observation — Feature correlations
The scatter plots reveal that bmi (r=+0.59) and s5 (r=+0.57) have the strongest individual linear relationships with disease progression — patients with higher BMI and higher s5 (a log-transformed triglyceride measure) tend to have worse outcomes. Blood pressure (bp, r=+0.44) and s4 (r=+0.43) also show clear trends.
sex has almost no linear correlation with the target (r=+0.04), yet in the closed form solution (section 4.2) it carries a weight of -242. That is not a contradiction: sex correlates with other features (like bmi or the blood measurements) that do predict the outcome, so once those are in the model it picks up a large corrective weight rather than an independent effect. It does not mean sex is clinically irrelevant.
The overall correlations are moderate, not strong. A linear model will capture the main trends but leave a lot of variance unexplained — R² around 0.45-0.50 is a realistic ceiling here without feature engineering.
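The correlations among the features themselves are worth a look too, since they drive the multicollinearity discussed in the next section. A quick heatmap, reusing X_raw, data, and the seaborn import from above:

```python
# Pairwise feature-feature correlations; the serum measurements (notably s1 and s2)
# are strongly correlated with each other, which is what destabilises the OLS weights later.
corr_matrix = np.corrcoef(X_raw, rowvar=False)
fig, ax = plt.subplots(figsize=(6.5, 5.5))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            xticklabels=data.feature_names, yticklabels=data.feature_names, ax=ax)
ax.set_title('Feature-feature correlations', fontsize=11, fontweight='bold')
plt.tight_layout(); plt.show()
```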
4.2 Closed Form Solution¶
def add_bias(X):
    return np.hstack([X, np.ones((X.shape[0], 1))])

def closed_form(X, y):
    Xb = add_bias(X)
    return np.linalg.lstsq(Xb, y, rcond=None)[0]
theta_cf = closed_form(X_train, y_train)
theta_cf_weights, theta_cf_bias = theta_cf[:-1], theta_cf[-1]
y_pred_cf_train = add_bias(X_train) @ theta_cf
y_pred_cf_test = add_bias(X_test) @ theta_cf
print(f'Closed Form Solution')
print(f' Train RMSE : {rmse(y_train, y_pred_cf_train):.2f}')
print(f' Test RMSE : {rmse(y_test, y_pred_cf_test):.2f}')
print(f' Test R² : {r2(y_test, y_pred_cf_test):.4f}')
print()
print('Feature weights:')
for name, w in zip(data.feature_names, theta_cf_weights):
    print(f' {name:5s} {w:+.2f}')
Closed Form Solution
  Train RMSE : 53.56
  Test RMSE  : 53.85
  Test R²    : 0.4526

Feature weights:
  age   +37.90
  sex   -241.96
  bmi   +542.43
  bp    +347.70
  s1    -931.49
  s2    +518.06
  s3    +163.42
  s4    +275.32
  s5    +736.20
  s6    +48.67
Observation — Closed form
The closed form finds the exact least-squares solution in one step. Train RMSE (53.56) and test RMSE (53.85) are almost identical — there is essentially no overfitting, which makes sense: with only 10 features and 353 training examples the model is underfitting, not overfitting.
R² of 0.45 means the model explains 45% of the variance in disease progression. That is modest but not surprising — disease outcomes are driven by genetics, lifestyle, and factors not captured in 10 blood measurements.
The feature weights are large in magnitude: s1 = -931, s5 = +736, s2 = +518. These are not independent effects — s1, s2, s3 are correlated blood serum measurements (cholesterol fractions). The model compensates for their correlation by assigning large positive and negative weights that partially cancel. This multicollinearity is the main weakness of OLS on this dataset.
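A quick way to quantify that weakness (a diagnostic of my own, not part of the original analysis) is the condition number of the bias-augmented design matrix: the larger it is, the more the least-squares weights can swing in response to small changes in the data.

```python
# High condition number = near-collinear columns = unstable OLS weight estimates
Xb_train = add_bias(X_train)
print(f'Condition number of the design matrix: {np.linalg.cond(Xb_train):,.0f}')
print(f'corr(s1, s2) on the training set      : {np.corrcoef(X_train[:, 4], X_train[:, 5])[0, 1]:.2f}')
```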
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(y_test, y_pred_cf_test, c='steelblue', edgecolors='k', s=40, alpha=0.7, zorder=3)
lims = [min(y_test.min(), y_pred_cf_test.min()) - 5,
        max(y_test.max(), y_pred_cf_test.max()) + 5]
ax.plot(lims, lims, 'k--', lw=1.5, label='Perfect prediction')
ax.set_xlim(lims); ax.set_ylim(lims)
ax.set_xlabel('Actual progression'); ax.set_ylabel('Predicted progression')
ax.set_title(f'Closed Form — Predicted vs Actual (Test Set)\nRMSE={rmse(y_test, y_pred_cf_test):.2f} R²={r2(y_test, y_pred_cf_test):.3f}',
             fontsize=11, fontweight='bold')
ax.legend(); ax.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout(); plt.show()
Observation — Predicted vs actual
The scatter follows the diagonal reasonably well in the middle range (100–250) but opens up considerably at the extremes. The model systematically over-predicts for patients with low progression (left side) and under-predicts for patients with very high progression (right side). This is a classic sign that the linear model is being pulled toward the mean — it cannot fully capture the patients at the tails.
An RMSE of 53.85 on a target range of 25–346 means typical predictions are off by about 54 units. For comparison, always predicting the mean (the dumb baseline) would give RMSE = 77. The model is genuinely useful but leaves meaningful room for improvement.
4.3 SGD Solution¶
def sgd_regression(X, y, X_te, y_te, T=200, seed=0):
    rng = np.random.default_rng(seed)
    n, d = X.shape
    theta = np.zeros(d + 1)
    tr_hist, te_hist = [], []
    for epoch in range(T):
        eta = 1.0 / (1 + epoch)
        for _ in range(n):
            i = rng.integers(n)
            xi = np.append(X[i], 1.0)
            grad = (theta @ xi - y[i]) * xi
            theta -= eta * grad
        tr_hist.append(rmse(y, add_bias(X) @ theta))
        te_hist.append(rmse(y_te, add_bias(X_te) @ theta))
    return theta, tr_hist, te_hist
theta_sgd, tr_sgd, te_sgd = sgd_regression(X_train, y_train, X_test, y_test, T=200)
y_pred_sgd_test = add_bias(X_test) @ theta_sgd
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(tr_sgd, label='Train RMSE', lw=1.5)
ax.plot(te_sgd, label='Test RMSE', lw=1.5, linestyle='--')
ax.axhline(rmse(y_test, y_pred_cf_test), color='tomato', linestyle=':', lw=1.5,
           label=f'Closed form test RMSE ({rmse(y_test, y_pred_cf_test):.2f})')
ax.set_xlabel('Epoch'); ax.set_ylabel('RMSE')
ax.set_title('SGD Regression — Convergence', fontsize=12, fontweight='bold')
ax.legend(); ax.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout(); plt.show()
print(f'SGD (200 epochs)')
print(f' Train RMSE : {tr_sgd[-1]:.2f}')
print(f' Test RMSE : {te_sgd[-1]:.2f}')
print(f' Test R² : {r2(y_test, y_pred_sgd_test):.4f}')
print(f'Closed Form Test RMSE: {rmse(y_test, y_pred_cf_test):.2f}')
SGD (200 epochs)
  Train RMSE : 53.92
  Test RMSE  : 53.81
  Test R²    : 0.4534
Closed Form Test RMSE: 53.85
Observation — SGD convergence
After 200 epochs, SGD reaches a test RMSE of 53.81 — essentially identical to the closed form (53.85). The convergence curve drops steeply in the first 20 epochs then flattens, with the test and train curves tracking closely throughout. The decaying learning rate (1/(1+epoch)) keeps the solution from oscillating once it gets close to the minimum.
This confirms the two approaches are solving the same problem. The practical difference is computational: the closed form here only has to solve a tiny linear system (10 features plus a bias), but that cost grows roughly cubically with the number of features, so on very high-dimensional data SGD becomes the more tractable option.
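To see how close the two solutions actually are, here is a per-feature comparison of the two θ vectors computed above (the weights on the strongly correlated serum features may agree less tightly than the rest, even when the predictions match):

```python
# Side-by-side comparison of the closed-form and SGD weight vectors (bias last)
print(f'{"feature":8s} {"closed form":>12s} {"SGD":>12s}')
for name, w_cf, w_sgd in zip(list(data.feature_names) + ['bias'], theta_cf, theta_sgd):
    print(f'{name:8s} {w_cf:+12.2f} {w_sgd:+12.2f}')
```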
4.4 Ridge Regression¶
def ridge_closed_form(X, y, lam):
    Xb = add_bias(X)
    n, d = Xb.shape
    reg = lam * np.eye(d)
    reg[-1, -1] = 0  # don't regularise the bias
    # The objective averages over n examples, so the penalty is scaled by n here
    return np.linalg.solve(Xb.T @ Xb + n * reg, Xb.T @ y)
lambdas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
results = []
for lam in lambdas:
    th = ridge_closed_form(X_train, y_train, lam)
    tr_r = rmse(y_train, add_bias(X_train) @ th)
    te_r = rmse(y_test, add_bias(X_test) @ th)
    results.append((lam, tr_r, te_r, np.linalg.norm(th[:-1])))
    print(f'λ={lam:<6} train RMSE={tr_r:.2f} test RMSE={te_r:.2f} ||θ||={np.linalg.norm(th[:-1]):.3f}')
λ=0.001  train RMSE=55.07  test RMSE=53.66  ||θ||=661.252
λ=0.01   train RMSE=65.28  test RMSE=61.09  ||θ||=259.256
λ=0.1    train RMSE=75.62  test RMSE=70.89  ||θ||=41.482
λ=1.0    train RMSE=77.70  test RMSE=72.97  ||θ||=4.439
λ=10.0   train RMSE=77.93  test RMSE=73.20  ||θ||=0.447
λ=100.0  train RMSE=77.95  test RMSE=73.22  ||θ||=0.045
Observation — Ridge regularisation
The best test error on this dataset is at λ=0.001 (RMSE 53.66), the smallest value tested. As λ increases the test error rises steadily: the model is being over-constrained. This is a sign that the dataset is not suffering from overfitting in the first place; with only 10 features and 353 examples, the model is already underfitting, and adding regularisation makes things worse.
The weight norm drops dramatically with λ, from 661 at λ=0.001 to 0.045 at λ=100. But shrinking the weights does not help here, because the problem is not large weights causing overfitting; it is that these 10 features carry a limited, partly non-linear signal that no amount of regularisation can recover.
fig, axes = plt.subplots(1, 2, figsize=(13, 4))
# Error vs lambda
x = np.arange(len(lambdas)); w = 0.35
axes[0].bar(x - w/2, [r[1] for r in results], w, label='Train', color='steelblue', alpha=0.8)
axes[0].bar(x + w/2, [r[2] for r in results], w, label='Test', color='tomato', alpha=0.8)
axes[0].set_xticks(x); axes[0].set_xticklabels([str(l) for l in lambdas])
axes[0].set_xlabel('λ'); axes[0].set_ylabel('RMSE')
axes[0].set_title('RMSE vs λ', fontsize=12, fontweight='bold')
axes[0].legend(); axes[0].grid(True, linestyle='--', alpha=0.4, axis='y')
# Coefficient paths
lambdas_fine = np.logspace(-3, 3, 100)
coef_paths = []
for lam in lambdas_fine:
    th = ridge_closed_form(X_train, y_train, lam)
    coef_paths.append(th[:-1])
coef_paths = np.array(coef_paths)
for i, name in enumerate(data.feature_names):
    axes[1].semilogx(lambdas_fine, coef_paths[:, i], lw=1.5, label=name)
axes[1].axhline(0, color='black', lw=0.8)
axes[1].set_xlabel('λ (log scale)'); axes[1].set_ylabel('Weight value')
axes[1].set_title('Ridge Coefficient Paths', fontsize=12, fontweight='bold')
axes[1].legend(fontsize=7, loc='upper right'); axes[1].grid(True, linestyle='--', alpha=0.4)
plt.tight_layout(); plt.show()
Observation — Ridge coefficient paths
The coefficient paths show all weights shrinking toward zero as λ increases, but not at the same rate. s1 and s2 start at large values with opposite signs and shrink together — this is the signature of multicollinearity. When two features are highly correlated, OLS can assign any combination of large positive/negative weights to them without changing the predictions much. Ridge regularisation resolves this ambiguity by pulling both toward zero.
By λ=1, most weights have shrunk to near zero and the model is essentially predicting the mean for everyone. The coefficient path chart also shows why λ=0.001 comes out best: it is the smallest value tested, sitting just before the regularisation starts compressing the weights that carry genuine predictive signal.
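To confirm the "predicting the mean" reading, here is a quick look at what the heavily regularised model actually outputs, reusing ridge_closed_form and the helpers from above:

```python
# With lambda = 100 the feature weights are ~0, so predictions collapse onto the
# (unregularised) bias term, which sits near the training-set mean.
th_heavy = ridge_closed_form(X_train, y_train, 100.0)
preds_heavy = add_bias(X_test) @ th_heavy
print(f'Predictions at λ=100 : min={preds_heavy.min():.1f}  max={preds_heavy.max():.1f}')
print(f'Training-set mean    : {y_train.mean():.1f}')
```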
5. Results¶
| Method | Train RMSE | Test RMSE | Test R² |
|---|---|---|---|
| Closed Form (OLS) | 53.56 | 53.85 | 0.453 |
| SGD (200 epochs) | 53.92 | 53.81 | 0.453 |
| Ridge (λ=0.001) | 55.07 | 53.66 | ~0.455 |
Baseline (predict mean): RMSE = 77.0
What the numbers tell us:
- All three methods reach essentially the same test RMSE (~53.8) — the dataset has a natural accuracy ceiling that the linear model cannot break without more expressive features
- The gap from the naive baseline (77 → 54) shows the model is genuinely learning: it reduces prediction error by about 30%
- SGD converges to the same solution as the closed form after 200 epochs, validating that both are optimising the same objective
- Ridge with minimal regularisation (λ=0.001) gives a marginal improvement, but higher λ hurts — this dataset is underfitting, not overfitting
- The large feature weights (s1=-931, s5=+736) signal multicollinearity between the blood serum features; they are not independent effects but compensating estimates
6. Limitations¶
Linear Regression (OLS)
- Assumes the relationship between features and target is linear — if the true relationship is curved, the model will systematically under- or over-predict regardless of how much data you have
- Sensitive to outliers — squared error means a single extreme point can pull the line significantly toward it
- Requires the matrix $A = X^TX$ to be invertible — breaks down when features are perfectly correlated (multicollinearity) or when there are more features than examples
- Produces no uncertainty estimates natively — you get a point prediction, not a distribution
SGD
- Convergence depends on the learning rate schedule — too large and it oscillates, too small and it takes forever
- The decaying rate $\eta_k = 1/(1+k)$ helps but is not optimal for all problems; in practice it is treated as a hyperparameter
- With a fixed learning rate it never settles exactly on the closed-form solution, oscillating around the minimum instead; a properly decaying schedule does converge in theory, but only slowly
Ridge Regression
- The regularisation parameter $\lambda$ must be tuned via cross-validation — there is no closed-form way to know the right value in advance (see the sketch after this list)
- Shrinks all weights toward zero equally, regardless of whether a feature is genuinely irrelevant — Lasso (L1 regularisation) is better at producing true zeros and selecting features
- Still assumes a linear relationship — regularisation helps with overfitting but cannot fix fundamental non-linearity
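For the cross-validation point above, scikit-learn's RidgeCV automates the search. A minimal sketch on the same train/test split, not part of the analysis above; note that sklearn calls the penalty alpha and does not divide the squared-error term by n, so its values are not directly comparable to the λ grid in section 4.4.

```python
from sklearn.linear_model import RidgeCV

# Efficient leave-one-out cross-validation over a grid of penalty strengths
ridge_cv = RidgeCV(alphas=np.logspace(-4, 2, 25))
ridge_cv.fit(X_train, y_train)
print(f'Chosen alpha : {ridge_cv.alpha_:.4f}')
print(f'Test R²      : {ridge_cv.score(X_test, y_test):.3f}')
```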