Q-Learning (Reinforcement Learning)¶


Structure

  1. Introduction
  2. The Math
  3. Problem Class
  4. Implementation — 5×5 Grid World
  5. Results
  6. Limitations

1. Introduction¶

Reinforcement Learning (RL) is a fundamentally different learning paradigm: instead of learning from labelled examples, an agent learns by interacting with an environment, receiving rewards for actions, and trying to maximise total reward over time.

Intuition: imagine learning to navigate a maze. You start somewhere, take a step, and the environment tells you whether that was good (you moved closer to the exit) or bad (you hit a wall). You try again, remember what worked, and gradually learn the best path — without anyone ever showing you the map.

Key concepts:

  • State $s$: where the agent is (e.g. position in the maze)
  • Action $a$: what the agent can do (e.g. move up/down/left/right)
  • Reward $R(s, a, s')$: signal received after taking action $a$ in state $s$ and ending in $s'$
  • Policy $\pi(s)$: the strategy — which action to take in each state
  • Q-value $Q(s, a)$: the expected total discounted reward from taking action $a$ in state $s$, then acting optimally

Q-Learning learns the Q-values directly from experience by sampling transitions and updating estimates. No model of the environment is needed — the agent just tries things and updates.

2. The Math¶

Markov Decision Process (MDP)¶

Defined by $\langle S, A, T, R \rangle$ where:

  • $S$ — finite set of states; $A$ — finite set of actions
  • $T(s, a, s') = P(s' | s, a)$ — transition probabilities
  • $R(s, a, s')$ — reward function

Discounted utility¶

$$U[s_0, s_1, \ldots] = \sum_{t=0}^{\infty} \gamma^t R(s_t) \quad 0 \leq \gamma < 1$$

$\gamma$ discounts future rewards — a reward now is worth more than the same reward later.
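A quick numeric check of the discounted utility (a standalone sketch, not one of the notebook's cells): with $\gamma = 0.9$, four unit rewards sum to 3.439, while the same single reward delayed three steps is worth only 0.729.

```python
def discounted_utility(rewards, gamma):
    """U = sum over t of gamma^t * r_t for a finite reward sequence."""
    return sum(gamma**t * r for t, r in enumerate(rewards))

print(round(discounted_utility([1, 1, 1, 1], gamma=0.9), 3))  # 1 + 0.9 + 0.81 + 0.729 = 3.439
print(round(discounted_utility([0, 0, 0, 1], gamma=0.9), 3))  # the same reward, three steps later: 0.729
```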

Bellman equations¶

$$V^*(s) = \max_a \sum_{s'} T(s,a,s')\left[R(s,a,s') + \gamma V^*(s')\right]$$

$$Q^*(s,a) = \sum_{s'} T(s,a,s')\left[R(s,a,s') + \gamma \max_{a'} Q^*(s',a')\right]$$
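When $T$ and $R$ are known, the optimality equation can be solved by iterating it as a fixed-point update (Q-value iteration). A standalone sketch on a hypothetical deterministic 3-state chain, where the values are easy to verify by hand: $Q^*(s_1) = 1$ and $Q^*(s_0) = \gamma \cdot 1 = 0.9$.

```python
import numpy as np

# Hypothetical chain: s0 --right--> s1 --right--> s2 (terminal, +1 on entry).
# One action, gamma = 0.9; by hand Q*(s1) = 1.0 and Q*(s0) = 0.9.
gamma = 0.9
Q = np.zeros((3, 1))                 # 3 states, 1 action ('right')
for _ in range(50):                  # apply the Bellman optimality operator
    Q_new = Q.copy()
    Q_new[0, 0] = 0.0 + gamma * Q[1].max()   # deterministic T, R(s0, right, s1) = 0
    Q_new[1, 0] = 1.0 + gamma * 0.0          # s2 is terminal: no bootstrap term
    Q = Q_new
print(Q[:, 0])                       # converges to Q* = [0.9, 1.0, 0.0]
```

The sampled update in the next subsection performs the same contraction without needing $T$ explicitly.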

Q-value iteration by sampling¶

After observing transition $(s, a, r, s')$:

$$Q(s,a) \leftarrow Q(s,a) + \eta\left[r + \gamma \max_{a'} Q(s',a') - Q(s,a)\right]$$

The term in brackets is the TD error — the difference between the current estimate and a better estimate from the observed transition.
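A single worked update with illustrative numbers ($\eta = 0.5$, $\gamma = 0.9$, not the notebook's settings): starting from $Q(s,a) = 0$ and observing $r = 1$ into a state whose best Q-value is 0.5, the estimate moves halfway toward the TD target.

```python
eta, gamma = 0.5, 0.9                   # illustrative hyperparameters
q_sa, r, max_q_next = 0.0, 1.0, 0.5     # current estimate and observed transition
td_target = r + gamma * max_q_next      # 1 + 0.9 * 0.5 = 1.45
td_error = td_target - q_sa             # 1.45 - 0 = 1.45
q_sa += eta * td_error                  # 0 + 0.5 * 1.45 = 0.725
print(td_error, q_sa)
```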

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

fig, axes = plt.subplots(1, 2, figsize=(13, 4))

# ── Diagram 1: Agent-environment loop ─────────────────────────────────────────
ax = axes[0]
ax.axis('off')
ax.text(0.5, 0.95, 'Reinforcement Learning Loop', ha='center', va='top',
        fontsize=13, fontweight='bold', transform=ax.transAxes)
ax.add_patch(mpatches.FancyBboxPatch((0.05,0.55),0.3,0.2, boxstyle='round',
             facecolor='steelblue', alpha=0.5, transform=ax.transAxes))
ax.text(0.20, 0.65, 'Agent', ha='center', va='center', fontsize=12, transform=ax.transAxes)
ax.add_patch(mpatches.FancyBboxPatch((0.65,0.55),0.3,0.2, boxstyle='round',
             facecolor='tomato', alpha=0.5, transform=ax.transAxes))
ax.text(0.80, 0.65, 'Environment', ha='center', va='center', fontsize=12, transform=ax.transAxes)
ax.annotate('action a', xy=(0.65,0.70), xytext=(0.35,0.70),
            ha='left', va='center', fontsize=10, transform=ax.transAxes,
            arrowprops=dict(arrowstyle='->', lw=1.5))
ax.annotate('state s, reward r', xy=(0.35,0.60), xytext=(0.65,0.60),
            ha='right', va='center', fontsize=10, transform=ax.transAxes,
            arrowprops=dict(arrowstyle='->', lw=1.5))
ax.text(0.5, 0.40, 'Goal: learn policy π*(s) = argmax_a Q*(s,a)',
        ha='center', va='top', fontsize=11, transform=ax.transAxes)
ax.text(0.5, 0.25, 'Q(s,a) <- Q(s,a) + eta[r + gamma * max_a Q(s_next,a) - Q(s,a)]',
        ha='center', va='top', fontsize=10, style='italic', transform=ax.transAxes,
        bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))

# ── Diagram 2: Discounting ────────────────────────────────────────────────────
ax = axes[1]
t = np.arange(0, 10)
for gamma, ls in [(0.99,'k-'),(0.9,'b--'),(0.5,'r:')]:
    ax.plot(t, gamma**t, ls, lw=2, label=f'γ={gamma}')
ax.set_xlabel('Time step t'); ax.set_ylabel('Discount factor γᵗ')
ax.set_title('Effect of discount factor on future reward value', fontsize=11, fontweight='bold')
ax.legend(fontsize=10); ax.grid(True, linestyle='--', alpha=0.4)

plt.suptitle('Figure 1 — Q-Learning Concepts', fontsize=13, y=1.02)
plt.tight_layout(); plt.show()

3. Problem Class¶

Q-Learning is suited for:

  • Sequential decision problems where delayed rewards matter
  • Environments where the transition model is unknown — the agent learns from experience
  • Problems with discrete, finite state and action spaces (tabular Q-learning)
  • Games, robotics, resource scheduling, any Markov decision process

Not well-suited for:

  • Continuous or very large state spaces — the Q-table has $|S| \times |A|$ entries; use Deep Q-Networks (DQN) instead
  • Problems requiring immediate, real-time decisions during learning — exploration can be costly or dangerous
  • Non-Markovian environments where the next state depends on history, not just the current state

4. Implementation¶

Environment: 5×5 Grid World¶

A 5×5 grid where:

  • Start: top-left (0,0)
  • Goal: bottom-right (4,4) — reward +10
  • Holes: 3 cells that end the episode — reward −5
  • Step penalty: −0.1 per move (encourages efficiency)
  • Actions: up, down, left, right
In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.colors import Normalize

# ── Grid World Environment ────────────────────────────────────────────────────
GRID = 5
START = (0, 0)
GOAL  = (4, 4)
HOLES = {(1, 3), (2, 1), (3, 3)}
ACTIONS = {0: (-1,0), 1: (1,0), 2: (0,-1), 3: (0,1)}  # up, down, left, right
ACTION_NAMES = ['Up','Down','Left','Right']
ACTION_ARROWS = ['^','v','<','>']

def step(state, action):
    r, c = state
    dr, dc = ACTIONS[action]
    nr, nc = max(0,min(GRID-1,r+dr)), max(0,min(GRID-1,c+dc))
    next_state = (nr, nc)
    if next_state == GOAL:
        return next_state, 10.0, True
    if next_state in HOLES:
        return next_state, -5.0, True
    return next_state, -0.1, False

def draw_grid(ax, Q=None, title='', path=None):
    ax.set_xlim(-0.5, GRID-0.5); ax.set_ylim(-0.5, GRID-0.5)
    ax.set_xticks([]); ax.set_yticks([])
    for r in range(GRID):
        for c in range(GRID):
            state = (r, c)
            color = 'white'
            if state == GOAL:  color = 'lightgreen'
            elif state in HOLES: color = 'salmon'
            elif state == START: color = 'lightyellow'
            rect = plt.Rectangle((c-0.5, GRID-1-r-0.5), 1, 1,
                                  facecolor=color, edgecolor='black', lw=1.5)
            ax.add_patch(rect)
            label = 'G' if state==GOAL else ('H' if state in HOLES else 'S' if state==START else '')
            if label: ax.text(c, GRID-1-r, label, ha='center', va='center', fontsize=14, fontweight='bold')
            if Q is not None and state not in HOLES and state != GOAL:
                best_a = np.argmax(Q[r,c])
                ax.text(c, GRID-1-r, ACTION_ARROWS[best_a], ha='center', va='center',
                        fontsize=16, color='navy', fontweight='bold')
    if path:
        for (r1,c1),(r2,c2) in zip(path[:-1], path[1:]):
            ax.annotate('', xy=(c2, GRID-1-r2), xytext=(c1, GRID-1-r1),
                        arrowprops=dict(arrowstyle='->', color='blue', lw=2))
    ax.set_title(title, fontsize=10, fontweight='bold')

fig, ax = plt.subplots(figsize=(5, 5))
draw_grid(ax, title='Grid World Environment (S=Start, G=Goal, H=Hole)')
plt.tight_layout(); plt.show()
print('Environment set up. S=Start, G=Goal, H=Hole (terminal, penalty)')
Environment set up. S=Start, G=Goal, H=Hole (terminal, penalty)

Observation

5×5 grid world with three holes at (1,3), (2,1), (3,3). Rewards: +10 for reaching the goal, −5 for falling in a hole, −0.1 per step to discourage wandering. The step penalty is small enough that the agent has room to explore but large enough to encourage efficient paths. The three holes create a maze-like constraint: the most direct staircase routes toward the corner are blocked, so the agent must route around them.
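Two environment behaviours are worth sanity-checking before training: moves off the grid clip to the boundary (still costing the step penalty), and goal or hole transitions terminate the episode. A standalone re-statement of the `step` function from the cell above:

```python
GRID, GOAL = 5, (4, 4)
HOLES = {(1, 3), (2, 1), (3, 3)}
ACTIONS = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}  # up, down, left, right

def step(state, action):
    """Mirror of the environment's step: clip to the grid, return reward and terminal flag."""
    r, c = state
    dr, dc = ACTIONS[action]
    next_state = (max(0, min(GRID - 1, r + dr)), max(0, min(GRID - 1, c + dc)))
    if next_state == GOAL:
        return next_state, 10.0, True
    if next_state in HOLES:
        return next_state, -5.0, True
    return next_state, -0.1, False

print(step((0, 0), 0))  # up at the top edge clips in place: ((0, 0), -0.1, False)
print(step((4, 3), 3))  # right into the goal: ((4, 4), 10.0, True)
print(step((1, 2), 3))  # right into the hole at (1, 3): ((1, 3), -5.0, True)
```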

4.1 Q-Learning¶

In [3]:
def q_learning(episodes=2000, gamma=0.9, eta=0.1, epsilon=0.2, seed=0):
    rng_ = np.random.default_rng(seed)
    Q = np.zeros((GRID, GRID, 4))
    rewards_hist = []
    steps_hist = []

    for ep in range(episodes):
        state = START
        total_reward = 0
        n_steps = 0
        for _ in range(200):
            r, c = state
            if rng_.random() < epsilon:
                action = rng_.integers(4)
            else:
                action = np.argmax(Q[r, c])
            next_state, reward, done = step(state, action)
            nr, nc = next_state
            td_target = reward if done else reward + gamma * np.max(Q[nr, nc])
            td_error = td_target - Q[r, c, action]
            Q[r, c, action] += eta * td_error
            total_reward += reward
            n_steps += 1
            state = next_state
            if done: break
        rewards_hist.append(total_reward)
        steps_hist.append(n_steps)

    return Q, rewards_hist, steps_hist

Q, rewards_hist, steps_hist = q_learning(episodes=3000, gamma=0.9, eta=0.1, epsilon=0.2)

fig, axes = plt.subplots(1, 2, figsize=(13, 4))
window = 100
smoothed_r = np.convolve(rewards_hist, np.ones(window)/window, mode='valid')
smoothed_s = np.convolve(steps_hist,   np.ones(window)/window, mode='valid')

axes[0].plot(smoothed_r, 'k-', lw=1.5)
axes[0].set_xlabel('Episode'); axes[0].set_ylabel(f'Reward ({window}-ep avg)')
axes[0].set_title('Q-Learning — Average Reward per Episode', fontsize=11, fontweight='bold')
axes[0].grid(True, linestyle='--', alpha=0.4)

axes[1].plot(smoothed_s, 'steelblue', lw=1.5)
axes[1].set_xlabel('Episode'); axes[1].set_ylabel(f'Steps ({window}-ep avg)')
axes[1].set_title('Q-Learning — Steps to Terminate per Episode', fontsize=11, fontweight='bold')
axes[1].grid(True, linestyle='--', alpha=0.4)

plt.tight_layout(); plt.show()
print(f'Final avg reward (last 100 ep): {np.mean(rewards_hist[-100:]):.3f}')
print(f'Final avg steps  (last 100 ep): {np.mean(steps_hist[-100:]):.1f}')
Final avg reward (last 100 ep): 4.424
Final avg steps  (last 100 ep): 8.8

Observation — Q-Learning training

After 3000 episodes with γ=0.9, the agent achieves an average reward of 4.424 and reaches termination in 8.8 steps on average over the final 100 episodes. The learning curve should show negative or zero rewards early (random exploration hitting holes), followed by a rising trend as Q-values converge and the agent starts consistently reaching the goal.
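The runs above keep ε fixed at 0.2, so roughly one move in five stays random even after the policy is learned. A common variant (not used in this notebook) decays ε across episodes so that early episodes explore widely and late episodes mostly exploit. One hypothetical exponential schedule:

```python
def epsilon_schedule(episode, eps_start=1.0, eps_end=0.05, decay=0.995):
    """Exponential decay toward a floor; all parameters are illustrative."""
    return max(eps_end, eps_start * decay ** episode)

for ep in (0, 100, 500, 1000, 3000):
    print(ep, round(epsilon_schedule(ep), 3))
```

Plugging this into `q_learning` would mean computing `epsilon = epsilon_schedule(ep)` at the top of each episode instead of using the fixed argument.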

4.2 Learned Policy & Q-values¶

In [4]:
def extract_path(Q, max_steps=20):
    state = START
    path = [state]
    for _ in range(max_steps):
        r, c = state
        action = np.argmax(Q[r, c])
        next_state, _, done = step(state, action)
        path.append(next_state)
        state = next_state
        if done: break
    return path

best_path = extract_path(Q)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))
draw_grid(axes[0], Q=Q, title='Learned Policy (arrows = best action)')
draw_grid(axes[1], Q=Q, title=f'Optimal Path ({len(best_path)-1} steps)', path=best_path)
plt.tight_layout(); plt.show()

fig, axes = plt.subplots(1, 4, figsize=(16, 3))
for a, (ax, name) in enumerate(zip(axes, ACTION_NAMES)):
    data = Q[:,:,a]
    im = ax.imshow(data, cmap='RdYlGn', vmin=data.min(), vmax=data.max())
    plt.colorbar(im, ax=ax, shrink=0.8)
    ax.set_title(f'Q(s, {name})', fontsize=10, fontweight='bold')
    ax.set_xticks(range(GRID)); ax.set_yticks(range(GRID))
plt.suptitle('Q-values for each action', fontsize=12, y=1.02)
plt.tight_layout(); plt.show()
print(f'Path: {best_path}')
print(f'Goal reached: {best_path[-1] == GOAL}')
Path: [(0, 0), (1, 0), (1, 1), (1, 2), (2, 2), (2, 3), (2, 4), (3, 4), (4, 4)]
Goal reached: True

Observation — Learned policy

The optimal path found is (0,0)→(1,0)→(1,1)→(1,2)→(2,2)→(2,3)→(2,4)→(3,4)→(4,4) — 8 steps, successfully navigating around all three holes. The Q-value heat maps should show high values near the goal and low (negative) values near holes. Policy arrows should point consistently toward the bottom-right goal, routing around the hole at (2,1) by going right before going down.

4.3 Effect of Discount Factor γ¶

In [5]:
gammas = [0.5, 0.7, 0.9, 0.99]
gamma_results = []
for g in gammas:
    Q_g, rh, sh = q_learning(episodes=3000, gamma=g, eta=0.1, epsilon=0.2, seed=0)
    path = extract_path(Q_g)
    reached = path[-1] == GOAL
    avg_r = np.mean(rh[-100:])
    avg_s = np.mean(sh[-100:])
    gamma_results.append((g, avg_r, avg_s, len(path)-1, reached))
    print(f'γ={g}  avg_reward={avg_r:.3f}  avg_steps={avg_s:.1f}  path_len={len(path)-1}  goal={reached}')
γ=0.5  avg_reward=6.205  avg_steps=8.9  path_len=8  goal=True
γ=0.7  avg_reward=6.170  avg_steps=9.3  path_len=8  goal=True
γ=0.9  avg_reward=4.424  avg_steps=8.8  path_len=8  goal=True
γ=0.99  avg_reward=6.448  avg_steps=9.5  path_len=8  goal=True

Observation — Effect of γ

All four γ values recover the same 8-step optimal path (path_len=8, goal=True throughout). Final average rewards differ: γ=0.99 gives 6.448 while γ=0.9 gives 4.424. Because every setting converges to the same policy, these gaps mostly reflect ε-greedy exploration noise in the last 100 episodes rather than differences in policy quality. Higher γ weights future rewards more heavily, which matters when a longer route earns a larger later payoff; on this small grid with one dominant route, all settings agree on the policy. Average steps are similar across settings (8.8–9.5).


5. Results¶

  Metric                                  Value
  Training episodes                       3,000
  Final avg reward (last 100 ep, γ=0.9)   4.424
  Final avg steps to termination          8.8
  Optimal path length                     8 steps
  Goal reached                            Yes

Q-Learning converges to the optimal 8-step path in a 5×5 grid with 3 holes, using only reward signals and no prior knowledge of the environment. All tested γ values (0.5–0.99) find the same path, confirming the policy is robust. The step penalty of −0.1 helps: discounting alone already favours shorter paths to the +10 goal, and the per-step cost sharpens that preference and speeds up convergence.


6. Limitations¶

  • Tabular Q-learning scales poorly: the Q-table has $|S| \times |A|$ entries. For a 100×100 grid with 4 actions that's 40,000 entries — manageable. For Atari games with pixel states, it's intractable. Deep Q-Networks (DQN) replace the table with a neural network
  • Exploration vs exploitation: the $\varepsilon$-greedy strategy is simple but not optimal. More sophisticated strategies (UCB, Thompson sampling) explore more intelligently
  • Slow convergence: Q-learning requires many environment interactions to converge, especially when rewards are sparse or delayed
  • Assumes Markov property: Q-learning requires that the current state contains all relevant information. Partially observable environments (POMDPs) require extensions
  • Hyperparameter sensitivity: learning rate $\eta$, discount $\gamma$, and exploration rate $\varepsilon$ all significantly affect convergence and solution quality
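
To make the first limitation concrete: tabular Q-learning is the special case of linear function approximation with one-hot features, one per $(s, a)$ pair. Swapping in richer features (or a neural network, as in DQN) is what lets the same semi-gradient update scale. A minimal sketch under that framing (hypothetical sizes and names, not the notebook's code):

```python
import numpy as np

n_states, n_actions = 25, 4                   # 5x5 grid, 4 moves
n_features = n_states * n_actions

def phi(s, a):
    """One-hot feature vector for the (state, action) pair."""
    x = np.zeros(n_features)
    x[s * n_actions + a] = 1.0
    return x

w = np.zeros(n_features)                      # Q(s, a) = w . phi(s, a)

def q(s, a):
    return w @ phi(s, a)

def update(s, a, r, s_next, done, eta=0.1, gamma=0.9):
    """Semi-gradient Q-learning step: w += eta * td_error * phi(s, a)."""
    target = r if done else r + gamma * max(q(s_next, b) for b in range(n_actions))
    w[:] = w + eta * (target - q(s, a)) * phi(s, a)

update(s=0, a=3, r=-0.1, s_next=1, done=False)
print(round(q(0, 3), 6))  # with one-hot features this matches the tabular update: -0.01
```

With one-hot `phi`, each update touches a single weight, exactly like one Q-table entry; with generalising features, one update moves the estimates of many similar states at once.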