Skip to main content
This notebook implements value iteration for the classic 4×3 gridworld example in Artificial Intelligence: A Modern Approach, Figure 17.2. It mirrors the policy iteration page on the same MDP so you can compare the two algorithms side by side.
  • Terminal states: +1 at (3,0), -1 at (3,1)
  • Wall: (1,1)
  • Step cost: -0.04
  • Discount factor γ = 1.0
Value iteration folds the Bellman optimality backup — a max over actions — into a single update: $v_{k+1}(s) = \max_a \sum_{s'} P(s' \mid s, a)\,\bigl(r(s, a, s') + \gamma\, v_k(s')\bigr)$. After convergence the optimal policy is recovered greedily as $\pi^*(s) = \arg\max_a \sum_{s'} P(s' \mid s, a)\,\bigl(r + \gamma\, v^*(s')\bigr)$.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Grid layout: 4 columns x 3 rows, states numbered row-major, so state s
# sits at column s % 4, row s // 4 (x grows east, y grows south).
state_coords = {s: (s % 4, s // 4) for s in range(12)}

# Absorbing terminal rewards: +1 at (3, 0), -1 at (3, 1).
terminal_states = {3: 1.0, 7: -1.0}
wall = {5}                      # the single blocked cell at (1, 1)
states = list(range(12))
actions = [0, 1, 2, 3]          # 0=east, 1=north, 2=south, 3=west
step_cost = -0.04               # living reward for every non-terminal transition
gamma = 1.0                     # undiscounted, as in AIMA Fig. 17.2
theta = 1e-4                    # convergence tolerance for value iteration

# Grid displacement for each action, keyed by the action codes above.
action_delta = {0: (1, 0), 1: (0, -1), 2: (0, 1), 3: (-1, 0)}
state_pos = dict(state_coords)  # state -> (x, y)
pos_state = {xy: s for s, xy in state_pos.items()}  # (x, y) -> state


def get_transitions(s, a):
    """Return [(prob, next_state)] pairs for taking action ``a`` in state ``s``.

    Stochastic AIMA gridworld dynamics: the agent moves in the intended
    direction with probability 0.8 and slips to each of the two
    PERPENDICULAR directions with probability 0.1.  A move that would
    leave the grid or enter the wall leaves the agent in place.
    Terminal and wall states are absorbing (self-loop with prob 1).
    """
    if s in terminal_states or s in wall:
        return [(1.0, s)]
    # BUG FIX: under the ordering 0=east, 1=north, 2=south, 3=west the old
    # (a+1)%4 / (a+3)%4 trick (valid only for a cyclic N-E-S-W ordering)
    # paired each action with its OPPOSITE direction instead of a
    # perpendicular one.  Use an explicit perpendicular lookup instead.
    perpendicular = {0: (1, 2), 1: (0, 3), 2: (0, 3), 3: (1, 2)}
    side_a, side_b = perpendicular[a]
    x, y = state_pos[s]
    results = []
    for prob, direction in zip((0.8, 0.1, 0.1), (a, side_a, side_b)):
        dx, dy = action_delta[direction]
        ns = pos_state.get((x + dx, y + dy), s)  # off-grid -> stay put
        if ns in wall:
            ns = s
        results.append((prob, ns))
    return results
# Value iteration with in-place (Gauss-Seidel) sweeps: repeatedly apply the
# Bellman optimality backup to every non-terminal, non-wall state until the
# largest change in a full sweep drops below theta.  Terminal and wall
# states keep value 0; terminal rewards are paid on entry via the reward
# term terminal_states.get(s_next, step_cost).
v = [0.0] * len(states)

converged = False
while not converged:
    biggest_change = 0.0
    for s in states:
        if s in wall or s in terminal_states:
            continue
        best = max(
            sum(
                prob * (terminal_states.get(s_next, step_cost) + gamma * v[s_next])
                for prob, s_next in get_transitions(s, a)
            )
            for a in actions
        )
        biggest_change = max(biggest_change, abs(best - v[s]))
        v[s] = best
    converged = biggest_change < theta

# Greedy policy extraction: for each state pick the action with the highest
# one-step lookahead value under the converged v.  Ties resolve to the
# lowest-numbered action (max returns the first maximal element, matching
# np.argmax's first-occurrence behavior).
def _q_value(s, a):
    """One-step lookahead value of action ``a`` in state ``s`` under v."""
    return sum(
        prob * (terminal_states.get(s_next, step_cost) + gamma * v[s_next])
        for prob, s_next in get_transitions(s, a)
    )


policy = [None] * len(states)
for s in states:
    if s not in terminal_states and s not in wall:
        policy[s] = max(actions, key=lambda a: _q_value(s, a))
# Render the gridworld: one rectangle per cell (gray wall, green/red
# terminals), a blue arrow per non-terminal state showing the greedy
# action, the state value in the cell corner, and the +1 / -1 labels on
# the terminal cells.
arrow_map = {0: (0.4, 0), 1: (0, -0.4), 2: (0, 0.4), 3: (-0.4, 0)}

fig, ax = plt.subplots(figsize=(6, 5))
for s, (x, y) in state_coords.items():
    if s in wall:
        face = "gray"
    elif s in terminal_states:
        face = "lightgreen" if terminal_states[s] > 0 else "salmon"
    else:
        face = "white"
    ax.add_patch(patches.Rectangle((x, y), 1, 1, edgecolor="black", facecolor=face))
    if s in terminal_states or s in wall:
        continue
    dx, dy = arrow_map[policy[s]]
    ax.arrow(
        x + 0.5, y + 0.5, dx, dy,
        head_width=0.15, head_length=0.1, fc="blue", ec="blue",
    )
    ax.text(x + 0.05, y + 0.05, f"{v[s]:.2f}", fontsize=8, color="black")

for s, reward in terminal_states.items():
    tx, ty = state_coords[s]
    ax.text(tx + 0.35, ty + 0.35, f"{reward:+.0f}", fontsize=14, color="black")

ax.set_xlim(0, 4)
ax.set_ylim(0, 3)
ax.set_xticks([])
ax.set_yticks([])
ax.set_aspect("equal")
plt.title("Value Iteration: Step Cost = -0.04, γ = 1.0")
plt.gca().invert_yaxis()  # put row 0 at the top, matching the AIMA figure
plt.show()
Value iteration optimal policy on the 4×3 gridworld. Optimal policy and converged value function from value iteration. Compare with the policy iteration page — both algorithms reach the same $\pi^*$ and the same $v^*$ on this MDP.