# --- Hyperparameters for the two-state PPO toy problem ---
theta1, theta2 = 0.5, 0.5  # P(a=1) in state 1 / state 2; reward comes from state 2 only
lr = 0.05                  # gradient-ascent step size
eps = 0.2                  # PPO clip half-width epsilon
outer_iters = 50           # rollouts; each one freezes pi_old
inner_epochs = 4           # surrogate updates reusing each rollout

# --- Traces, one entry per inner (mini-epoch) step ---
theta1_hist, theta2_hist = [], []
ratio1_hist, ratio2_hist = [], []
g1_hist, g2_hist = [], []  # sampled clipped-surrogate gradients on theta1 / theta2
reward_hist = []
expected_J_hist = []   # J(theta1, theta2) = theta2 (reward only at s2)
expected_g1_hist = []  # E[dL/dtheta1] = 0 (score-function trick — same as REINFORCE)
expected_g2_hist = []  # E[dL/dtheta2] = 1 when clip is inactive (mirrors REINFORCE)
for _ in range(outer_iters):
    # Snapshot pi_old: the behavior policy for this rollout and all inner updates.
    old1, old2 = theta1, theta2
    # One trajectory under pi_old: a Bernoulli action in each of the two states.
    act1 = 1 if np.random.rand() < old1 else 2
    act2 = 1 if np.random.rand() < old2 else 2
    rew = 1 if act2 == 1 else 0  # reward is earned only by action 1 in state 2
    # Reuse the same trajectory for several surrogate updates (PPO mini-epochs).
    for _ in range(inner_epochs):
        # Importance ratio r_i and score factor (d r_i / d theta_i) per decision.
        if act1 == 1:
            r1 = theta1 / old1
            score1 = 1.0 / old1
        else:
            r1 = (1 - theta1) / (1 - old1)
            score1 = -1.0 / (1 - old1)
        if act2 == 1:
            r2 = theta2 / old2
            score2 = 1.0 / old2
        else:
            r2 = (1 - theta2) / (1 - old2)
            score2 = -1.0 / (1 - old2)
        # Clipped-surrogate gradient: zero once the sampled ratio leaves the
        # (1 - eps, 1 + eps) window.
        # NOTE(review): this zeroes on BOTH sides of the window, mirroring the
        # earlier single-state example; canonical min-form PPO is one-sided for
        # a non-negative advantage — presumably a deliberate simplification.
        g1 = score1 * rew if 1 - eps < r1 < 1 + eps else 0.0
        g2 = score2 * rew if 1 - eps < r2 < 1 + eps else 0.0
        # Ascent step, then keep each parameter a valid (non-degenerate) probability.
        theta1 = np.clip(theta1 + lr * g1, 1e-3, 1 - 1e-3)
        theta2 = np.clip(theta2 + lr * g2, 1e-3, 1 - 1e-3)
        # Record one sample per mini-epoch step.
        theta1_hist.append(theta1)
        theta2_hist.append(theta2)
        ratio1_hist.append(r1)
        ratio2_hist.append(r2)
        g1_hist.append(g1)
        g2_hist.append(g2)
        reward_hist.append(rew)
        # Closed-form references: J = theta2, E[g1] = 0, E[g2] = 1 (clip inactive).
        expected_J_hist.append(old2)
        expected_g1_hist.append(0.0)
        expected_g2_hist.append(1.0)
# 1. Policy parameters: both thetas over every recorded mini-epoch step.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(theta1_hist, label=r"$\theta_1$ (state 1)")
ax.plot(theta2_hist, label=r"$\theta_2$ (state 2)")
ax.set_xlabel("Mini-epoch step")
ax.set_ylabel("Parameter value")
ax.set_title("Two-State PPO: policy parameters across mini-epochs")
ax.legend()
plt.show()
# 2. Sampled ratios vs clip window (one ratio per state).
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(ratio1_hist, color="#cbd5e1", linewidth=1, label=r"sampled $r_1$")
ax.plot(ratio2_hist, color="#fcd34d", linewidth=1, label=r"sampled $r_2$")
# Reference lines: the E[r] = 1 anchor plus the two clip boundaries.
ax.axhline(1, color="#2563eb", linewidth=2, linestyle="--",
           label=r"expected $\mathbb{E}[r_i] = 1$")
ax.axhline(1 + eps, color="#dc2626", linewidth=1, linestyle=":",
           label=fr"clip $1 \pm \epsilon$")
ax.axhline(1 - eps, color="#dc2626", linewidth=1, linestyle=":")
ax.set_xlabel("Mini-epoch step")
ax.set_ylabel(r"$r_i(\theta)$")
ax.set_title("Two-State PPO: sampled importance ratios vs clip window")
ax.legend(loc="upper right", fontsize=9)
plt.show()
# 3. Sampled vs expected gradients (key pedagogical plot): noisy per-sample
#    clipped-surrogate gradients against their closed-form expectations.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(g1_hist, color="#cbd5e1", linewidth=1, label="sampled $g_1$")
ax.plot(expected_g1_hist, color="#2563eb", linewidth=2, linestyle="--",
        label=r"expected $\mathbb{E}[g_1] = 0$")
ax.plot(g2_hist, color="#fcd34d", linewidth=1, label="sampled $g_2$")
ax.plot(expected_g2_hist, color="#dc2626", linewidth=2, linestyle="--",
        label=r"expected $\mathbb{E}[g_2] = 1$")
ax.set_xlabel("Mini-epoch step")
ax.set_ylabel("gradient value")
ax.set_title("Two-State PPO: sampled vs expected clipped-surrogate gradients")
ax.legend(loc="upper right", fontsize=9)
plt.show()
# 4. Sampled reward vs expected objective J(theta1, theta2) = theta2.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(reward_hist, color="#cbd5e1", linewidth=1, label="sampled $R$")
ax.plot(expected_J_hist, color="#2563eb", linewidth=2, linestyle="--",
        label=r"expected $J(\theta_1,\theta_2) = \theta_2$")
ax.set_xlabel("Mini-epoch step")
ax.set_ylabel("Reward")
ax.set_title("Two-State PPO: sampled reward vs expected objective $J$")
ax.legend(loc="lower right", fontsize=9)
plt.show()