@@ -112,6 +112,7 @@ class Q_policy_iteration:
reward = env.get_reward(x, self.xG, self.lose) # get reward of next state
for i in range(len(x)):
value += p[i] * (reward[i] + self.gamma * table[x[i]][policy[x[i]]])
+
return value
@@ -47,6 +47,7 @@ def lose_map():
lose = []
for i in range(25, 36):
lose.append((i, 13))
return lose
@@ -34,4 +34,5 @@ def move_prob(x, u, obs, eta = 0.2):
x_next.append(x)
else:
x_next.append((x[0] + act[0], x[1] + act[1]))
return x_next, p_next