# ppo_tf_cartpole.py
# PPO (clipped surrogate objective) on CartPole, implemented with TensorFlow 2.
  1. import matplotlib
  2. from matplotlib import pyplot as plt
  3. matplotlib.rcParams['font.size'] = 18
  4. matplotlib.rcParams['figure.titlesize'] = 18
  5. matplotlib.rcParams['figure.figsize'] = [9, 7]
  6. matplotlib.rcParams['font.family'] = ['KaiTi']
  7. matplotlib.rcParams['axes.unicode_minus']=False
  8. plt.figure()
  9. import gym,os
  10. import numpy as np
  11. import tensorflow as tf
  12. from tensorflow import keras
  13. from tensorflow.keras import layers,optimizers,losses
  14. from collections import namedtuple
  15. from torch.utils.data import SubsetRandomSampler,BatchSampler
  16. env = gym.make('CartPole-v1') # 创建游戏环境
  17. env.seed(2222)
  18. tf.random.set_seed(2222)
  19. np.random.seed(2222)
  20. os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
  21. assert tf.__version__.startswith('2.')
  22. gamma = 0.98 # 激励衰减因子
  23. epsilon = 0.2 # PPO误差超参数0.8~1.2
  24. batch_size = 32 # batch size
  25. # 创建游戏环境
  26. env = gym.make('CartPole-v0').unwrapped
  27. Transition = namedtuple('Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state'])
  28. class Actor(keras.Model):
  29. def __init__(self):
  30. super(Actor, self).__init__()
  31. # 策略网络,也叫Actor网络,输出为概率分布pi(a|s)
  32. self.fc1 = layers.Dense(100, kernel_initializer='he_normal')
  33. self.fc2 = layers.Dense(2, kernel_initializer='he_normal')
  34. def call(self, inputs):
  35. x = tf.nn.relu(self.fc1(inputs))
  36. x = self.fc2(x)
  37. x = tf.nn.softmax(x, axis=1) # 转换成概率
  38. return x
  39. class Critic(keras.Model):
  40. def __init__(self):
  41. super(Critic, self).__init__()
  42. # 偏置b的估值网络,也叫Critic网络,输出为v(s)
  43. self.fc1 = layers.Dense(100, kernel_initializer='he_normal')
  44. self.fc2 = layers.Dense(1, kernel_initializer='he_normal')
  45. def call(self, inputs):
  46. x = tf.nn.relu(self.fc1(inputs))
  47. x = self.fc2(x)
  48. return x
  49. class PPO():
  50. # PPO算法主体
  51. def __init__(self):
  52. super(PPO, self).__init__()
  53. self.actor = Actor() # 创建Actor网络
  54. self.critic = Critic() # 创建Critic网络
  55. self.buffer = [] # 数据缓冲池
  56. self.actor_optimizer = optimizers.Adam(1e-3) # Actor优化器
  57. self.critic_optimizer = optimizers.Adam(3e-3) # Critic优化器
  58. def select_action(self, s):
  59. # 送入状态向量,获取策略: [4]
  60. s = tf.constant(s, dtype=tf.float32)
  61. # s: [4] => [1,4]
  62. s = tf.expand_dims(s, axis=0)
  63. # 获取策略分布: [1, 2]
  64. prob = self.actor(s)
  65. # 从类别分布中采样1个动作, shape: [1]
  66. a = tf.random.categorical(tf.math.log(prob), 1)[0]
  67. a = int(a) # Tensor转数字
  68. return a, float(prob[0][a]) # 返回动作及其概率
  69. def get_value(self, s):
  70. # 送入状态向量,获取策略: [4]
  71. s = tf.constant(s, dtype=tf.float32)
  72. # s: [4] => [1,4]
  73. s = tf.expand_dims(s, axis=0)
  74. # 获取策略分布: [1, 2]
  75. v = self.critic(s)[0]
  76. return float(v) # 返回v(s)
  77. def store_transition(self, transition):
  78. # 存储采样数据
  79. self.buffer.append(transition)
  80. def optimize(self):
  81. # 优化网络主函数
  82. # 从缓存中取出样本数据,转换成Tensor
  83. state = tf.constant([t.state for t in self.buffer], dtype=tf.float32)
  84. action = tf.constant([t.action for t in self.buffer], dtype=tf.int32)
  85. action = tf.reshape(action,[-1,1])
  86. reward = [t.reward for t in self.buffer]
  87. old_action_log_prob = tf.constant([t.a_log_prob for t in self.buffer], dtype=tf.float32)
  88. old_action_log_prob = tf.reshape(old_action_log_prob, [-1,1])
  89. # 通过MC方法循环计算R(st)
  90. R = 0
  91. Rs = []
  92. for r in reward[::-1]:
  93. R = r + gamma * R
  94. Rs.insert(0, R)
  95. Rs = tf.constant(Rs, dtype=tf.float32)
  96. # 对缓冲池数据大致迭代10遍
  97. for _ in range(round(10*len(self.buffer)/batch_size)):
  98. # 随机从缓冲池采样batch size大小样本
  99. index = np.random.choice(np.arange(len(self.buffer)), batch_size, replace=False)
  100. # 构建梯度跟踪环境
  101. with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
  102. # 取出R(st),[b,1]
  103. v_target = tf.expand_dims(tf.gather(Rs, index, axis=0), axis=1)
  104. # 计算v(s)预测值,也就是偏置b,我们后面会介绍为什么写成v
  105. v = self.critic(tf.gather(state, index, axis=0))
  106. delta = v_target - v # 计算优势值
  107. advantage = tf.stop_gradient(delta) # 断开梯度连接
  108. # 由于TF的gather_nd与pytorch的gather功能不一样,需要构造
  109. # gather_nd需要的坐标参数,indices:[b, 2]
  110. # pi_a = pi.gather(1, a) # pytorch只需要一行即可实现
  111. a = tf.gather(action, index, axis=0) # 取出batch的动作at
  112. # batch的动作分布pi(a|st)
  113. pi = self.actor(tf.gather(state, index, axis=0))
  114. indices = tf.expand_dims(tf.range(a.shape[0]), axis=1)
  115. indices = tf.concat([indices, a], axis=1)
  116. pi_a = tf.gather_nd(pi, indices) # 动作的概率值pi(at|st), [b]
  117. pi_a = tf.expand_dims(pi_a, axis=1) # [b]=> [b,1]
  118. # 重要性采样
  119. ratio = (pi_a / tf.gather(old_action_log_prob, index, axis=0))
  120. surr1 = ratio * advantage
  121. surr2 = tf.clip_by_value(ratio, 1 - epsilon, 1 + epsilon) * advantage
  122. # PPO误差函数
  123. policy_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))
  124. # 对于偏置v来说,希望与MC估计的R(st)越接近越好
  125. value_loss = losses.MSE(v_target, v)
  126. # 优化策略网络
  127. grads = tape1.gradient(policy_loss, self.actor.trainable_variables)
  128. self.actor_optimizer.apply_gradients(zip(grads, self.actor.trainable_variables))
  129. # 优化偏置值网络
  130. grads = tape2.gradient(value_loss, self.critic.trainable_variables)
  131. self.critic_optimizer.apply_gradients(zip(grads, self.critic.trainable_variables))
  132. self.buffer = [] # 清空已训练数据
  133. def main():
  134. agent = PPO()
  135. returns = [] # 统计总回报
  136. total = 0 # 一段时间内平均回报
  137. for i_epoch in range(500): # 训练回合数
  138. state = env.reset() # 复位环境
  139. for t in range(500): # 最多考虑500步
  140. # 通过最新策略与环境交互
  141. action, action_prob = agent.select_action(state)
  142. next_state, reward, done, _ = env.step(action)
  143. # 构建样本并存储
  144. trans = Transition(state, action, action_prob, reward, next_state)
  145. agent.store_transition(trans)
  146. state = next_state # 刷新状态
  147. total += reward # 累积激励
  148. if done: # 合适的时间点训练网络
  149. if len(agent.buffer) >= batch_size:
  150. agent.optimize() # 训练网络
  151. break
  152. if i_epoch % 20 == 0: # 每20个回合统计一次平均回报
  153. returns.append(total/20)
  154. total = 0
  155. print(i_epoch, returns[-1])
  156. print(np.array(returns))
  157. plt.figure()
  158. plt.plot(np.arange(len(returns))*20, np.array(returns))
  159. plt.plot(np.arange(len(returns))*20, np.array(returns), 's')
  160. plt.xlabel('回合数')
  161. plt.ylabel('总回报')
  162. plt.savefig('ppo-tf-cartpole.svg')
  163. if __name__ == '__main__':
  164. main()
  165. print("end")