# REINFORCE_tf.py

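"""REINFORCE (Monte Carlo policy gradient) on CartPole-v1 with TensorFlow 2.

A two-layer policy network maps the 4-dimensional state to a distribution
over the 2 actions; after each episode, the network is updated using the
discounted returns recorded along the trajectory.
"""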
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # must be set before importing TensorFlow to take effect

import gym
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
# Default parameters for plots
matplotlib.rcParams['font.size'] = 18
matplotlib.rcParams['figure.titlesize'] = 18
matplotlib.rcParams['figure.figsize'] = [9, 7]
matplotlib.rcParams['font.family'] = ['KaiTi']  # CJK font; only needed for Chinese labels
matplotlib.rcParams['axes.unicode_minus'] = False

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers
from PIL import Image  # only used by the commented-out frame-saving code below

env = gym.make('CartPole-v1')  # create the game environment
env.seed(2333)  # old-style gym API; newer gym versions seed via env.reset(seed=...)
tf.random.set_seed(2333)
np.random.seed(2333)
assert tf.__version__.startswith('2.')

learning_rate = 0.0002
gamma = 0.98  # reward discount factor
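
# REINFORCE background: the estimator implemented in Policy.train_net below is
#   grad_theta J(theta) ~= sum_t grad_theta log pi_theta(a_t | s_t) * R_t,
# with the return-to-go R_t = r_t + gamma * R_{t+1} (R = 0 at the terminal
# state), so each timestep contributes the loss -log pi_theta(a_t | s_t) * R_t.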
class Policy(keras.Model):
    # Policy network: outputs a probability distribution over actions
    def __init__(self):
        super(Policy, self).__init__()
        self.data = []  # stores the current episode's trajectory
        # Input is a state vector of length 4; output covers the 2 actions (left, right)
        self.fc1 = layers.Dense(128, kernel_initializer='he_normal')
        self.fc2 = layers.Dense(2, kernel_initializer='he_normal')
        # Network optimizer
        self.optimizer = optimizers.Adam(learning_rate=learning_rate)

    def call(self, inputs, training=None):
        # The state input s has shape [b, 4]
        x = tf.nn.relu(self.fc1(inputs))
        x = tf.nn.softmax(self.fc2(x), axis=1)
        return x

    def put_data(self, item):
        # Record (r, log_P(a|s)) for one timestep
        self.data.append(item)

    def train_net(self, tape):
        # Compute gradients and update the policy network; tape is the gradient recorder
        R = 0  # the return of the terminal state is 0
        for r, log_prob in self.data[::-1]:  # traverse the trajectory in reverse
            R = r + gamma * R  # return-to-go at each timestep
            # Take one gradient step per timestep:
            # loss = -log_P(a|s) * R, so grad(loss) = -grad(log_P) * R
            loss = -log_prob * R
            with tape.stop_recording():
                # Optimize the policy network
                grads = tape.gradient(loss, self.trainable_variables)
                # print(grads)
                self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        self.data = []  # clear the trajectory
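
# Optional variant (not part of the original script): REINFORCE is often
# implemented with a single update per episode, summing the per-step losses
# and applying one gradient step. A minimal sketch under the same setup; the
# name train_net_single_update is hypothetical.
def train_net_single_update(pi, tape):
    R = 0.0
    loss = 0.0
    for r, log_prob in pi.data[::-1]:
        R = r + gamma * R
        loss = loss - log_prob * R  # sum of -log_P(a|s) * R over the episode
    with tape.stop_recording():
        grads = tape.gradient(loss, pi.trainable_variables)
        pi.optimizer.apply_gradients(zip(grads, pi.trainable_variables))
    pi.data = []  # clear the trajectory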
def main():
    pi = Policy()  # create the policy network
    pi(tf.random.normal((4, 4)))  # build the network with a dummy batch
    pi.summary()
    score = 0.0  # running score
    print_interval = 20  # logging interval
    returns = []
    for n_epi in range(400):
        s = env.reset()  # reset to the initial state, returns s0
        with tf.GradientTape(persistent=True) as tape:
            for t in range(501):  # CartPole-v1 is forced to terminate after 500 steps
                # Feed the state vector to obtain the policy
                s = tf.constant(s, dtype=tf.float32)
                # s: [4] => [1,4]
                s = tf.expand_dims(s, axis=0)
                prob = pi(s)  # action distribution: [1,2]
                # Sample one action from the categorical distribution, shape: [1]
                # (tf.random.categorical expects logits, hence the log)
                a = tf.random.categorical(tf.math.log(prob), 1)[0]
                a = int(a)  # Tensor to Python int
                s_prime, r, done, info = env.step(a)
                # Record the reward r and the log-probability of action a
                # prob shape: [1,2]
                pi.put_data((r, tf.math.log(prob[0][a])))
                s = s_prime  # advance the state
                score += r  # accumulate the reward
                if n_epi > 1000:  # never triggers with range(400); lower the threshold to render
                    env.render()
                    # im = Image.fromarray(s)
                    # im.save("res/%d.jpg" % info['frames'][0])
                if done:  # current episode is over
                    break
            # After the episode terminates, train the network once
            pi.train_net(tape)
        del tape  # release the persistent tape
        if n_epi % print_interval == 0 and n_epi != 0:
            returns.append(score / print_interval)
            print(f"# of episode :{n_epi}, avg score : {score / print_interval}")
            score = 0.0
    env.close()  # close the environment
    plt.plot(np.arange(len(returns)) * print_interval, returns)
    plt.plot(np.arange(len(returns)) * print_interval, returns, 's')
    plt.xlabel('Episode')
    plt.ylabel('Average return')
    plt.savefig('reinforce-tf-cartpole.svg')
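
# Optional helper (not part of the original script): greedy evaluation of the
# trained policy. A minimal sketch assuming the same old-style gym API used
# above; the name `evaluate` and its parameters are hypothetical.
def evaluate(pi, n_episodes=10):
    total = 0.0
    for _ in range(n_episodes):
        s = env.reset()
        done = False
        while not done:
            prob = pi(tf.expand_dims(tf.constant(s, dtype=tf.float32), axis=0))
            a = int(tf.argmax(prob, axis=1)[0])  # greedy action instead of sampling
            s, r, done, _ = env.step(a)
            total += r
    return total / n_episodes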
if __name__ == '__main__':
    main()