dqn_tf.py

import collections
import random
import os

import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers, losses

env = gym.make('CartPole-v1')  # create the CartPole game environment
env.seed(1234)
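# Note: env.seed() and the 4-tuple return of env.step() used below follow the
# classic Gym API (gym <= 0.25); newer Gym/Gymnasium releases changed both calls.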
tf.random.set_seed(1234)
np.random.seed(1234)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
assert tf.__version__.startswith('2.')

# Hyperparameters
learning_rate = 0.0002
gamma = 0.99
buffer_limit = 50000
batch_size = 32

class ReplayBuffer():
    # Experience replay buffer
    def __init__(self):
        # double-ended queue with a fixed capacity
        self.buffer = collections.deque(maxlen=buffer_limit)

    def put(self, transition):
        self.buffer.append(transition)

    def sample(self, n):
        # sample n 5-tuples (s, a, r, s', done_mask) from the buffer
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        # regroup the fields by category
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])
        # convert to tensors
        return tf.constant(s_lst, dtype=tf.float32), \
               tf.constant(a_lst, dtype=tf.int32), \
               tf.constant(r_lst, dtype=tf.float32), \
               tf.constant(s_prime_lst, dtype=tf.float32), \
               tf.constant(done_mask_lst, dtype=tf.float32)

    def size(self):
        return len(self.buffer)

class Qnet(keras.Model):
    def __init__(self):
        # Q network: input is the state vector, output is the Q-value of each action
        super(Qnet, self).__init__()
        self.fc1 = layers.Dense(256, kernel_initializer='he_normal')
        self.fc2 = layers.Dense(256, kernel_initializer='he_normal')
        self.fc3 = layers.Dense(2, kernel_initializer='he_normal')

    def call(self, x, training=None):
        x = tf.nn.relu(self.fc1(x))
        x = tf.nn.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def sample_action(self, s, epsilon):
        # feed in the state vector, shape [4], and pick an action
        s = tf.constant(s, dtype=tf.float32)
        # s: [4] => [1,4]
        s = tf.expand_dims(s, axis=0)
        out = self(s)[0]
        coin = random.random()
        # policy improvement: epsilon-greedy
        if coin < epsilon:
            # with probability epsilon, pick a random action
            return random.randint(0, 1)
        else:  # otherwise pick the action with the largest Q-value
            return int(tf.argmax(out))

def train(q, q_target, memory, optimizer):
    # Build the Bellman-equation error from the Q network and the target ("shadow") network,
    # and update only the Q network; the target network is refreshed later, lagging behind Q.
    huber = losses.Huber()
    for i in range(10):  # run 10 update steps per call
        # sample a batch from the replay buffer
        s, a, r, s_prime, done_mask = memory.sample(batch_size)
        with tf.GradientTape() as tape:
            # s: [b, 4]
            q_out = q(s)  # Q(s, .) for every action, [b, 2]
            # TF's gather_nd works differently from PyTorch's gather, so we have to
            # build the coordinate tensor that gather_nd expects, indices: [b, 2]
            # pi_a = pi.gather(1, a)  # in PyTorch this is a one-liner
            indices = tf.expand_dims(tf.range(a.shape[0]), axis=1)
            indices = tf.concat([indices, a], axis=1)
            q_a = tf.gather_nd(q_out, indices)  # Q-value of the taken action, [b]
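            # For example, with b = 3 and a = [[1], [0], [1]], indices becomes
            # [[0, 1], [1, 0], [2, 1]], so gather_nd picks q_out[0, 1], q_out[1, 0]
            # and q_out[2, 1], i.e. the Q-value of the action taken in each row.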
            q_a = tf.expand_dims(q_a, axis=1)  # [b] => [b,1]
            # max Q(s',a') over actions, computed with the target network! [b,4]=>[b,2]=>[b,1]
            max_q_prime = tf.reduce_max(q_target(s_prime), axis=1, keepdims=True)
            # build the TD target for Q(s,a) from the Bellman equation
            target = r + gamma * max_q_prime * done_mask
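            # i.e. y = r + gamma * max_a' Q_target(s', a'), and y = r on terminal
            # transitions, since done_mask is 0 once the episode has ended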
            # error between the TD target and Q(s,a); Keras losses take (y_true, y_pred)
            loss = huber(target, q_a)
        # update the Q network so that Q(s,a) moves towards the Bellman target
        grads = tape.gradient(loss, q.trainable_variables)
        # for p in grads:
        #     print(tf.norm(p))
        # print(grads)
        optimizer.apply_gradients(zip(grads, q.trainable_variables))

def main():
    env = gym.make('CartPole-v1')  # create the environment
    q = Qnet()  # the Q network
    q_target = Qnet()  # the target ("shadow") network
    q.build(input_shape=(2, 4))
    q_target.build(input_shape=(2, 4))
    for src, dest in zip(q.variables, q_target.variables):
        dest.assign(src)  # initialize the target network from the Q network weights
    memory = ReplayBuffer()  # create the replay buffer
    print_interval = 20
    score = 0.0
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    for n_epi in range(10000):  # number of training episodes
        # epsilon decays from 8% down to 1%, so later episodes rely more and more
        # on the greedy (max-Q) action
        epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200))
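        # e.g. epsilon = 0.08 at episode 0, 0.035 at episode 900, and it stays
        # clamped at 0.01 from episode 1400 onwards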
        s = env.reset()  # reset the environment
        for t in range(600):  # maximum number of time steps per episode
            # if n_epi > 1000:
            #     env.render()
            # sample an action from the current (epsilon-greedy) policy
            a = q.sample_action(s, epsilon)
            # interact with the environment using the improved policy
            s_prime, r, done, info = env.step(a)
            done_mask = 0.0 if done else 1.0  # mask that marks the end of an episode
            # store the 5-tuple
            memory.put((s, a, r / 100.0, s_prime, done_mask))
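            # (CartPole returns r = +1 per step; storing r/100 keeps the Q targets small)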
            s = s_prime  # advance to the next state
            score += r  # accumulate the episode return
            if done:  # episode finished
                break
        if memory.size() > 2000:  # train only once the buffer holds more than 2000 transitions
            train(q, q_target, memory, optimizer)
        if n_epi % print_interval == 0 and n_epi != 0:
            for src, dest in zip(q.variables, q_target.variables):
                dest.assign(src)  # refresh the target network from the Q network
            print("# of episode :{}, avg score : {:.1f}, buffer size : {}, "
                  "epsilon : {:.1f}%"
                  .format(n_epi, score / print_interval, memory.size(), epsilon * 100))
            score = 0.0
    env.close()


if __name__ == '__main__':
    main()