Commit 9be600f

Committed Mar 6, 2017
first version
1 parent 27e0e85 commit 9be600f


9 files changed: +1089 −16 lines

 

‎Reinforcement_learning_TUT/2_Q_Learning_maze/maze_env.py

+1-1
@@ -23,7 +23,7 @@
 MAZE_W = 4  # grid width


-class Maze(tk.Tk):
+class Maze(tk.Tk, object):
     def __init__(self):
         super(Maze, self).__init__()
         self.action_space = ['u', 'd', 'l', 'r']
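
The only change is adding object as an explicit base class. A plausible motivation (my inference, not stated in the commit): under Python 2, Tkinter.Tk is an old-style class, so super(Maze, self).__init__() fails unless Maze is made new-style by also inheriting from object. A minimal sketch of the difference (the class names and the import fallback here are just for illustration):

# Illustrative sketch only; my assumption about why `object` was added.
try:
    import Tkinter as tk      # Python 2
except ImportError:
    import tkinter as tk      # Python 3

class MazeOld(tk.Tk):                        # old-style base only under Python 2
    def __init__(self):
        super(MazeOld, self).__init__()      # Python 2: TypeError: must be type, not classobj

class MazeNew(tk.Tk, object):                # mixing in object makes the class new-style
    def __init__(self):
        super(MazeNew, self).__init__()      # works under Python 2 and Python 3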
@@ -0,0 +1,161 @@
"""
The Double DQN based on this paper: https://arxiv.org/abs/1509.06461

View more on 莫烦Python: https://morvanzhou.github.io/tutorials/

Using:
Tensorflow: 1.0
"""

import numpy as np
import pandas as pd
import tensorflow as tf

np.random.seed(1)
tf.set_random_seed(1)


class DoubleDQN:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.005,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=200,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
            double_q=True,
            sess=None,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.double_q = double_q    # whether to use Double DQN

        self.learn_step_counter = 0
        self.memory = pd.DataFrame(np.zeros((self.memory_size, n_features*2+2)))   # each row: [s, a, r, s_]
        self._build_net()
        if sess is None:
            self.sess = tf.Session()
        else:
            self.sess = sess
        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)
        # self.sess.run(tf.global_variables_initializer())
        self.cost_his = []

    def _build_net(self):
        def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(s, w1) + b1)

            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                out = tf.matmul(l1, w2) + b2
            return out

        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss

        with tf.variable_scope('eval_net'):
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers

            self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_sum(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        index = self.memory_counter % self.memory_size
        self.memory.iloc[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        observation = observation[np.newaxis, :]
        actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
        action = np.argmax(actions_value)

        if not hasattr(self, 'q'):  # record a running average of the action values for plotting
            self.q = []
            self.running_q = 0
        self.running_q = self.running_q*0.99 + 0.01 * np.max(actions_value)
        self.q.append(self.running_q)

        if np.random.uniform() > self.epsilon:  # explore: pick a random action
            action = np.random.randint(0, self.n_actions)
        return action

    def _replace_target_params(self):
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])

    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params()
            print('\ntarget_params_replaced\n')

        batch_memory = self.memory.sample(self.batch_size) \
            if self.memory_counter > self.memory_size \
            else self.memory.iloc[:self.memory_counter].sample(self.batch_size, replace=True)

        q_next, q_eval4next = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={self.s_: batch_memory.iloc[:, -self.n_features:],    # next observation
                       self.s: batch_memory.iloc[:, -self.n_features:]})    # next observation
        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory.iloc[:, :self.n_features]})

        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory.iloc[:, self.n_features].astype(int)
        reward = batch_memory.iloc[:, self.n_features + 1]

        if self.double_q:
            max_act4next = np.argmax(q_eval4next, axis=1)        # the action with the highest value according to q_eval
            selected_q_next = q_next[batch_index, max_act4next]  # Double DQN: evaluate that action with q_next
        else:
            selected_q_next = np.max(q_next, axis=1)    # natural DQN

        q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next

        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: batch_memory.iloc[:, :self.n_features],
                                                self.q_target: q_target})
        self.cost_his.append(self.cost)

        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
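
The key pair of lines in learn() above is the Double DQN split: the greedy action for s_ is chosen with the eval net (q_eval4next) but its value is read from the target net (q_next). A minimal NumPy sketch of that split with made-up numbers (illustrative only, not part of this commit):

import numpy as np

# made-up Q values for a batch of 2 transitions and 3 actions
q_next      = np.array([[1.0, 5.0, 2.0],     # target net, Q(s', .)
                        [0.5, 0.1, 4.0]])
q_eval4next = np.array([[3.0, 2.0, 2.5],     # eval net,   Q(s', .)
                        [0.2, 0.9, 0.3]])
reward = np.array([0.0, -1.0])
gamma = 0.9
batch_index = np.arange(2)

# natural DQN: select and evaluate with the same (target) net
natural_target = reward + gamma * q_next.max(axis=1)            # [4.5, 2.6]

# Double DQN: select with the eval net, evaluate with the target net
max_act4next    = q_eval4next.argmax(axis=1)                    # [0, 1]
selected_q_next = q_next[batch_index, max_act4next]             # [1.0, 0.1]
double_target   = reward + gamma * selected_q_next              # [0.9, -0.91]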
@@ -0,0 +1,72 @@
"""
Deep Q network,

The Pendulum example
"""


import gym
from RL_brain import DoubleDQN
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf


env = gym.make('Pendulum-v0')
env.seed(1)
MEMORY_SIZE = 3000
ACTION_SPACE = 5

sess = tf.Session()
with tf.variable_scope('Natural_DQN'):
    natural_DQN = DoubleDQN(
        n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
        e_greedy_increment=0.001, double_q=False, sess=sess
    )

with tf.variable_scope('Double_DQN'):
    double_DQN = DoubleDQN(
        n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
        e_greedy_increment=0.001, double_q=True, sess=sess, output_graph=True)

sess.run(tf.global_variables_initializer())


def train(RL):
    total_steps = 0
    observation = env.reset()
    while True:
        # if total_steps - MEMORY_SIZE > 8000: env.render()

        action = RL.choose_action(observation)

        f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4)   # map the discrete action index to a float torque in [-2, 2]

        observation_, reward, done, info = env.step(np.array([f_action]))

        reward /= 10     # normalize to roughly (-1, 0); r = 0 when the pendulum is upright
        # the Q target at the upright state will be 0, because Q_target = r + gamma * Qmax(s', a') = 0 + gamma * 0
        # so whenever Q at this state is greater than 0, the agent is overestimating Q. See the final plot.

        RL.store_transition(observation, action, reward, observation_)

        if total_steps > MEMORY_SIZE:   # start learning once the memory is filled
            RL.learn()

        if total_steps - MEMORY_SIZE > 10000:   # stop the game
            break

        observation = observation_
        total_steps += 1
    return RL.q


q_natural = train(natural_DQN)
q_double = train(double_DQN)

plt.plot(np.array(q_natural), c='r', label='natural')
plt.plot(np.array(q_double), c='b', label='double')
plt.legend(loc='best')
plt.ylabel('Q eval')
plt.xlabel('training steps')
plt.grid()
plt.show()
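
The line f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) maps the 5 discrete action indices onto evenly spaced torques covering Pendulum's continuous action range of [-2, 2]. A quick worked check (my own arithmetic, Python 3 division):

ACTION_SPACE = 5
for action in range(ACTION_SPACE):
    f_action = (action - (ACTION_SPACE - 1) / 2) / ((ACTION_SPACE - 1) / 4)
    print(action, f_action)
# prints: 0 -2.0, 1 -1.0, 2 0.0, 3 1.0, 4 2.0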

‎Reinforcement_learning_TUT/5.2_Prioritized_Replay_DQN/RL_brain.py

+502 (large diff; contents not rendered here)
@@ -0,0 +1,74 @@
"""
Deep Q network,

The mountain car example
"""


import gym
from RL_brain import DoubleDQNPrioritizedReplay, DeepQNetwork
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

env = gym.make('MountainCar-v0')
env.seed(1)
MEMORY_SIZE = 10000

sess = tf.Session()
with tf.variable_scope('natural_DQN'):
    RL_natural = DeepQNetwork(n_actions=3, n_features=2, learning_rate=0.005, e_greedy=0.9,
                              reward_decay=0.9,
                              replace_target_iter=500, memory_size=MEMORY_SIZE,
                              e_greedy_increment=0.0001, sess=sess)

with tf.variable_scope('DQN_with_prioritized_replay'):
    RL_prio = DoubleDQNPrioritizedReplay(n_actions=3, n_features=2, learning_rate=0.005, e_greedy=0.9,
                                         reward_decay=0.9,
                                         replace_target_iter=500, memory_size=MEMORY_SIZE,
                                         e_greedy_increment=0.0001, double_q=False, sess=sess)

sess.run(tf.global_variables_initializer())


def train(RL):
    total_steps = 0
    for i_episode in range(10):
        observation = env.reset()
        while True:
            env.render()

            action = RL.choose_action(observation)

            observation_, reward, done, info = env.step(action)

            if done: reward = 10    # replace the default -1 step reward with +10 at the terminal step

            RL.store_transition(observation, action, reward, observation_)

            if total_steps > MEMORY_SIZE:   # start learning once the memory is filled
                RL.learn()

            if done:
                print('episode: ', i_episode,
                      ' epsilon: ', round(RL.epsilon, 2))
                break

            observation = observation_
            total_steps += 1
    return RL.qn


print('train natural DQN')
qn_natural = train(RL_natural)
print('train DQN prioritized')
qn_prio = train(RL_prio)

plt.plot(np.array(qn_natural), c='b', label='natural DQN')
plt.plot(np.array(qn_prio), c='r', label='DQN with prioritized replay')
plt.legend()
plt.ylabel('max q next')
plt.xlabel('training steps')
plt.grid()
plt.show()
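
Both agents anneal epsilon upward from 0 by e_greedy_increment=0.0001 per learn() call, capped at e_greedy=0.9, and learning only begins once total_steps exceeds MEMORY_SIZE. A back-of-the-envelope check of that schedule (my own arithmetic, using the constructor arguments above):

e_greedy = 0.9
e_greedy_increment = 0.0001

# number of learn() calls, i.e. env steps after the 10000-step warm-up,
# before epsilon stops growing and the policy becomes (mostly) greedy
print(round(e_greedy / e_greedy_increment))   # 9000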
@@ -0,0 +1,167 @@
"""
The Dueling DQN based on this paper: https://arxiv.org/abs/1511.06581

View more on 莫烦Python: https://morvanzhou.github.io/tutorials/

Using:
Tensorflow: 1.0
"""

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

np.random.seed(1)
tf.set_random_seed(1)


class DuelingDQN:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
            dueling=True,
            sess=None,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.dueling = dueling      # whether to use the dueling architecture

        self.learn_step_counter = 0
        self.memory = pd.DataFrame(np.zeros((self.memory_size, n_features*2+2)))   # each row: [s, a, r, s_]
        self._build_net()
        if sess is None:
            self.sess = tf.Session()
        else:
            self.sess = sess
        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)
        # self.sess.run(tf.global_variables_initializer())
        self.cost_his = []

    def _build_net(self):
        def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(s, w1) + b1)

            if self.dueling:
                # Dueling DQN
                with tf.variable_scope('Value'):
                    w2 = tf.get_variable('w2', [n_l1, 1], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, 1], initializer=b_initializer, collections=c_names)
                    self.V = tf.matmul(l1, w2) + b2

                with tf.variable_scope('Advantage'):
                    w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                    self.A = tf.matmul(l1, w2) + b2

                with tf.variable_scope('Q'):
                    out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True))  # Q = V(s) + (A(s,a) - mean_a A(s,a))
            else:
                with tf.variable_scope('Q'):
                    w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                    out = tf.matmul(l1, w2) + b2

            return out

        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        with tf.variable_scope('eval_net'):
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers

            self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_sum(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        index = self.memory_counter % self.memory_size
        self.memory.iloc[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:  # exploit: act greedily on the current Q estimates
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:                                   # explore: pick a random action
            action = np.random.randint(0, self.n_actions)
        return action

    def _replace_target_params(self):
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])

    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params()
            print('\ntarget_params_replaced\n')

        batch_memory = self.memory.sample(self.batch_size) \
            if self.memory_counter > self.memory_size \
            else self.memory.iloc[:self.memory_counter].sample(self.batch_size, replace=True)

        q_next, q_eval4next = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={self.s_: batch_memory.iloc[:, -self.n_features:],    # next observation
                       self.s: batch_memory.iloc[:, -self.n_features:]})    # next observation
        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory.iloc[:, :self.n_features]})

        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory.iloc[:, self.n_features].astype(int)
        reward = batch_memory.iloc[:, self.n_features + 1]

        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: batch_memory.iloc[:, :self.n_features],
                                                self.q_target: q_target})
        self.cost_his.append(self.cost)

        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
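
The dueling head above combines a scalar state value V with per-action advantages A as Q = V + (A - mean(A)); subtracting the mean keeps the decomposition identifiable, as in the paper. A small NumPy check with made-up numbers (illustrative only, not part of this commit):

import numpy as np

V = np.array([[2.0]])                      # state value, shape (batch, 1)
A = np.array([[1.0, -1.0, 3.0]])           # advantages,  shape (batch, n_actions)

Q = V + (A - A.mean(axis=1, keepdims=True))   # same formula as in the 'Q' scope above
print(Q)   # [[2. 0. 4.]]  the mean of Q equals V, so V and A stay separately identifiable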
@@ -0,0 +1,84 @@
"""
Deep Q network,

The Pendulum example
"""


import gym
from RL_brain import DuelingDQN
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf


env = gym.make('Pendulum-v0')
# env.seed(1)
MEMORY_SIZE = 3000
ACTION_SPACE = 5

sess = tf.Session()
with tf.variable_scope('natural'):
    natural_DQN = DuelingDQN(n_actions=ACTION_SPACE, n_features=3, learning_rate=0.001, e_greedy=0.9,
                             reward_decay=0.9,
                             replace_target_iter=200, memory_size=MEMORY_SIZE,
                             e_greedy_increment=0.001, sess=sess, dueling=False)

with tf.variable_scope('dueling'):
    dueling_DQN = DuelingDQN(n_actions=ACTION_SPACE, n_features=3, learning_rate=0.001, e_greedy=0.9,
                             reward_decay=0.9,
                             replace_target_iter=200, memory_size=MEMORY_SIZE,
                             e_greedy_increment=0.001, sess=sess, dueling=True,
                             output_graph=False)

sess.run(tf.global_variables_initializer())


def train(RL):
    acc_r = [0]
    total_steps = 0
    observation = env.reset()
    while True:
        # if total_steps-MEMORY_SIZE > 9000: env.render()

        action = RL.choose_action(observation)

        f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4)   # map to a float torque in [-2, 2]
        observation_, reward, done, info = env.step(np.array([f_action]))

        reward = reward/10                  # normalize the reward
        acc_r.append(reward + acc_r[-1])    # accumulated reward

        RL.store_transition(observation, action, reward, observation_)

        if total_steps > MEMORY_SIZE:   # start learning once the memory is filled
            RL.learn()

        if total_steps-MEMORY_SIZE > 10000:   # stop the game
            break

        observation = observation_
        total_steps += 1
    return RL.cost_his, acc_r


c_natural, r_natural = train(natural_DQN)
c_dueling, r_dueling = train(dueling_DQN)

plt.figure(1)
plt.plot(np.array(c_natural), c='r', label='natural')
plt.plot(np.array(c_dueling), c='b', label='dueling')
plt.legend(loc='best')
plt.ylabel('cost')
plt.xlabel('training steps')
plt.grid()

plt.figure(2)
plt.plot(np.array(r_natural), c='r', label='natural')
plt.plot(np.array(r_dueling), c='b', label='dueling')
plt.legend(loc='best')
plt.ylabel('accumulated reward')
plt.xlabel('training steps')
plt.grid()

plt.show()

‎Reinforcement_learning_TUT/5_Deep_Q_Network/RL_brain.py

+14-7
@@ -84,9 +84,9 @@ def _build_net(self):
             b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
             self.q_eval = tf.matmul(l1, w2) + b2

-        with tf.name_scope('loss'):
+        with tf.variable_scope('loss'):
             self.loss = tf.reduce_sum(tf.squared_difference(self.q_target, self.q_eval))
-        with tf.name_scope('train'):
+        with tf.variable_scope('train'):
             self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

         # ------------------ build target_net ------------------
@@ -150,14 +150,19 @@ def learn(self):
         q_next, q_eval = self.sess.run(
             [self.q_next, self.q_eval],
             feed_dict={
-                self.s_: batch_memory.iloc[:, -self.n_features:],
-                self.s: batch_memory.iloc[:, :self.n_features]
+                self.s_: batch_memory.iloc[:, -self.n_features:],  # fixed params
+                self.s: batch_memory.iloc[:, :self.n_features],  # newest params
             })

         # change q_target w.r.t q_eval's action
         q_target = q_eval.copy()
-        q_target[np.arange(self.batch_size, dtype=np.int32), batch_memory.iloc[:, self.n_features].astype(int)] = \
-            batch_memory.iloc[:, self.n_features+1] + self.gamma * np.max(q_next, axis=1)
+
+        batch_index = np.arange(self.batch_size, dtype=np.int32)
+        eval_act_index = batch_memory.iloc[:, self.n_features].astype(int)
+        reward = batch_memory.iloc[:, self.n_features + 1]
+
+        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+
         """
         For example in this batch I have 2 samples and 3 actions:
         q_eval =
@@ -180,7 +185,7 @@ def learn(self):
            [[(-1)-(1), 0, 0],
             [0, 0, (-2)-(6)]]

-        We then backpropagate this error w.r.t the corresponded action to network,
+        We then backpropagate this error w.r.t the corresponding action to network,
         leave other action as error=0 cause we didn't choose it.
         """
@@ -198,6 +203,8 @@ def learn(self):
     def plot_cost(self):
         import matplotlib.pyplot as plt
         plt.plot(np.arange(len(self.cost_his)), self.cost_his)
+        plt.ylabel('Cost')
+        plt.xlabel('training steps')
         plt.show()

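The refactor in the second hunk just gives names to the three slices of each stored transition (state features, action index, reward) before the fancy-indexed q_target assignment. A standalone NumPy sketch of that assignment with made-up numbers (2 samples, 3 actions, n_features=2; a plain array stands in for the class's pandas memory):

import numpy as np

n_features, gamma = 2, 0.9
# each memory row is hstack((s, [a, r], s_)): 2 state features, action, reward, 2 next-state features
batch_memory = np.array([[0.1, 0.2, 0, -1.0, 0.3, 0.4],
                         [0.5, 0.6, 2, -2.0, 0.7, 0.8]])

q_eval = np.array([[1.0, 2.0, 3.0],    # made-up eval-net outputs for s
                   [4.0, 5.0, 6.0]])
q_next = np.array([[0.0, 0.0, 0.0],    # made-up target-net outputs for s_
                   [0.0, 0.0, 0.0]])

q_target = q_eval.copy()
batch_index = np.arange(2, dtype=np.int32)
eval_act_index = batch_memory[:, n_features].astype(int)   # [0, 2]
reward = batch_memory[:, n_features + 1]                   # [-1., -2.]

q_target[batch_index, eval_act_index] = reward + gamma * np.max(q_next, axis=1)
print(q_target - q_eval)   # only the taken actions carry an error:
                           # [[-2.  0.  0.]
                           #  [ 0.  0. -8.]]
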
‎Reinforcement_learning_TUT/6_OpenAI_gym/RL_brain.py

+14-8
@@ -84,9 +84,9 @@ def _build_net(self):
             b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
             self.q_eval = tf.matmul(l1, w2) + b2

-        with tf.name_scope('loss'):
+        with tf.variable_scope('loss'):
             self.loss = tf.reduce_sum(tf.squared_difference(self.q_target, self.q_eval))
-        with tf.name_scope('train'):
+        with tf.variable_scope('train'):
             self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

         # ------------------ build target_net ------------------
@@ -150,14 +150,19 @@ def learn(self):
         q_next, q_eval = self.sess.run(
             [self.q_next, self.q_eval],
             feed_dict={
-                self.s_: batch_memory.iloc[:, -self.n_features:],
-                self.s: batch_memory.iloc[:, :self.n_features]
+                self.s_: batch_memory.iloc[:, -self.n_features:],  # fixed params
+                self.s: batch_memory.iloc[:, :self.n_features],  # newest params
             })

         # change q_target w.r.t q_eval's action
         q_target = q_eval.copy()
-        q_target[np.arange(self.batch_size, dtype=np.int32), batch_memory.iloc[:, self.n_features].astype(int)] = \
-            batch_memory.iloc[:, self.n_features+1] + self.gamma * np.max(q_next, axis=1)
+
+        batch_index = np.arange(self.batch_size, dtype=np.int32)
+        eval_act_index = batch_memory.iloc[:, self.n_features].astype(int)
+        reward = batch_memory.iloc[:, self.n_features + 1]
+
+        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+
         """
         For example in this batch I have 2 samples and 3 actions:
         q_eval =
@@ -180,7 +185,7 @@ def learn(self):
            [[(-1)-(1), 0, 0],
             [0, 0, (-2)-(6)]]

-        We then backpropagate this error w.r.t the corresponded action to network,
+        We then backpropagate this error w.r.t the corresponding action to network,
         leave other action as error=0 cause we didn't choose it.
         """
@@ -194,10 +199,11 @@ def learn(self):
         self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
         self.learn_step_counter += 1

-
     def plot_cost(self):
         import matplotlib.pyplot as plt
         plt.plot(np.arange(len(self.cost_his)), self.cost_his)
+        plt.ylabel('Cost')
+        plt.xlabel('training steps')
         plt.show()
