
Commit ab40c17

Scope summaries by worker
1 parent 1a126a1 commit ab40c17

4 files changed: +79 -71 lines

PolicyGradient/a3c/estimator_test.py (+13 -9)

@@ -47,14 +47,15 @@ def testPredict(self):
       pred = sess.run(estimator.predictions, feed_dict)

       # Assertions
-      self.assertTrue(loss > 0.0)
+      self.assertTrue(loss != 0.0)
       self.assertEqual(pred["probs"].shape, (1, len(VALID_ACTIONS)))
       self.assertEqual(pred["logits"].shape, (1, len(VALID_ACTIONS)))

   def testGradient(self):
     env = make_env()
     sp = StateProcessor()
     estimator = PolicyEstimator(len(VALID_ACTIONS))
+    grads = [g for g, _ in estimator.grads_and_vars]

     with self.test_session() as sess:
       sess.run(tf.initialize_all_variables())
@@ -64,16 +65,17 @@ def testGradient(self):
       processed_state = atari_helpers.atari_make_initial_state(state)
       processed_states = np.array([processed_state])

-      # Run feeds
+      # Run feeds to get gradients
       feed_dict = {
         estimator.states: processed_states,
         estimator.targets: [1.0],
         estimator.actions: [1]
       }
-      loss = sess.run(estimator.train_op, feed_dict)
+      grads_ = sess.run(grads, feed_dict)

-      # Assertions
-      self.assertTrue(loss > 0.0)
+      # Apply calculated gradients
+      grad_feed_dict = { k: v for k, v in zip(grads, grads_) }
+      _ = sess.run(estimator.train_op, grad_feed_dict)


 class ValueEstimatorTest(tf.test.TestCase):
@@ -99,13 +101,14 @@ def testPredict(self):
       pred = sess.run(estimator.predictions, feed_dict)

       # Assertions
-      self.assertTrue(loss > 0.0)
+      self.assertTrue(loss != 0.0)
       self.assertEqual(pred["logits"].shape, (1,))

   def testGradient(self):
     env = make_env()
     sp = StateProcessor()
     estimator = ValueEstimator()
+    grads = [g for g, _ in estimator.grads_and_vars]

     with self.test_session() as sess:
       sess.run(tf.initialize_all_variables())
@@ -120,10 +123,11 @@ def testGradient(self):
         estimator.states: processed_states,
         estimator.targets: [1.0],
       }
-      loss = sess.run(estimator.train_op, feed_dict)
+      grads_ = sess.run(grads, feed_dict)

-      # Assertions
-      self.assertTrue(loss > 0.0)
+      # Apply calculated gradients
+      grad_feed_dict = { k: v for k, v in zip(grads, grads_) }
+      _ = sess.run(estimator.train_op, grad_feed_dict)

 if __name__ == '__main__':
   unittest.main()
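The rewritten tests rely on the fact that a TensorFlow feed_dict can override any tensor in the graph, not just placeholders: they first evaluate the gradient tensors numerically, then feed those values back in place of the gradients when running train_op. A minimal, self-contained sketch of the same pattern against the same TF 0.x-era API (the toy graph and names below are illustrative, not code from this repo):

import tensorflow as tf

# Toy stand-in for an estimator: a loss, its gradients, and a train_op built with
# compute_gradients/apply_gradients so the gradient tensors are feedable.
x = tf.placeholder(tf.float32, [None, 4], name="x")
w = tf.Variable(tf.zeros([4, 1]), name="w")
loss = tf.reduce_sum(tf.square(tf.matmul(x, w) - 1.0))

optimizer = tf.train.AdamOptimizer(1e-4)
grads_and_vars = [(g, v) for g, v in optimizer.compute_gradients(loss) if g is not None]
train_op = optimizer.apply_gradients(grads_and_vars)
grads = [g for g, _ in grads_and_vars]

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  # 1. Evaluate the gradient tensors with the real inputs.
  grads_ = sess.run(grads, {x: [[1.0, 2.0, 3.0, 4.0]]})
  # 2. Feed the numeric gradients back in place of the gradient tensors.
  #    The feed cuts the graph at those tensors, so train_op no longer needs x.
  sess.run(train_op, {k: v for k, v in zip(grads, grads_)})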

PolicyGradient/a3c/estimators.py (+44 -50)

@@ -69,44 +69,41 @@ def __init__(self, num_outputs, reuse=False, trainable=True):
     with tf.variable_scope("policy_net"):
       self.logits = tf.contrib.layers.fully_connected(fc1, num_outputs, activation_fn=None)
       self.probs = tf.nn.softmax(self.logits)
+      self.probs = tf.clip_by_value(self.probs, 1e-6, 1.0)

       self.predictions = {
         "logits": self.logits,
         "probs": self.probs
       }

-      if not trainable:
-        return
-
       # We add cross-entropy to the loss to encourage exploration
-      self.cross_entropy = -tf.reduce_sum(self.probs * tf.log(self.probs), 1)
+      self.cross_entropy = -tf.reduce_sum(self.probs * tf.log(self.probs), 1, name="cross_entropy")
+      self.cross_entropy_mean = tf.reduce_mean(self.cross_entropy, name="cross_entropy_mean")

       # Get the predictions for the chosen actions only
       gather_indices = tf.range(batch_size) * tf.shape(self.probs)[1] + self.actions
       self.picked_action_probs = tf.gather(tf.reshape(self.probs, [-1]), gather_indices)

       self.losses = - (tf.log(self.picked_action_probs) * self.targets + 0.01 * self.cross_entropy)
-      self.loss = tf.reduce_sum(self.losses)
+      self.loss = tf.reduce_sum(self.losses, name="loss")

-      tf.scalar_summary("policy_net/loss", self.loss)
-      tf.scalar_summary("policy_net/advantage_mean", tf.reduce_mean(self.targets))
-      tf.scalar_summary("policy_net/entropy_mean", tf.reduce_mean(self.cross_entropy))
-      tf.histogram_summary("policy_net/cross_entropy", self.cross_entropy)
-      tf.histogram_summary("policy_net/actions", self.actions)
+      tf.scalar_summary(self.loss.op.name, self.loss)
+      tf.scalar_summary(self.cross_entropy_mean.op.name, self.cross_entropy_mean)
+      tf.histogram_summary(self.cross_entropy.op.name, self.cross_entropy)

-      # Optimizer Parameters from original paper
-      self.optimizer = tf.train.AdamOptimizer(1e-4)
-      self.train_op = tf.contrib.layers.optimize_loss(
-        loss=self.loss,
-        global_step=tf.contrib.framework.get_global_step(),
-        learning_rate=1e-4,
-        optimizer=self.optimizer,
-        # clip_gradients=5.0,
-        summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)
+      if trainable:
+        self.optimizer = tf.train.AdamOptimizer(1e-4)
+        self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
+        self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]
+        self.train_op = self.optimizer.apply_gradients(self.grads_and_vars,
+          global_step=tf.contrib.framework.get_global_step())

-      # Merge summaries from this network and the shared network (but not the value net)
-      summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
-      self.summaries = tf.merge_summary([s for s in summary_ops if "policy_net" in s.name or "shared" in s.name])
+    # Merge summaries from this network and the shared network (but not the value net)
+    var_scope_name = tf.get_variable_scope().name
+    summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
+    sumaries = [s for s in summary_ops if "policy_net" in s.name or "shared" in s.name]
+    sumaries = [s for s in summary_ops if var_scope_name in s.name]
+    self.summaries = tf.merge_summary(sumaries)


 class ValueEstimator():
@@ -139,39 +136,36 @@ def __init__(self, reuse=False, trainable=True):
         inputs=fc1,
         num_outputs=1,
         activation_fn=None)
-      self.logits = tf.squeeze(self.logits, squeeze_dims=[1])
+      self.logits = tf.squeeze(self.logits, squeeze_dims=[1], name="logits")

       self.losses = tf.squared_difference(self.logits, self.targets)
-      self.loss = tf.reduce_sum(self.losses)
+      self.loss = tf.reduce_sum(self.losses, name="loss")

       self.predictions = {
         "logits": self.logits
       }

-      if not trainable:
-        return
-
-      # Optimizer Parameters from original paper
-      self.optimizer = tf.train.AdamOptimizer(1e-4)
-      self.train_op = tf.contrib.layers.optimize_loss(
-        loss=self.loss,
-        global_step=tf.contrib.framework.get_global_step(),
-        learning_rate=1e-4,
-        optimizer=self.optimizer,
-        # clip_gradients=5.0,
-        summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)
-
       # Summaries
-      tf.scalar_summary("value_net/loss", self.loss)
-      tf.scalar_summary("value_net/max_value", tf.reduce_max(self.logits))
-      tf.scalar_summary("value_net/min_value", tf.reduce_min(self.logits))
-      tf.scalar_summary("value_net/mean_value", tf.reduce_mean(self.logits))
-      tf.scalar_summary("value_net/reward_max", tf.reduce_max(self.targets))
-      tf.scalar_summary("value_net/reward_min", tf.reduce_min(self.targets))
-      tf.scalar_summary("value_net/reward_mean", tf.reduce_mean(self.targets))
-      tf.histogram_summary("value_net/reward_targets", self.targets)
-      tf.histogram_summary("value_net/values", self.logits)
-
-      # Merge summaries from this network and the shared network (but not the policy net)
-      summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
-      self.summaries = tf.merge_summary([s for s in summary_ops if "value_net" in s.name or "shared" in s.name])
+      prefix = tf.get_variable_scope().name
+      tf.scalar_summary(self.loss.name, self.loss)
+      tf.scalar_summary("{}/max_value".format(prefix), tf.reduce_max(self.logits))
+      tf.scalar_summary("{}/min_value".format(prefix), tf.reduce_min(self.logits))
+      tf.scalar_summary("{}/mean_value".format(prefix), tf.reduce_mean(self.logits))
+      tf.scalar_summary("{}/reward_max".format(prefix), tf.reduce_max(self.targets))
+      tf.scalar_summary("{}/reward_min".format(prefix), tf.reduce_min(self.targets))
+      tf.scalar_summary("{}/reward_mean".format(prefix), tf.reduce_mean(self.targets))
+      tf.histogram_summary("{}/reward_targets".format(prefix), self.targets)
+      tf.histogram_summary("{}/values".format(prefix), self.logits)

+      if trainable:
+        self.optimizer = tf.train.AdamOptimizer(1e-4)
+        self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
+        self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]
+        self.train_op = self.optimizer.apply_gradients(self.grads_and_vars,
+          global_step=tf.contrib.framework.get_global_step())
+
+    var_scope_name = tf.get_variable_scope().name
+    summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
+    sumaries = [s for s in summary_ops if "policy_net" in s.name or "shared" in s.name]
+    sumaries = [s for s in summary_ops if var_scope_name in s.name]
+    self.summaries = tf.merge_summary(sumaries)
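Both estimators now name their loss and entropy tensors and derive summary tags from tensor and scope names instead of the hard-coded "policy_net/..." and "value_net/..." strings, and the final merge filters the global SUMMARIES collection by the enclosing variable scope name, so an estimator built inside a worker's scope only merges that worker's summaries (the second sumaries assignment is the one that takes effect). A rough sketch of that scoping idea, assuming each network is constructed inside a per-worker variable scope; the scope names and toy graph below are illustrative, not the repo's estimators:

import tensorflow as tf

def build_net(scope_name):
  with tf.variable_scope(scope_name):
    x = tf.Variable(tf.zeros([1]), name="x")
    loss = tf.reduce_sum(tf.square(x), name="loss")
    # The summary tag and op name inherit the enclosing scope, e.g. "worker_0/loss".
    tf.scalar_summary(loss.op.name, loss)
    # Keep only summaries whose names contain this scope, mirroring the
    # var_scope_name filter in the diff above.
    var_scope_name = tf.get_variable_scope().name
    summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
    return tf.merge_summary([s for s in summary_ops if var_scope_name in s.name])

summaries_w0 = build_net("worker_0")  # merges only worker_0/... summaries
summaries_w1 = build_net("worker_1")  # merges only worker_1/... summaries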

PolicyGradient/a3c/worker.py (+21 -11)

@@ -165,20 +165,30 @@ def update(self, transitions, sess):
       policy_targets.append(policy_target)
       value_targets.append(reward)

+    # Calculate the gradients
     feed_dict = {
-      self.global_policy_net.states: np.array(states),
-      self.global_policy_net.targets: policy_targets,
-      self.global_policy_net.actions: actions,
-      self.global_value_net.states: np.array(states),
-      self.global_value_net.targets: value_targets,
+      self.policy_net.states: np.array(states),
+      self.policy_net.targets: policy_targets,
+      self.policy_net.actions: actions,
+      self.value_net.states: np.array(states),
+      self.value_net.targets: value_targets,
     }

-    # Apply policy net update
-    global_step, pnet_loss, pnet_summaries, vnet_loss, vnet_summaries = sess.run(
-      [self.global_step, self.global_policy_net.train_op,
-       self.global_policy_net.summaries, self.global_value_net.train_op,
-       self.global_value_net.summaries],
-      feed_dict)
+    # Calculate the local gradients
+    pnet_loss, vnet_loss, pnet_grads, vnet_grads, pnet_summaries, vnet_summaries = sess.run([
+      self.policy_net.loss,
+      self.value_net.loss,
+      [g for g, _ in self.policy_net.grads_and_vars],
+      [g for g, _ in self.value_net.grads_and_vars],
+      self.policy_net.summaries,
+      self.value_net.summaries
+    ], feed_dict)
+
+    # Apply the gradients to the global nets
+    pnet_grad_ops = [g for g, _ in self.global_policy_net.grads_and_vars]
+    vnet_grad_ops = [g for g, _ in self.global_value_net.grads_and_vars]
+    grad_feed_dict = { k: v for k, v in zip(pnet_grad_ops + vnet_grad_ops, pnet_grads + vnet_grads)}
+    global_step, _, _, = sess.run([self.global_step, self.global_policy_net.train_op, self.global_value_net.train_op], grad_feed_dict)

     # Write summaries
     if self.summary_writer is not None:
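The worker's update() now runs its own local policy and value networks to get losses, summaries, and gradient values, then pushes those gradients to the shared parameters by feeding them in place of the global networks' gradient tensors before running the global train_ops. A condensed sketch of that local-compute / global-apply flow; the two toy networks and names below are illustrative, not the repo's estimators:

import tensorflow as tf

def make_net(scope):
  # Stand-in for PolicyEstimator/ValueEstimator: a loss plus grads_and_vars and
  # a train_op created with compute_gradients/apply_gradients.
  with tf.variable_scope(scope):
    states = tf.placeholder(tf.float32, [None, 4], name="states")
    targets = tf.placeholder(tf.float32, [None], name="targets")
    w = tf.get_variable("w", [4, 1])
    loss = tf.reduce_sum(tf.square(tf.squeeze(tf.matmul(states, w), [1]) - targets))
    optimizer = tf.train.AdamOptimizer(1e-4)
    grads_and_vars = [(g, v) for g, v in optimizer.compute_gradients(loss) if g is not None]
    train_op = optimizer.apply_gradients(grads_and_vars)
  return states, targets, grads_and_vars, train_op

g_states, g_targets, g_gvs, g_train_op = make_net("global")    # shared parameters
l_states, l_targets, l_gvs, l_train_op = make_net("worker_0")  # worker's local copy

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  # 1. Compute gradients on the local network from the worker's own batch.
  local_grads = sess.run([g for g, _ in l_gvs],
                         {l_states: [[0.1, 0.2, 0.3, 0.4]], l_targets: [1.0]})
  # 2. Feed them in place of the global network's gradient tensors and apply,
  #    so the update lands on the shared (global) parameters.
  grad_feed_dict = {g: v for (g, _), v in zip(g_gvs, local_grads)}
  sess.run(g_train_op, grad_feed_dict)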

PolicyGradient/a3c/worker_test.py (+1 -1)

@@ -86,7 +86,7 @@ def testRunNStepsAndUpdate(self):
       state = self.sp.process(self.env.reset())
       processed_state = atari_helpers.atari_make_initial_state(state)
       w.state = processed_state
-      transitions = w.run_n_steps(10, sess)
+      transitions, local_t, global_t = w.run_n_steps(10, sess)
       policy_net_loss, value_net_loss, policy_net_summaries, value_net_summaries = w.update(transitions, sess)

       self.assertEqual(len(transitions), 10)
