@@ -69,44 +69,41 @@ def __init__(self, num_outputs, reuse=False, trainable=True):
    with tf.variable_scope("policy_net"):
      self.logits = tf.contrib.layers.fully_connected(fc1, num_outputs, activation_fn=None)
      self.probs = tf.nn.softmax(self.logits)
+     self.probs = tf.clip_by_value(self.probs, 1e-6, 1.0)

      self.predictions = {
        "logits": self.logits,
        "probs": self.probs
      }

-     if not trainable:
-       return
-
      # We add cross-entropy to the loss to encourage exploration
-     self.cross_entropy = -tf.reduce_sum(self.probs * tf.log(self.probs), 1)
+     self.cross_entropy = -tf.reduce_sum(self.probs * tf.log(self.probs), 1, name="cross_entropy")
+     self.cross_entropy_mean = tf.reduce_mean(self.cross_entropy, name="cross_entropy_mean")

      # Get the predictions for the chosen actions only
      gather_indices = tf.range(batch_size) * tf.shape(self.probs)[1] + self.actions
      self.picked_action_probs = tf.gather(tf.reshape(self.probs, [-1]), gather_indices)

      self.losses = -(tf.log(self.picked_action_probs) * self.targets + 0.01 * self.cross_entropy)
-     self.loss = tf.reduce_sum(self.losses)
+     self.loss = tf.reduce_sum(self.losses, name="loss")

-     tf.scalar_summary("policy_net/loss", self.loss)
-     tf.scalar_summary("policy_net/advantage_mean", tf.reduce_mean(self.targets))
-     tf.scalar_summary("policy_net/entropy_mean", tf.reduce_mean(self.cross_entropy))
-     tf.histogram_summary("policy_net/cross_entropy", self.cross_entropy)
-     tf.histogram_summary("policy_net/actions", self.actions)
+     tf.scalar_summary(self.loss.op.name, self.loss)
+     tf.scalar_summary(self.cross_entropy_mean.op.name, self.cross_entropy_mean)
+     tf.histogram_summary(self.cross_entropy.op.name, self.cross_entropy)

-     # Optimizer Parameters from original paper
-     self.optimizer = tf.train.AdamOptimizer(1e-4)
-     self.train_op = tf.contrib.layers.optimize_loss(
-       loss=self.loss,
-       global_step=tf.contrib.framework.get_global_step(),
-       learning_rate=1e-4,
-       optimizer=self.optimizer,
-       # clip_gradients=5.0,
-       summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)
+     if trainable:
+       self.optimizer = tf.train.AdamOptimizer(1e-4)
+       self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
+       self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]
+       self.train_op = self.optimizer.apply_gradients(self.grads_and_vars,
+         global_step=tf.contrib.framework.get_global_step())

-   # Merge summaries from this network and the shared network (but not the value net)
-   summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
-   self.summaries = tf.merge_summary([s for s in summary_ops if "policy_net" in s.name or "shared" in s.name])
+   # Merge summaries from this network and the shared network (but not the value net)
+   var_scope_name = tf.get_variable_scope().name
+   summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
+   sumaries = [s for s in summary_ops if "policy_net" in s.name or "shared" in s.name]
+   sumaries = [s for s in summary_ops if var_scope_name in s.name]
+   self.summaries = tf.merge_summary(sumaries)


class ValueEstimator():
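
As an aside on the hunk above: the policy loss pairs the log-probability of the chosen action with a 0.01-weighted entropy bonus, and it selects the chosen-action probabilities by flattening the probability matrix and indexing it with row * num_actions + action. Below is a minimal NumPy sketch of that computation; the toy batch, actions, and advantage targets are assumptions for illustration, and the names mirror the diff.

import numpy as np

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.3, 0.6]])   # toy batch: 2 states, 3 actions
actions = np.array([0, 2])            # action chosen in each state
targets = np.array([1.5, -0.5])       # advantage estimates

# Entropy of each row's action distribution (the "cross_entropy" term in the diff).
entropy = -np.sum(probs * np.log(probs), axis=1)

# Same flat-index trick as the diff: row * num_actions + action indexes the
# flattened probability matrix.
batch_size, num_actions = probs.shape
gather_indices = np.arange(batch_size) * num_actions + actions
picked_action_probs = probs.reshape(-1)[gather_indices]   # -> [0.7, 0.6]

# Policy-gradient loss with the entropy bonus, summed over the batch.
losses = -(np.log(picked_action_probs) * targets + 0.01 * entropy)
loss = np.sum(losses)
print(picked_action_probs, loss)
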
@@ -139,39 +136,36 @@ def __init__(self, reuse=False, trainable=True):
        inputs=fc1,
        num_outputs=1,
        activation_fn=None)
-     self.logits = tf.squeeze(self.logits, squeeze_dims=[1])
+     self.logits = tf.squeeze(self.logits, squeeze_dims=[1], name="logits")

      self.losses = tf.squared_difference(self.logits, self.targets)
-     self.loss = tf.reduce_sum(self.losses)
+     self.loss = tf.reduce_sum(self.losses, name="loss")

      self.predictions = {
        "logits": self.logits
      }

-     if not trainable:
-       return
-
-     # Optimizer Parameters from original paper
-     self.optimizer = tf.train.AdamOptimizer(1e-4)
-     self.train_op = tf.contrib.layers.optimize_loss(
-       loss=self.loss,
-       global_step=tf.contrib.framework.get_global_step(),
-       learning_rate=1e-4,
-       optimizer=self.optimizer,
-       # clip_gradients=5.0,
-       summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)
-
      # Summaries
-     tf.scalar_summary("value_net/loss", self.loss)
-     tf.scalar_summary("value_net/max_value", tf.reduce_max(self.logits))
-     tf.scalar_summary("value_net/min_value", tf.reduce_min(self.logits))
-     tf.scalar_summary("value_net/mean_value", tf.reduce_mean(self.logits))
-     tf.scalar_summary("value_net/reward_max", tf.reduce_max(self.targets))
-     tf.scalar_summary("value_net/reward_min", tf.reduce_min(self.targets))
-     tf.scalar_summary("value_net/reward_mean", tf.reduce_mean(self.targets))
-     tf.histogram_summary("value_net/reward_targets", self.targets)
-     tf.histogram_summary("value_net/values", self.logits)
-
-     # Merge summaries from this network and the shared network (but not the policy net)
-     summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
-     self.summaries = tf.merge_summary([s for s in summary_ops if "value_net" in s.name or "shared" in s.name])
+     prefix = tf.get_variable_scope().name
+     tf.scalar_summary(self.loss.name, self.loss)
+     tf.scalar_summary("{}/max_value".format(prefix), tf.reduce_max(self.logits))
+     tf.scalar_summary("{}/min_value".format(prefix), tf.reduce_min(self.logits))
+     tf.scalar_summary("{}/mean_value".format(prefix), tf.reduce_mean(self.logits))
+     tf.scalar_summary("{}/reward_max".format(prefix), tf.reduce_max(self.targets))
+     tf.scalar_summary("{}/reward_min".format(prefix), tf.reduce_min(self.targets))
+     tf.scalar_summary("{}/reward_mean".format(prefix), tf.reduce_mean(self.targets))
+     tf.histogram_summary("{}/reward_targets".format(prefix), self.targets)
+     tf.histogram_summary("{}/values".format(prefix), self.logits)
+
+     if trainable:
+       self.optimizer = tf.train.AdamOptimizer(1e-4)
+       self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
+       self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]
+       self.train_op = self.optimizer.apply_gradients(self.grads_and_vars,
+         global_step=tf.contrib.framework.get_global_step())
+
+   var_scope_name = tf.get_variable_scope().name
+   summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
+   sumaries = [s for s in summary_ops if "policy_net" in s.name or "shared" in s.name]
+   sumaries = [s for s in summary_ops if var_scope_name in s.name]
+   self.summaries = tf.merge_summary(sumaries)
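
Both estimators now build their train op the same way: replace tf.contrib.layers.optimize_loss with an explicit compute_gradients call, drop variables that receive no gradient, and apply the rest. A minimal standalone sketch of that pattern follows, written against the same TF 0.x/1.x-era API the diff uses; the toy variables and loss are assumptions for illustration only.

import tensorflow as tf

x = tf.get_variable("x", shape=[], initializer=tf.constant_initializer(3.0))
unused = tf.get_variable("unused", shape=[])  # has no path to the loss
loss = tf.square(x, name="loss")

optimizer = tf.train.AdamOptimizer(1e-4)
grads_and_vars = optimizer.compute_gradients(loss)
# compute_gradients returns (None, var) for variables the loss does not depend
# on; the diff filters these pairs out before applying the updates.
grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
train_op = optimizer.apply_gradients(grads_and_vars)
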