
Commit f9063db

Update to add global norm
1 parent 2d37935 commit f9063db

File tree

3 files changed (+36 -9 lines)

controller.py (+17 -0)

@@ -187,6 +187,7 @@ def __init__(self, policy_session, num_layers, state_space,
                  exploration=0.8,
                  controller_cells=32,
                  embedding_dim=20,
+                 clip_norm=0.0,
                  restore_controller=False):
         self.policy_session = policy_session  # type: tf.Session

@@ -200,6 +201,7 @@ def __init__(self, policy_session, num_layers, state_space,
         self.discount_factor = discount_factor
         self.exploration = exploration
         self.restore_controller = restore_controller
+        self.clip_norm = clip_norm

         self.reward_buffer = []
         self.state_buffer = []
@@ -372,7 +374,15 @@ def build_policy_network(self):
             tf.summary.scalar('total_loss', self.total_loss)

             self.gradients = self.optimizer.compute_gradients(self.total_loss)
+
             with tf.name_scope('policy_gradients'):
+                # normalize gradients so that they dont explode if argument passed
+                if self.clip_norm is not None and self.clip_norm != 0.0:
+                    norm = tf.constant(self.clip_norm, dtype=tf.float32)
+                    gradients, vars = zip(*self.gradients)  # unpack the two lists of gradients and the variables
+                    gradients, _ = tf.clip_by_global_norm(gradients, norm)  # clip by the norm
+                    self.gradients = list(zip(gradients, vars))  # we need to set values later, convert to list
+
                 # compute policy gradients
                 for i, (grad, var) in enumerate(self.gradients):
                     if grad is not None:
@@ -489,3 +499,10 @@ def train_step(self):
         self.exploration *= 0.99

         return loss
+
+    def remove_files(self):
+        files = ['train_history.csv', 'buffers.txt']
+
+        for file in files:
+            if os.path.exists(file):
+                os.remove(file)
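
For reference, a minimal standalone sketch of the global-norm clipping pattern this commit introduces, written against the TF 1.x graph-mode API the controller already uses. The loss, variable, and optimizer below are placeholders invented for illustration, not the controller's actual policy graph:

import tensorflow as tf

# Toy graph standing in for the policy network's loss (illustrative only).
x = tf.placeholder(tf.float32, shape=[None, 4], name='x')
w = tf.get_variable('w', shape=[4, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
grads_and_vars = optimizer.compute_gradients(loss)  # list of (gradient, variable) pairs

clip_norm = 1.0  # plays the same role as the new clip_norm constructor argument
if clip_norm is not None and clip_norm != 0.0:
    grads, variables = zip(*grads_and_vars)                        # split pairs into two tuples
    grads, global_norm = tf.clip_by_global_norm(grads, clip_norm)  # rescale all gradients jointly
    grads_and_vars = list(zip(grads, variables))                   # re-pair them for apply_gradients

train_op = optimizer.apply_gradients(grads_and_vars)

Unlike clipping each tensor separately, tf.clip_by_global_norm rescales every gradient by the same factor whenever their combined norm exceeds the threshold, so the direction of the policy update is preserved.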

manager.py (+8 -5)

@@ -9,7 +9,7 @@ class NetworkManager:
     '''
     Helper class to manage the generation of subnetwork training given a dataset
     '''
-    def __init__(self, dataset, epochs=5, child_batchsize=128, acc_beta=0.8, clip_rewards=False):
+    def __init__(self, dataset, epochs=5, child_batchsize=128, acc_beta=0.8, clip_rewards=0.0):
         '''
         Manager which is tasked with creating subnetworks, training them on a dataset, and retrieving
         rewards in the term of accuracy, which is passed to the controller RNN.
@@ -19,7 +19,7 @@ def __init__(self, dataset, epochs=5, child_batchsize=128, acc_beta=0.8, clip_re
             epochs: number of epochs to train the subnetworks
             child_batchsize: batchsize of training the subnetworks
             acc_beta: exponential weight for the accuracy
-            clip_rewards: whether to clip rewards in [-0.05, 0.05] range to prevent
+            clip_rewards: float - to clip rewards in [-range, range] to prevent
                 large weight updates. Use when training is highly unstable.
         '''
         self.dataset = dataset
@@ -89,9 +89,12 @@ def get_rewards(self, model_fn, actions):
                 reward = np.clip(reward, -0.05, 0.05)

             # update moving accuracy with bias correction for 1st update
-            self.moving_acc = self.beta * self.moving_acc + (1 - self.beta) * acc
-            self.moving_acc = self.moving_acc / (1 - self.beta_bias)
-            self.beta_bias = 0
+            if self.beta > 0.0 and self.beta < 1.0:
+                self.moving_acc = self.beta * self.moving_acc + (1 - self.beta) * acc
+                self.moving_acc = self.moving_acc / (1 - self.beta_bias)
+                self.beta_bias = 0
+
+                reward = np.clip(reward, -0.1, 0.1)

             print()
             print("Manager: EWA Accuracy = ", self.moving_acc)

train.py (+11 -4)

@@ -17,12 +17,14 @@
 NUM_LAYERS = 4  # number of layers of the state space
 MAX_TRIALS = 250  # maximum number of models generated

-MAX_EPOCHS = 10  # maximum number of epochs to train
+MAX_EPOCHS = 1  # maximum number of epochs to train
 CHILD_BATCHSIZE = 128  # batchsize of the child models
-EXPLORATION = 0.8  # high exploration for the first 1000 steps
+EXPLORATION = 0.9  # high exploration for the first 1000 steps
 REGULARIZATION = 1e-3  # regularization strength
 CONTROLLER_CELLS = 32  # number of cells in RNN controller
-CLIP_REWARDS = False  # clip rewards in the [-0.05, 0.05] range
+EMBEDDING_DIM = 20  # dimension of the embeddings for each state
+ACCURACY_BETA = 0.8  # beta value for the moving average of the accuracy
+CLIP_REWARDS = 0.0  # clip rewards in the [-0.05, 0.05] range
 RESTORE_CONTROLLER = True  # restore controller to continue training

 # construct a state space
@@ -53,17 +55,22 @@
                             reg_param=REGULARIZATION,
                             exploration=EXPLORATION,
                             controller_cells=CONTROLLER_CELLS,
+                            embedding_dim=EMBEDDING_DIM,
                             restore_controller=RESTORE_CONTROLLER)

 # create the Network Manager
-manager = NetworkManager(dataset, epochs=MAX_EPOCHS, child_batchsize=CHILD_BATCHSIZE, clip_rewards=CLIP_REWARDS)
+manager = NetworkManager(dataset, epochs=MAX_EPOCHS, child_batchsize=CHILD_BATCHSIZE, clip_rewards=CLIP_REWARDS,
+                         acc_beta=ACCURACY_BETA)

 # get an initial random state space if controller needs to predict an
 # action from the initial state
 state = state_space.get_random_state_space(NUM_LAYERS)
 print("Initial Random State : ", state_space.parse_state_space_list(state))
 print()

+# clear the previous files
+controller.remove_files()
+
 # train for number of trails
 for trial in range(MAX_TRIALS):
     with policy_sess.as_default():
