|
 ```
 (def data [[0 0 [0]] [0 1 [1]] [1 0 [1]] [1 1 [0]]])
 (def fit
-  (let [hidden-layers [3]
-        alpha 0.5
-        lambda 0.001]
-    (-> #(neural-network-fit % data)
-        (iterate (make-neural-network hidden-layers alpha lambda))
+  (let [alpha 0.5
+        lambda 0.001
+        model (-> (make-neural-network alpha lambda)
+                  (add-neural-network-layer 2 sigmoid)   ;; input layer
+                  (add-neural-network-layer 3 sigmoid)   ;; hidden layer
+                  (add-neural-network-layer 1 sigmoid))] ;; output layer
+    (-> (iterate #(neural-network-fit % data) model)
         (nth 5000))))
 (neural-network-predict fit (map butlast data))
 ;;=> [[0.04262340225834812] [0.9582632706756758] [0.9581124103456861] [0.04103544440312673]]
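
For reference, make-neural-network also takes an optional cost function (cross-entropy is the default; quadratic-cost is the other built-in). A hypothetical sketch of the same model built with the quadratic cost instead:

```
(def model
  (-> (make-neural-network 0.5 0.001 quadratic-cost)
      (add-neural-network-layer 2 sigmoid)
      (add-neural-network-layer 3 sigmoid)
      (add-neural-network-layer 1 sigmoid)))
```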
|
 (defn feed-forward
   "Returns the activation values for nodes in a neural network after forward
   propagating the values of a single input example x through the network."
-  [x theta]
-  (reduce (fn [activations weights]
+  [x theta fns]
+  (reduce (fn [activations [weights f]]
             (let [inputs (if (empty? activations) (m/matrix x) (last activations))
                   inputs+bias (m/join bias inputs)
-                  outputs (m/emap c/sigmoid (m/mmul weights inputs+bias))]
+                  outputs (m/emap f (m/mmul weights inputs+bias))]
               (conj activations outputs)))
           []
-          theta))
+          ;; fns holds one activation fn per layer, so skip the input layer's
+          ;; entry and pair each weight matrix with the fn of the layer it
+          ;; feeds into; this keeps the pairing consistent with the fns
+          ;; indexing used in back-propagate
+          (map vector theta (rest fns))))
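
To make the pairing concrete, here is a hand-run sketch of feed-forward (the weights are invented, each weight matrix carries its bias weights in the first column, and the file's bias constant is assumed to prepend a 1):

```
(let [theta [(m/matrix [[0.1 0.2 0.3]      ;; hidden node 1: bias, x1, x2
                        [0.4 0.5 0.6]])    ;; hidden node 2: bias, x1, x2
             (m/matrix [[0.7 0.8 0.9]])]   ;; output node: bias, h1, h2
      fns [identity c/sigmoid c/sigmoid]]  ;; one fn per layer; the input entry is never applied
  (feed-forward [0 1] theta fns))
;;=> two activation vectors: hidden-layer outputs, then output-layer outputs
```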
|
 (defn feed-forward-batch
   "Returns the activation values for nodes in a neural network after forward
   propagating a collection of input examples x through the network."
-  [x theta]
-  (-> (reduce (fn [inputs weights]
+  [x theta fns]
+  (-> (reduce (fn [inputs [weights f]]
                 (let [bias (m/broadcast 1.0 [1 (m/column-count inputs)])
                       inputs+bias (m/join bias inputs)
-                      outputs (m/emap c/sigmoid (m/mmul weights inputs+bias))]
+                      outputs (m/emap f (m/mmul weights inputs+bias))]
                   outputs))
               (m/transpose (m/matrix x))
-              theta)
+              ;; as in feed-forward, the input layer's fn is skipped
+              (map vector theta (rest fns)))
       (m/transpose)))
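
feed-forward-batch runs the same computation matrix-wise over a whole collection of examples; a sketch, with theta and fns as in the previous example:

```
(feed-forward-batch [[0 0] [0 1] [1 0] [1 1]] theta fns)
;;=> a matrix of output activations, one row per input example
```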
|
 (defn back-propagate
   "Returns the errors of each node in a neural network after propagating the
   errors at the output nodes, computed against a single target value y,
   backwards through the network."
-  [y theta activations output-error]
-  (->> (map vector (reverse (rest theta)) (reverse (butlast activations)))
-       (reduce (fn [errors [w a]]
-                 (cons (m/mul a (m/sub 1 a) (m/mmul (first errors) (drop-bias w)))
+  [y theta fns' activations output-error]
+  (->> (map vector
+            (reverse (rest theta))
+            (reverse (butlast activations))
+            ;; hidden-layer derivatives only: drop the input layer's entry and
+            ;; the output layer's, which output-error handles below
+            (reverse (rest (butlast fns'))))
+       (reduce (fn [errors [w a f]]
+                 (cons (m/mul (m/emap f a) (m/mmul (first errors) (drop-bias w)))
                       errors))
-               (list (output-error y (last activations))))
+               (list (output-error y (last activations) (last fns'))))
       (vec)))
|
 (defn compute-gradients
|
77 | 82 | "Returns the numeric approximations of the gradients for each weight given the |
78 | 83 | input values of a single example x and label y. Used for debugging by checking |
79 | 84 | against the computed gradients during backpropagation." |
80 | | - [x y theta cost] |
| 85 | + [x y theta fns cost] |
81 | 86 | (mapv (fn [k weights] |
82 | 87 | (m/matrix (for [i (range (m/row-count weights))] |
83 | 88 | (for [j (range (m/column-count weights))] |
84 | 89 | (let [w (m/select weights i j) |
85 | 90 | theta+ (assoc theta k (m/set-selection weights i j (+ w epsilon))) |
86 | 91 | theta- (assoc theta k (m/set-selection weights i j (- w epsilon)))] |
87 | | - (/ (- (cost (list x) (list y) theta+) |
88 | | - (cost (list x) (list y) theta-)) |
| 92 | + (/ (- (cost (list x) (list y) theta+ fns) |
| 93 | + (cost (list x) (list y) theta- fns)) |
89 | 94 | (* 2 epsilon))))))) |
90 | 95 | (range) |
91 | 96 | theta)) |
92 | 97 |
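
numeric-gradients approximates each partial derivative with the central difference (J(w + eps) - J(w - eps)) / 2 eps, perturbing one weight at a time. A sketch of invoking it directly, reusing the fitted model and a single XOR example from the README snippet:

```
(numeric-gradients [0 1] [1]
                   (:parameters fit)
                   (:activation-fns fit)
                   cross-entropy-cost)
;;=> one matrix of approximate gradients per weight matrix in theta
```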
|
 (defn gradient-descent-step
   "Performs a single gradient descent step on the input and target values of a
   single example x and label y, and returns the updated weights."
-  [x y theta alpha lambda cost output-error]
-  (let [activations (feed-forward x theta)
-        errors (back-propagate y theta activations output-error)
+  [x y theta fns alpha lambda cost output-error]
+  (let [activations (feed-forward x theta fns)
+        errors (back-propagate y theta (map c/derivative fns) activations output-error)
         gradients (compute-gradients x activations errors)
         regularization (map (fn [w]
                               (-> (m/mul alpha lambda w)
                                   (m/set-column 0 (m/matrix (repeat (m/row-count w) 0)))))
                             theta)]
     ;; Numeric gradient checking
-    ;;(println (map (comp #(/ (m/esum %) (m/ecount %)) m/abs m/sub) gradients (numeric-gradients x y theta cost)))
+    ;;(println (map (comp #(/ (m/esum %) (m/ecount %)) m/abs m/sub) gradients (numeric-gradients x y theta fns cost)))
     (mapv m/sub theta (map #(m/mul % alpha) gradients) regularization)))
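
Note the (map c/derivative fns) above: this assumes a c/derivative helper that returns each activation's derivative expressed in terms of the activation output a, since back-propagate applies it to stored activations rather than to raw pre-activation inputs. A hypothetical sketch of such a helper (not the library's actual implementation):

```
(defn derivative [f]
  (condp = f
    sigmoid (fn [a] (* a (- 1 a)))   ;; sigmoid'(z) = a(1 - a)
    tanh    (fn [a] (- 1 (* a a))))) ;; tanh'(z) = 1 - a^2
```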
|
 (defn gradient-descent
   "Performs gradient descent on input and target values of all examples x and
   y, and returns the updated weights."
   [model x y]
-  (let [{alpha :alpha lambda :lambda theta :parameters cost :cost output-error :output-error} model]
+  (let [{alpha :alpha lambda :lambda theta :parameters cost :cost
+         fns :activation-fns output-error :output-error} model]
     (loop [inputs x
            targets y
            weights theta]
|
        (gradient-descent-step (first inputs)
                               (first targets)
                               weights
+                              fns
                               alpha
                               lambda
                               cost
|
 ;; Cost functions
|
 (defn cross-entropy-cost
-  [x y theta]
-  (let [a (feed-forward-batch x theta)]
+  [x y theta fns]
+  (let [a (feed-forward-batch x theta fns)]
     (/ (m/esum (m/add (m/mul y (m/log a))
                       (m/mul (m/sub 1 y) (m/log (m/sub 1 a)))))
        (- (count x)))))
|
 (defn cross-entropy-output-error
-  [y activations]
+  [y activations f']
+  ;; cross-entropy error is independent of the derivative of the output activation
   (m/sub activations y))
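
The cancellation behind that comment: for the cross-entropy cost, dJ/da = (a - y) / (a(1 - a)), while a sigmoid output gives da/dz = a(1 - a); the two factors cancel, leaving the output error dJ/dz = a - y with no derivative term.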
|
 (defn quadratic-cost
-  [x y theta]
-  (/ (m/esum (m/square (m/sub (feed-forward-batch x theta) y)))
+  [x y theta fns]
+  (/ (m/esum (m/square (m/sub (feed-forward-batch x theta fns) y)))
      2))
|
 (defn quadratic-output-error
-  [y activations]
-  (m/mul (m/sub activations y) activations (m/sub 1 activations)))
+  [y activations f']
+  (m/mul (m/sub activations y) (m/emap f' activations)))
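
Unlike the cross-entropy case, the quadratic error keeps the derivative factor. A quick hand check, assuming a sigmoid output layer (so f' in terms of the activation is a(1 - a)):

```
(quadratic-output-error [1] [0.9] (fn [a] (* a (- 1 a))))
;;=> approximately [-0.009], since (0.9 - 1) * 0.9 * (1 - 0.9) = -0.009
```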
|
 ;; API
|
|
   ([model data]
    (neural-network-fit model (map (comp vec butlast) data) (map (comp vec last) data)))
   ([model x y]
-   (let [{hidden :hidden layers :layers theta :parameters} model
-         layers (or layers
-                    (concat [(count (first x))]   ;; number of input nodes
-                            hidden                ;; number of nodes at each hidden layer
-                            [(count (first y))])) ;; number of output nodes
+   (let [{layers :layers theta :parameters} model
          model (-> model
-                   (assoc :layers layers)
                    (assoc :parameters (or theta (init-parameters layers))))]
      (assoc model :parameters (gradient-descent model x y)))))
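
Because the learned weights ride along in the returned model map, and init-parameters only runs when :parameters is absent, calling fit again resumes from the current weights; a sketch using a model built as in the README snippet:

```
(-> model
    (neural-network-fit data)   ;; one pass of per-example gradient steps
    (neural-network-fit data))  ;; a second pass, starting from the learned weights
```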
|
 (defn neural-network-predict
   "Predicts the values of example data using a neural network model."
   [model x]
-  (let [{theta :parameters} model]
+  (let [{theta :parameters fns :activation-fns} model]
     (when (not (nil? theta))
-      (mapv vec (feed-forward-batch x theta)))))
+      (mapv vec (feed-forward-batch x theta fns)))))
|
 (defn neural-network-cost
   ([model data]
    (neural-network-cost model (map (comp vec butlast) data) (map (comp vec last) data)))
   ([model x y]
-   (let [{theta :parameters cost :cost} model]
+   (let [{theta :parameters fns :activation-fns cost :cost} model]
      (when (not (nil? theta))
-       (cost x y theta)))))
+       (cost x y theta fns)))))
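
A usage sketch, reusing fit and data from the README snippet; the result is the model's cost over all examples, which should shrink as training iterations increase:

```
(neural-network-cost fit data)
;;=> a small positive number for the trained XOR model
```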
|
 (defn print-neural-network
   "Prints information about a given neural network."
|
             (str (dec (count (first thetai))) " x " (count thetai))))))))
|
 (defn make-neural-network
-  "Returns a neural network model where alpha is the learning rate and hidden is
-   a sequence of numbers where the ith element is the number of nodes in the ith
-   hidden layer."
-  ([hidden alpha lambda]
-   (make-neural-network hidden alpha lambda cross-entropy-cost))
-  ([hidden alpha lambda cost]
+  "Returns a neural network model where alpha is the learning rate and lambda
+   is the regularization strength. Layers are added separately with
+   add-neural-network-layer."
+  ([alpha lambda]
+   (make-neural-network alpha lambda cross-entropy-cost))
+  ([alpha lambda cost]
    {:alpha alpha
     :lambda lambda
-    :hidden hidden
+    :layers []
+    :activation-fns []
     :cost cost
     :output-error (cond
                     (= cost cross-entropy-cost) cross-entropy-output-error
                     (= cost quadratic-cost) quadratic-output-error)}))
+
+(defn add-neural-network-layer
+  "Adds a layer with n nodes and activation function f to a neural network
+   model. Layers must be added in order: input first, then hidden layers,
+   then the output layer."
+  [model n f]
+  (-> model
+      (update :layers #(conj % n))
+      (update :activation-fns #(conj % f))))