|
4 | 4 | "cell_type": "markdown",
|
5 | 5 | "metadata": {},
|
6 | 6 | "source": [
|
7 |
| - "# Load data first" |
| 7 | + "# Load related packages" |
8 | 8 | ]
|
9 | 9 | },
|
10 | 10 | {
|
11 | 11 | "cell_type": "code",
|
12 |
| - "execution_count": 1, |
| 12 | + "execution_count": 12, |
13 | 13 | "metadata": {},
|
14 |
| - "outputs": [ |
15 |
| - { |
16 |
| - "name": "stderr", |
17 |
| - "output_type": "stream", |
18 |
| - "text": [ |
19 |
| - "Using TensorFlow backend.\n" |
20 |
| - ] |
21 |
| - } |
22 |
| - ], |
| 14 | + "outputs": [], |
23 | 15 | "source": [
|
24 | 16 | "import cPickle\n",
|
25 | 17 | "import numpy as np\n",
|
|
33 | 25 | ]
|
34 | 26 | },
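Most of this import cell is elided in the diff. Judging only from the symbols used in the later cells (Sequential, Dense, Dropout, to_categorical, train_test_split, keras.optimizers), the cell plausibly needs imports along these lines; this is a hedged reconstruction for orientation, not the literal cell contents:

```python
import cPickle                              # Python 2 pickle reader used by the loading function
import numpy as np
import tensorflow as tf                     # for tf.keras.utils.to_categorical in the data cell
import keras                                # keras.optimizers.adam() in the compile cell
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
```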
|
35 | 27 | {
|
36 |
| - "cell_type": "code", |
37 |
| - "execution_count": 2, |
38 |
| - "metadata": { |
39 |
| - "collapsed": true, |
40 |
| - "scrolled": true |
41 |
| - }, |
42 |
| - "outputs": [], |
| 28 | + "cell_type": "markdown", |
| 29 | + "metadata": {}, |
43 | 30 | "source": [
|
44 |
| - "test_ratio = 0.1\n", |
45 |
| - "number_class = 2\n", |
46 |
| - "number_features = 16306\n", |
47 |
| - "batch_size = 1024\n", |
48 |
| - "epochs = 50" |
| 31 | + "# Function related to loading data" |
49 | 32 | ]
|
50 | 33 | },
|
51 | 34 | {
|
52 | 35 | "cell_type": "code",
|
53 |
| - "execution_count": 3, |
54 |
| - "metadata": { |
55 |
| - "collapsed": true |
56 |
| - }, |
| 36 | + "execution_count": 13, |
| 37 | + "metadata": {}, |
57 | 38 | "outputs": [],
|
58 | 39 | "source": [
|
59 | 40 | "def Pfam_from_pickle_file_encoding(name_list_pickle_filename,model_names_list_filename):\n",
|
|
76 | 57 | "\treturn encoding"
|
77 | 58 | ]
|
78 | 59 | },
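The body of `Pfam_from_pickle_file_encoding` is elided in this diff. For readers following along, here is a minimal sketch of what a pickle-backed multi-hot encoder with this signature could look like; the pickle layout, the model-names file format, and the encoding scheme are assumptions, not the notebook's actual code:

```python
import cPickle  # Python 2; use pickle on Python 3
import numpy as np

def pfam_from_pickle_sketch(name_list_pickle_filename, model_names_list_filename):
    # Assumed layout: the pickle holds one list of Pfam model names per sequence,
    # and the text file lists all Pfam model names, one per line.
    with open(name_list_pickle_filename, 'rb') as f:
        hits_per_sequence = cPickle.load(f)
    with open(model_names_list_filename) as f:
        model_names = [line.strip() for line in f if line.strip()]
    index = {name: i for i, name in enumerate(model_names)}

    # Multi-hot encoding: one row per sequence, one column per Pfam model.
    encoding = np.zeros((len(hits_per_sequence), len(model_names)), dtype=np.float32)
    for row, hits in enumerate(hits_per_sequence):
        for name in hits:
            if name in index:
                encoding[row, index[name]] = 1.0
    return encoding
```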
|
| 60 | + { |
| 61 | + "cell_type": "markdown", |
| 62 | + "metadata": {}, |
| 63 | + "source": [ |
| 64 | + "# Load the data" |
| 65 | + ] |
| 66 | + }, |
79 | 67 | {
|
80 | 68 | "cell_type": "code",
|
81 |
| - "execution_count": 4, |
| 69 | + "execution_count": 14, |
82 | 70 | "metadata": {},
|
83 | 71 | "outputs": [
|
84 | 72 | {
|
|
106 | 94 | "label = tf.keras.utils.to_categorical(label,num_classes=2)"
|
107 | 95 | ]
|
108 | 96 | },
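`tf.keras.utils.to_categorical` turns the integer class labels into one-hot rows, which is what the `categorical_crossentropy` loss used later expects. A quick illustration:

```python
import tensorflow as tf

labels = [0, 1, 1, 0]
print(tf.keras.utils.to_categorical(labels, num_classes=2))
# [[1. 0.]
#  [0. 1.]
#  [0. 1.]
#  [1. 0.]]
```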
|
| 97 | + { |
| 98 | + "cell_type": "markdown", |
| 99 | + "metadata": {}, |
| 100 | + "source": [ |
| 101 | + "# Define hyper-parameters" |
| 102 | + ] |
| 103 | + }, |
109 | 104 | {
|
110 | 105 | "cell_type": "code",
|
111 |
| - "execution_count": 5, |
| 106 | + "execution_count": 15, |
112 | 107 | "metadata": {
|
113 |
| - "collapsed": true |
| 108 | + "scrolled": true |
114 | 109 | },
|
115 | 110 | "outputs": [],
|
| 111 | + "source": [ |
| 112 | + "test_ratio = 0.1 # how much data for training and how much data for testing\n", |
| 113 | + "number_class = 2 # total number of classes, useful for define network structure\n", |
| 114 | + "number_features = 16306 # total number of feature, useful for define network structure\n", |
| 115 | + "batch_size = 1024 # stochastic gradient descent, training batch size\n", |
| 116 | + "epochs = 5 # training epoches" |
| 117 | + ] |
| 118 | + }, |
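With these settings, each epoch makes one pass over the 39,902 training samples (reported by `model.fit` below) in mini-batches of 1,024, so there are ceil(39902 / 1024) = 39 gradient updates per epoch and 195 over the 5 epochs. A quick check:

```python
import math

train_samples = 39902   # as reported by model.fit below
batch_size = 1024
epochs = 5

updates_per_epoch = int(math.ceil(train_samples / float(batch_size)))
print(updates_per_epoch)            # 39
print(updates_per_epoch * epochs)   # 195
```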
| 119 | + { |
| 120 | + "cell_type": "markdown", |
| 121 | + "metadata": {}, |
| 122 | + "source": [ |
| 123 | + "# Splite training data and testing data" |
| 124 | + ] |
| 125 | + }, |
| 126 | + { |
| 127 | + "cell_type": "code", |
| 128 | + "execution_count": 16, |
| 129 | + "metadata": {}, |
| 130 | + "outputs": [], |
116 | 131 | "source": [
|
117 | 132 | "x_train, x_test, y_train, y_test = train_test_split(\n",
|
118 | 133 | " feature, label, test_size=test_ratio, random_state=0)"
|
119 | 134 | ]
|
120 | 135 | },
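`train_test_split` with `random_state=0` makes the split reproducible. If the two classes are imbalanced, a stratified split keeps the class ratio identical in both sets; a sketch of that variant (not what the notebook does, since `label` is already one-hot at this point we stratify on its argmax):

```python
import numpy as np
from sklearn.model_selection import train_test_split

class_index = np.argmax(label, axis=1)   # recover the integer class per row

x_train, x_test, y_train, y_test = train_test_split(
    feature, label,
    test_size=test_ratio,
    random_state=0,
    stratify=class_index)                # preserve class proportions in both splits
```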
|
| 136 | + { |
| 137 | + "cell_type": "markdown", |
| 138 | + "metadata": {}, |
| 139 | + "source": [ |
| 140 | + "# Build the network" |
| 141 | + ] |
| 142 | + }, |
121 | 143 | {
|
122 | 144 | "cell_type": "code",
|
123 |
| - "execution_count": null, |
| 145 | + "execution_count": 17, |
124 | 146 | "metadata": {},
|
125 | 147 | "outputs": [
|
126 | 148 | {
|
|
130 | 152 | "_________________________________________________________________\n",
|
131 | 153 | "Layer (type) Output Shape Param # \n",
|
132 | 154 | "=================================================================\n",
|
133 |
| - "dense_1 (Dense) (None, 1024) 16698368 \n", |
| 155 | + "dense_7 (Dense) (None, 1024) 16698368 \n", |
134 | 156 | "_________________________________________________________________\n",
|
135 |
| - "dropout_1 (Dropout) (None, 1024) 0 \n", |
| 157 | + "dropout_5 (Dropout) (None, 1024) 0 \n", |
136 | 158 | "_________________________________________________________________\n",
|
137 |
| - "dense_2 (Dense) (None, 1024) 1049600 \n", |
| 159 | + "dense_8 (Dense) (None, 1024) 1049600 \n", |
138 | 160 | "_________________________________________________________________\n",
|
139 |
| - "dropout_2 (Dropout) (None, 1024) 0 \n", |
| 161 | + "dropout_6 (Dropout) (None, 1024) 0 \n", |
140 | 162 | "_________________________________________________________________\n",
|
141 |
| - "dense_3 (Dense) (None, 2) 2050 \n", |
| 163 | + "dense_9 (Dense) (None, 2) 2050 \n", |
142 | 164 | "=================================================================\n",
|
143 | 165 | "Total params: 17,750,018\n",
|
144 | 166 | "Trainable params: 17,750,018\n",
|
145 | 167 | "Non-trainable params: 0\n",
|
146 |
| - "_________________________________________________________________\n", |
147 |
| - "Train on 39902 samples, validate on 4434 samples\n", |
148 |
| - "Epoch 1/50\n", |
149 |
| - "39902/39902 [==============================] - 9s 232us/step - loss: 0.4076 - acc: 0.8252 - val_loss: 0.2237 - val_acc: 0.9213\n", |
150 |
| - "Epoch 2/50\n", |
151 |
| - "39902/39902 [==============================] - 8s 191us/step - loss: 0.1455 - acc: 0.9540 - val_loss: 0.1822 - val_acc: 0.9443\n", |
152 |
| - "Epoch 3/50\n", |
153 |
| - "39902/39902 [==============================] - 7s 185us/step - loss: 0.1090 - acc: 0.9658 - val_loss: 0.1856 - val_acc: 0.9436\n", |
154 |
| - "Epoch 4/50\n", |
155 |
| - "39902/39902 [==============================] - 7s 166us/step - loss: 0.0979 - acc: 0.9679 - val_loss: 0.1887 - val_acc: 0.9441\n", |
156 |
| - "Epoch 5/50\n", |
157 |
| - "39902/39902 [==============================] - 6s 162us/step - loss: 0.0912 - acc: 0.9693 - val_loss: 0.1952 - val_acc: 0.9452\n", |
158 |
| - "Epoch 6/50\n", |
159 |
| - "39902/39902 [==============================] - 7s 175us/step - loss: 0.0861 - acc: 0.9712 - val_loss: 0.2007 - val_acc: 0.9454\n", |
160 |
| - "Epoch 7/50\n", |
161 |
| - "39902/39902 [==============================] - 7s 178us/step - loss: 0.0845 - acc: 0.9715 - val_loss: 0.2042 - val_acc: 0.9425\n", |
162 |
| - "Epoch 8/50\n", |
163 |
| - "39902/39902 [==============================] - 7s 178us/step - loss: 0.0821 - acc: 0.9723 - val_loss: 0.2068 - val_acc: 0.9436\n", |
164 |
| - "Epoch 9/50\n", |
165 |
| - "39902/39902 [==============================] - 7s 164us/step - loss: 0.0810 - acc: 0.9721 - val_loss: 0.2098 - val_acc: 0.9429\n", |
166 |
| - "Epoch 10/50\n", |
167 |
| - "39902/39902 [==============================] - 6s 149us/step - loss: 0.0788 - acc: 0.9731 - val_loss: 0.2098 - val_acc: 0.9434\n", |
168 |
| - "Epoch 11/50\n", |
169 |
| - "39902/39902 [==============================] - 6s 156us/step - loss: 0.0782 - acc: 0.9733 - val_loss: 0.2208 - val_acc: 0.9416\n", |
170 |
| - "Epoch 12/50\n", |
171 |
| - "39902/39902 [==============================] - 7s 170us/step - loss: 0.0768 - acc: 0.9732 - val_loss: 0.2277 - val_acc: 0.9432\n", |
172 |
| - "Epoch 13/50\n", |
173 |
| - "39902/39902 [==============================] - 7s 174us/step - loss: 0.0765 - acc: 0.9738 - val_loss: 0.2270 - val_acc: 0.9418\n", |
174 |
| - "Epoch 14/50\n", |
175 |
| - "39902/39902 [==============================] - 7s 175us/step - loss: 0.0742 - acc: 0.9744 - val_loss: 0.2364 - val_acc: 0.9391\n", |
176 |
| - "Epoch 15/50\n", |
177 |
| - "16384/39902 [===========>..................] - ETA: 3s - loss: 0.0697 - acc: 0.9757" |
| 168 | + "_________________________________________________________________\n" |
178 | 169 | ]
|
179 | 170 | }
|
180 | 171 | ],
|
181 | 172 | "source": [
|
182 |
| - "model = Sequential()\n", |
183 |
| - "model.add(Dense(1024, activation='relu', input_shape=(number_features,)))\n", |
184 |
| - "model.add(Dropout(0.3))\n", |
185 |
| - "model.add(Dense(1024, activation='relu'))\n", |
186 |
| - "model.add(Dropout(0.3))\n", |
187 |
| - "model.add(Dense(number_class, activation='softmax'))\n", |
188 |
| - "\n", |
189 |
| - "model.summary()\n", |
190 |
| - "\n", |
| 173 | + "model = Sequential() # linear stack of layers\n", |
| 174 | + "model.add(Dense(1024, activation='relu', input_shape=(number_features,))) # fully connected layer\n", |
| 175 | + "model.add(Dropout(0.3)) # dropout some nodes to avoid overfitting\n", |
| 176 | + "model.add(Dense(1024, activation='relu')) # fully conncted layer\n", |
| 177 | + "model.add(Dropout(0.3)) # dropout\n", |
| 178 | + "model.add(Dense(number_class, activation='softmax')) # final classification layer\n", |
| 179 | + "model.summary() # summarize the model structure and parameters" |
| 180 | + ] |
| 181 | + }, |
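The parameter counts in the summary follow from (inputs + 1 bias) x units for each Dense layer; the Dropout layers add none. A quick verification of the totals printed above:

```python
d1 = (16306 + 1) * 1024   # dense_7: 16,698,368
d2 = (1024 + 1) * 1024    # dense_8:  1,049,600
d3 = (1024 + 1) * 2       # dense_9:      2,050
print(d1, d2, d3, d1 + d2 + d3)   # total: 17,750,018
```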
| 182 | + { |
| 183 | + "cell_type": "markdown", |
| 184 | + "metadata": {}, |
| 185 | + "source": [ |
| 186 | + "# Define loss, optimizer (update rule), and metrics of monitoring the training process" |
| 187 | + ] |
| 188 | + }, |
| 189 | + { |
| 190 | + "cell_type": "code", |
| 191 | + "execution_count": 18, |
| 192 | + "metadata": {}, |
| 193 | + "outputs": [], |
| 194 | + "source": [ |
191 | 195 | "model.compile(loss='categorical_crossentropy',\n",
|
192 | 196 | " optimizer=keras.optimizers.adam(),\n",
|
193 |
| - " metrics=['accuracy'])\n", |
194 |
| - "\n", |
| 197 | + " metrics=['accuracy'])" |
| 198 | + ] |
| 199 | + }, |
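The earlier 50-epoch run showed validation loss rising after the first few epochs while training loss kept falling, which is why the epoch count was cut to 5. An alternative is to keep a larger epoch budget and stop automatically once validation loss stops improving; a minimal sketch using Keras' EarlyStopping callback (not part of this notebook):

```python
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss',          # watch validation loss
                           patience=3,                   # allow 3 epochs without improvement
                           restore_best_weights=True)    # needs Keras >= 2.2.3

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=50,
                    verbose=1,
                    validation_data=(x_test, y_test),
                    callbacks=[early_stop])
```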
| 200 | + { |
| 201 | + "cell_type": "markdown", |
| 202 | + "metadata": {}, |
| 203 | + "source": [ |
| 204 | + "# Run the training loop" |
| 205 | + ] |
| 206 | + }, |
| 207 | + { |
| 208 | + "cell_type": "code", |
| 209 | + "execution_count": 19, |
| 210 | + "metadata": {}, |
| 211 | + "outputs": [ |
| 212 | + { |
| 213 | + "name": "stdout", |
| 214 | + "output_type": "stream", |
| 215 | + "text": [ |
| 216 | + "Train on 39902 samples, validate on 4434 samples\n", |
| 217 | + "Epoch 1/5\n", |
| 218 | + "39902/39902 [==============================] - 6s 144us/step - loss: 0.3966 - acc: 0.8318 - val_loss: 0.2212 - val_acc: 0.9283\n", |
| 219 | + "Epoch 2/5\n", |
| 220 | + "39902/39902 [==============================] - 6s 142us/step - loss: 0.1457 - acc: 0.9545 - val_loss: 0.1830 - val_acc: 0.9436\n", |
| 221 | + "Epoch 3/5\n", |
| 222 | + "39902/39902 [==============================] - 6s 140us/step - loss: 0.1083 - acc: 0.9663 - val_loss: 0.1853 - val_acc: 0.9452\n", |
| 223 | + "Epoch 4/5\n", |
| 224 | + "39902/39902 [==============================] - 5s 132us/step - loss: 0.0986 - acc: 0.9670 - val_loss: 0.1906 - val_acc: 0.9447\n", |
| 225 | + "Epoch 5/5\n", |
| 226 | + "39902/39902 [==============================] - 5s 120us/step - loss: 0.0916 - acc: 0.9697 - val_loss: 0.1960 - val_acc: 0.9454\n" |
| 227 | + ] |
| 228 | + } |
| 229 | + ], |
| 230 | + "source": [ |
195 | 231 | "history = model.fit(x_train, y_train,\n",
|
196 | 232 | " batch_size=batch_size,\n",
|
197 | 233 | " epochs=epochs,\n",
|
198 | 234 | " verbose=1,\n",
|
199 |
| - " validation_data=(x_test, y_test))\n", |
200 |
| - "score = model.evaluate(x_test, y_test, verbose=0)\n", |
201 |
| - "print('Test loss:', score[0])\n", |
202 |
| - "print('Test accuracy:', score[1])" |
| 235 | + " validation_data=(x_test, y_test))" |
| 236 | + ] |
| 237 | + }, |
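`model.fit` returns a History object whose `history` dict holds the per-epoch metrics, so the curves above can be inspected or plotted after training; a small sketch (matplotlib assumed to be available):

```python
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.xlabel('epoch')
plt.ylabel('categorical cross-entropy')
plt.legend()
plt.show()
```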
| 238 | + { |
| 239 | + "cell_type": "markdown", |
| 240 | + "metadata": {}, |
| 241 | + "source": [ |
| 242 | + "# Evaluate the trained model" |
203 | 243 | ]
|
204 | 244 | },
|
205 | 245 | {
|
206 | 246 | "cell_type": "code",
|
207 |
| - "execution_count": null, |
208 |
| - "metadata": { |
209 |
| - "collapsed": true |
210 |
| - }, |
211 |
| - "outputs": [], |
212 |
| - "source": [] |
| 247 | + "execution_count": 21, |
| 248 | + "metadata": {}, |
| 249 | + "outputs": [ |
| 250 | + { |
| 251 | + "name": "stdout", |
| 252 | + "output_type": "stream", |
| 253 | + "text": [ |
| 254 | + "('Test loss:', 0.1959872514036361)\n", |
| 255 | + "('Test accuracy:', 0.9454217409840241)\n" |
| 256 | + ] |
| 257 | + } |
| 258 | + ], |
| 259 | + "source": [ |
| 260 | + "score = model.evaluate(x_test, y_test, verbose=0)\n", |
| 261 | + "print('Test loss:', score[0])\n", |
| 262 | + "print('Test accuracy:', score[1])" |
| 263 | + ] |
213 | 264 | }
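Beyond the aggregate loss and accuracy, per-example class probabilities and a saved copy of the trained weights are often useful for downstream analysis; a short sketch (the file name is arbitrary, saving requires h5py):

```python
probs = model.predict(x_test, batch_size=batch_size)   # shape (n_test, 2), softmax outputs
pred_class = probs.argmax(axis=1)                       # predicted class index per example

model.save('pfam_dense_classifier.h5')                  # hypothetical file name, Keras HDF5 format
```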
|
214 | 265 | ],
|
215 | 266 | "metadata": {
|
|
221 | 272 | "language_info": {
|
222 | 273 | "codemirror_mode": {
|
223 | 274 | "name": "ipython",
|
224 |
| - "version": 3 |
| 275 | + "version": 2 |
225 | 276 | },
|
226 | 277 | "file_extension": ".py",
|
227 | 278 | "mimetype": "text/x-python",
|
228 | 279 | "name": "python",
|
229 | 280 | "nbconvert_exporter": "python",
|
230 |
| - "pygments_lexer": "ipython3", |
231 |
| - "version": "3.6.1" |
| 281 | + "pygments_lexer": "ipython2", |
| 282 | + "version": "2.7.13" |
232 | 283 | }
|
233 | 284 | },
|
234 | 285 | "nbformat": 4,
|
|