10 changes: 10 additions & 0 deletions .gitignore
@@ -99,3 +99,13 @@ ENV/

# mypy
.mypy_cache/
/.idea
/checkpoint
/conda_venv
/output
/pretrained
/train_data_viz
/conda_venv.tf1.0
/checkpoint.sav
/checkpoints_worth_saving
/temp
49 changes: 49 additions & 0 deletions README.deckard.md
@@ -0,0 +1,49 @@
# How to train the model
* check out the code with my modifications
* create a Python 2.7 conda venv per environment.py2.7_tf1.14.yml
* copy our training dataset from s3://deckard-data-science-assets-us-east-2/curated_datasets/street_number_recognition
    * train.tfrecord and val.tfrecord
    * see s3://deckard-data-science-assets-us-east-2/curated_datasets/street_number_recognition/readme.txt for how these tfrecord files were prepared
* if you want to try a different trunk DNN, update the model_type/feat_layers/strides values in config.py
* source set_env.sh
    * this basically adds pylib/src to PYTHONPATH
* ./scripts/train.sh 0,1 24
    - 0,1 means use GPU 0 and GPU 1
    - 24 means each GPU handles 24 images per batch
* the artifacts of the run will be written to ./checkpoint
* open another terminal and run
```
python scripts/eval_fscore.py
```
This script periodically evaluates the recall/precision/f1_score of the latest
checkpoint in ./checkpoint and saves the results as a tf summary, which can be viewed in
TensorBoard alongside the other metrics saved by the main training process
(a conceptual sketch of this polling loop appears at the end of this section).

Note that it also saves an fscore.csv in ./checkpoint.
* To visualize the metrics with TensorBoard:
```tensorboard --logdir checkpoint```
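
As a reference, here is a conceptual sketch of the polling loop that eval_fscore.py implements. This is not the actual script: `evaluate` is a hypothetical stand-in for the real checkpoint evaluation; only the checkpoint polling and summary writing use standard TF1 APIs.
```
# Conceptual sketch only -- not the actual eval_fscore.py.
# `evaluate` is a hypothetical helper standing in for the real
# detection evaluation; the rest is standard TF1 API.
import time
import tensorflow as tf

def poll_and_log(ckpt_dir='checkpoint', interval_sec=300):
    writer = tf.summary.FileWriter(ckpt_dir)
    last_ckpt = None
    while True:
        ckpt = tf.train.latest_checkpoint(ckpt_dir)
        if ckpt and ckpt != last_ckpt:
            last_ckpt = ckpt
            precision, recall = evaluate(ckpt)  # hypothetical helper
            f1 = 2 * precision * recall / max(precision + recall, 1e-8)
            step = int(ckpt.split('-')[-1])  # checkpoints are model.ckpt-<step>
            summary = tf.Summary(value=[
                tf.Summary.Value(tag='eval/f1_score', simple_value=f1)])
            writer.add_summary(summary, global_step=step)
            writer.flush()
        time.sleep(interval_sec)
```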

# How to test the trained model on new images
* If you need to get the detected boxes as data
- first, run
```./scripts/test.sh ${GPU_ID} ${checkpoint_folder}/model.ckpt-xxx ${image_dir}```
The script will output the detected boxes as IC15 label files (one per image, see the sketch at the end of this section for the format) in
${checkpoint_folder}/test/model.ckpt-xxx/txt/*.txt

- then, optionally, you can use street_num_spotting/src/street_num_spotting/convert_pixellink_test_result_to_our_json_format.py
to convert the IC15 label files to a json file (in our own label format)

- then you can use street_num_spotting/notebook/recognition_result_reviewer.ipynb to review
the detection results. You need to update the following config values accordingly:
    - config.image_base_dir: set to image_dir
    - config.recognition_result_json_fpath: set to the json label file created in the step above

* If you just want to see the detected text boxes rendered on the original images:
```./scripts/test_any.sh ${GPU_ID} ${checkpoint_folder}/model.ckpt-xxx ${image_dir} ${output_dir}```
The script will write jpgs with bounding boxes rendered into output_dir
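
For reference, a minimal sketch of rendering one image's IC15 label file yourself, assuming each line has the form `x1,y1,x2,y2,x3,y3,x4,y4[,text]` (an illustration, not the repo's rendering code):
```
# Minimal sketch, not the repo's rendering code. Assumes IC15 lines of
# the form x1,y1,x2,y2,x3,y3,x4,y4[,text], one detected box per line.
import cv2
import numpy as np

def render_boxes(image_path, ic15_txt_path, out_path):
    image = cv2.imread(image_path)
    for line in open(ic15_txt_path):
        parts = line.strip().split(',')
        pts = np.array([int(v) for v in parts[:8]], np.int32).reshape(4, 2)
        cv2.polylines(image, [pts], True, (0, 255, 0), 2)  # closed green quad
    cv2.imwrite(out_path, image)
```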


# About the metrics saved in tf.summary
* pixel_link_loss = prediction_loss_on_clones + regularization_loss
* clone0/xxx_loss: the prediction loss component on clone 0
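
For orientation, a hedged sketch of how such a total typically decomposes in TF1/slim, using standard loss collections (not code from this repo):
```
# Illustrative only -- standard TF1 collections, not this repo's code.
import tensorflow as tf

prediction_losses = tf.get_collection(tf.GraphKeys.LOSSES)  # per-clone losses
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
pixel_link_loss = tf.add_n(prediction_losses) + tf.add_n(reg_losses)
```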
30 changes: 24 additions & 6 deletions config.py
@@ -8,6 +8,8 @@
import pixel_link
slim = tf.contrib.slim

# optimizer = 'Adam'
optimizer = 'Momentum'
#=====================================================================
#====================Pre-processing params START======================
# VGG mean parameters.
@@ -27,7 +29,7 @@
area_range = [0.1, 1]
flip = False
using_shorter_side_filtering=True
min_shorter_side = 10
min_shorter_side = 6
max_shorter_side = np.infty
#====================Pre-processing params END========================
#=====================================================================
@@ -39,7 +41,7 @@
#====================Post-processing params START=====================
decode_method = pixel_link.DECODE_METHOD_join
min_area = 300
min_height = 10
min_height = 12
#====================Post-processing params END=======================
#=====================================================================

@@ -58,12 +60,28 @@
#pixel_neighbour_type = pixel_link.PIXEL_NEIGHBOUR_TYPE_4


#model_type = pixel_link_symbol.MODEL_TYPE_vgg16
#feat_layers = ['conv2_2', 'conv3_3', 'conv4_3', 'conv5_3', 'fc7']
#strides = [2]
# model_type = pixel_link_symbol.MODEL_TYPE_vgg16
# feat_layers = ['conv2_2', 'conv3_3', 'conv4_3', 'conv5_3', 'fc7']
# strides = [2]

model_type = pixel_link_symbol.MODEL_TYPE_vgg16
# with 512x512 input,
# conv3_3 output is 128x128x256
# conv4_3 output is 64x64x512
# conv5_3 output is 32x32x512
# fc7 output is 32x32x1024
# all outputs are after ReLU
feat_layers = ['conv3_3', 'conv4_3', 'conv5_3', 'fc7']
strides = [4]
#
# model_type = pixel_link_symbol.MODEL_TYPE_mobilenetv2
# # with 512x512 input,
# # layer_4 output is 128x128x24
# # layer_7 output is 64x64x32
# # layer_14 output is 32x32x96
# # layer_19 output is 16x16x1280
# feat_layers = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
# strides = [4]
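
# Illustrative sanity check (not part of the original config): with a
# 512x512 input and strides = [4], the prediction maps are 512 / 4 = 128
# pixels on a side, matching the conv3_3 / layer_4 resolutions noted above.
# assert 512 % strides[0] == 0 and 512 / strides[0] == 128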

pixel_cls_weight_method = pixel_link.PIXEL_CLS_WEIGHT_bbox_balanced
bbox_border_width = 1
@@ -184,7 +202,7 @@ def print_ckpt(path):

if not print_ckpt(flags.train_dir):
print_ckpt(flags.checkpoint_path)

pprint(flags.__flags, stream=stream)

print('\n# =========================================================================== #', file=stream)
92 changes: 60 additions & 32 deletions datasets/dataset_factory.py
@@ -1,60 +1,88 @@
"""A factory-pattern class which returns classification image/label pairs."""
from datasets import dataset_utils


class DatasetConfig():
def __init__(self, file_pattern, split_sizes):
self.file_pattern = file_pattern
self.split_sizes = split_sizes



icdar2013 = DatasetConfig(
file_pattern = '*_%s.tfrecord',
split_sizes = {
'train': 229,
'test': 233
}
file_pattern='*_%s.tfrecord',
split_sizes={
'train': 229,
'test': 233
}
)
icdar2015 = DatasetConfig(
file_pattern = 'icdar2015_%s.tfrecord',
split_sizes = {
'train': 1000,
'test': 500
}
file_pattern='icdar2015_%s.tfrecord',
split_sizes={
'train': 1000,
'test': 500
}
)
td500 = DatasetConfig(
file_pattern = '*_%s.tfrecord',
split_sizes = {
'train': 300,
'test': 200
}
file_pattern='*_%s.tfrecord',
split_sizes={
'train': 300,
'test': 200
}
)
tr400 = DatasetConfig(
file_pattern = 'tr400_%s.tfrecord',
split_sizes = {
'train': 400
}
file_pattern='tr400_%s.tfrecord',
split_sizes={
'train': 400
}
)
scut = DatasetConfig(
file_pattern = 'scut_%s.tfrecord',
split_sizes = {
file_pattern='scut_%s.tfrecord',
split_sizes={
'train': 1715
}
)

synthtext = DatasetConfig(
file_pattern = '*.tfrecord',
# file_pattern = 'SynthText_*.tfrecord',
split_sizes = {
file_pattern='*.tfrecord',
# file_pattern = 'SynthText_*.tfrecord',
split_sizes={
'train': 858750
}
)

street_number = DatasetConfig(
file_pattern='%s.tfrecord',
split_sizes={
'train': 576,
'val': 144,
}
)

synthesized_149k_and_street_number_train = DatasetConfig(
file_pattern='*.tfrecord',
split_sizes={
'train': 149107 + 576,
}
)

synthesized_149k = DatasetConfig(
file_pattern='bbox_clipped_within_image_boundary.tfrecord',
split_sizes={
'train': 149107,
}
)


datasets_map = {
'icdar2013':icdar2013,
'icdar2015':icdar2015,
'scut':scut,
'td500':td500,
'tr400':tr400,
'synthtext':synthtext
'icdar2013': icdar2013,
'icdar2015': icdar2015,
'scut': scut,
'td500': td500,
'tr400': tr400,
'synthtext': synthtext,
'street_number': street_number,
'synthesized_149k_and_street_number_train': synthesized_149k_and_street_number_train,
'synthesized_149k': synthesized_149k,
}


@@ -76,4 +104,4 @@ def get_dataset(dataset_name, split_name, dataset_dir, reader=None):
dataset_config = datasets_map[dataset_name];
file_pattern = dataset_config.file_pattern
num_samples = dataset_config.split_sizes[split_name]
return dataset_utils.get_split(split_name, dataset_dir,file_pattern, num_samples, reader)
return dataset_utils.get_split(split_name, dataset_dir, file_pattern, num_samples, reader)
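
For reference, a hypothetical usage sketch of the factory with the new street_number entry (the dataset_dir path is a placeholder):
```
# Hypothetical usage sketch; the dataset_dir path is a placeholder.
from datasets import dataset_factory

dataset = dataset_factory.get_dataset(
    dataset_name='street_number',
    split_name='train',
    dataset_dir='/path/to/street_number_recognition')
# per the street_number DatasetConfig above, this split has 576 samples
```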
2 changes: 2 additions & 0 deletions datasets/dataset_utils.py
@@ -137,13 +137,15 @@ def get_list(obj, idx):
'image/encoded': bytes_feature(image_data)}))
return example


def get_split(split_name, dataset_dir, file_pattern, num_samples, reader=None):
dataset_dir = util.io.get_absolute_path(dataset_dir)

if util.str.contains(file_pattern, '%'):
file_pattern = util.io.join_path(dataset_dir, file_pattern % split_name)
else:
file_pattern = util.io.join_path(dataset_dir, file_pattern)

# Allowing None in the signature so that dataset_factory can use the default.
if reader is None:
reader = tf.TFRecordReader
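
To illustrate the file_pattern handling in get_split above (illustrative values only):
```
# Illustrative only: how get_split resolves file_pattern per split.
pattern = '%s.tfrecord'                 # street_number's pattern
print(pattern % 'train')                # contains '%': -> 'train.tfrecord'
pattern = 'bbox_clipped_within_image_boundary.tfrecord'
print(pattern)                          # no '%': used as-is for any split
```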
92 changes: 92 additions & 0 deletions datasets/ic15_like_data_to_tfrecords.py
@@ -0,0 +1,92 @@
# encoding=utf-8
import argparse
import os

import numpy as np
import tensorflow as tf
import util
from dataset_utils import int64_feature, float_feature, bytes_feature, convert_to_example
import config


def cvt_to_tfrecords(output_path, data_path, gt_path):
image_names = util.io.ls(data_path, '.jpg') # [0:10]
print "%d images found in %s" % (len(image_names), data_path)
with tf.python_io.TFRecordWriter(output_path) as tfrecord_writer:
for idx, image_name in enumerate(image_names):
oriented_bboxes = []
bboxes = []
labels = [] # a mask over labels_text indicating whether each label is 'ignored' or not
labels_text = []
path = util.io.join_path(data_path, image_name)
print "\tconverting image: %d/%d %s" % (idx, len(image_names), image_name)
image_data = tf.gfile.FastGFile(path, 'r').read()

image = util.img.imread(path, rgb=True)
shape = image.shape
h, w = shape[0:2]
h *= 1.0
w *= 1.0
image_name = util.str.split(image_name, '.')[0]
gt_name = 'gt_' + image_name + '.txt'
gt_filepath = util.io.join_path(gt_path, gt_name)
lines = util.io.read_lines(gt_filepath)

for line in lines:
line = util.str.remove_all(line, '\xef\xbb\xbf')
gt = util.str.split(line, ',')
oriented_box = [int(gt[i]) for i in range(8)]
oriented_box = np.asarray(oriented_box) / ([w, h] * 4)
# note that in the case of synthesized data,
# some bboxes may extend beyond the image boundary
oriented_box = np.clip(oriented_box, 0.0, 1.0)

oriented_bboxes.append(oriented_box)

xs = oriented_box.reshape(4, 2)[:, 0]
ys = oriented_box.reshape(4, 2)[:, 1]
xmin = xs.min()
xmax = xs.max()
ymin = ys.min()
ymax = ys.max()
bboxes.append([xmin, ymin, xmax, ymax])

# the label text might be wrong here, but it doesn't matter because it is not used in detection
labels_text.append(gt[-1])
ignored = util.str.contains(gt[-1], '###')
if ignored:
labels.append(config.ignore_label)
else:
labels.append(config.text_label)
example = convert_to_example(image_data, image_name, labels, labels_text, bboxes, oriented_bboxes, shape)
tfrecord_writer.write(example.SerializeToString())


def main(args):
output_dir = os.path.dirname(args.output_tfrecords_path)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
cvt_to_tfrecords(output_path=args.output_tfrecords_path, data_path=args.image_dir, gt_path=args.label_dir)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--image_dir", default='/home/victor/workspace/datasets/scene_text/street_number_recognition/train')
parser.add_argument("-l", "--label_dir", default='/home/victor/workspace/datasets/scene_text/street_number_recognition/train_ic15_format_label')
parser.add_argument("-t", "--output_tfrecords_path", default='/home/victor/workspace/datasets/scene_text/street_number_recognition/train.tfrecord')
args = parser.parse_args()
main(args)

# root_dir = util.io.get_absolute_path('~/workspace/datasets/scene_text/ICDAR2015/detection/')
# output_dir = util.io.get_absolute_path('~/workspace/pixel_link/tfrecord/icdar2015/')
# util.io.mkdir(output_dir)
# training_data_dir = util.io.join_path(root_dir, 'ch4_training_images')
# training_gt_dir = util.io.join_path(root_dir, 'ch4_training_localization_transcription_gt')
#
# test_data_dir = util.io.join_path(root_dir, 'ch4_test_images')
# test_gt_dir = util.io.join_path(root_dir, 'ch4_test_localization_transcription_gt')
# cvt_to_tfrecords(output_path=util.io.join_path(output_dir, 'icdar2015_test.tfrecord'), data_path=test_data_dir,
# gt_path=test_gt_dir)
#
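
A worked example of the box normalization and clipping in cvt_to_tfrecords, with illustrative numbers:
```
# Worked example of the normalization above; numbers are illustrative.
import numpy as np

w, h = 800.0, 600.0
# [w, h] * 4 is the list [w, h, w, h, w, h, w, h], so the four xs are
# divided by the image width and the four ys by the image height.
box = np.asarray([790, 10, 820, 10, 820, 40, 790, 40]) / ([w, h] * 4)
# x = 820 normalizes to 1.025, i.e. past the right edge, so clip:
box = np.clip(box, 0.0, 1.0)
```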

