10 changes: 10 additions & 0 deletions .gitignore
@@ -99,3 +99,13 @@ ENV/

# mypy
.mypy_cache/
/.idea
/checkpoint
/conda_venv
/output
/pretrained
/train_data_viz
/conda_venv.tf1.0
/checkpoint.sav
/checkpoints_worth_saving
/temp
49 changes: 49 additions & 0 deletions README.deckard.md
@@ -0,0 +1,49 @@
# How to train the model
* check out the code with my modifications
* create a Python 2.7 conda venv per environment.py2.7_tf1.14.yml
* copy our training dataset from s3://deckard-data-science-assets-us-east-2/curated_datasets/street_number_recognition
    * train.tfrecord and val.tfrecord
    * see s3://deckard-data-science-assets-us-east-2/curated_datasets/street_number_recognition/readme.txt for how these tfrecord files were prepared
* if you want to try a different trunk DNN, update the model_type/feat_layers/strides values in config.py
* source set_env.sh
    * this basically adds pylib/src to PYTHONPATH
* ./scripts/train.sh 0,1 24
    - 0,1 means use GPU 0 and GPU 1
    - 24 means each GPU handles 24 images per batch
* the artifacts of the run will be written to ./checkpoint
* open another terminal and run
```
python scripts/eval_fscore.py
```
This script periodically evaluates the recall/precision/f1_score of the latest
checkpoint in ./checkpoint and saves the results as a tf summary, which can be viewed in
TensorBoard alongside the other metrics saved by the main training process
(a conceptual sketch of this polling loop appears at the end of this section).

Note that it also saves an fscore.csv in ./checkpoint.
* To visualize the metrics with TensorBoard:
```tensorboard --logdir checkpoint```
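
As a reference, here is a conceptual sketch of the polling loop that eval_fscore.py implements. This is not the actual script: `evaluate` is a hypothetical stand-in for the real checkpoint evaluation; only the checkpoint polling and summary writing use standard TF1 APIs.
```
# Conceptual sketch only -- not the actual eval_fscore.py.
# `evaluate` is a hypothetical helper standing in for the real
# detection evaluation; the rest is standard TF1 API.
import time
import tensorflow as tf

def poll_and_log(ckpt_dir='checkpoint', interval_sec=300):
    writer = tf.summary.FileWriter(ckpt_dir)
    last_ckpt = None
    while True:
        ckpt = tf.train.latest_checkpoint(ckpt_dir)
        if ckpt and ckpt != last_ckpt:
            last_ckpt = ckpt
            precision, recall = evaluate(ckpt)  # hypothetical helper
            f1 = 2 * precision * recall / max(precision + recall, 1e-8)
            step = int(ckpt.split('-')[-1])  # checkpoints are model.ckpt-<step>
            summary = tf.Summary(value=[
                tf.Summary.Value(tag='eval/f1_score', simple_value=f1)])
            writer.add_summary(summary, global_step=step)
            writer.flush()
        time.sleep(interval_sec)
```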

# How to test the trained model on new images
* If you need to get the detected boxes as data
- first, run
```./scripts/test.sh ${GPU_ID} ${checkpoint_folder}/model.ckpt-xxx ${image_dir}```
The script will output the detected boxes as IC15 label files (one per image, see the sketch at the end of this section for the format) in
${checkpoint_folder}/test/model.ckpt-xxx/txt/*.txt

- then, optionally, you can use street_num_spotting/src/street_num_spotting/convert_pixellink_test_result_to_our_json_format.py
to convert the IC15 label files to a json file (in our own label format)

- then you can use street_num_spotting/notebook/recognition_result_reviewer.ipynb to review
the detection results. You need to update the following config values accordingly:
    - config.image_base_dir: set to image_dir
    - config.recognition_result_json_fpath: set to the json label file created in the step above

* If you just want to see the detected text boxes rendered on the original images:
```./scripts/test_any.sh ${GPU_ID} ${checkpoint_folder}/model.ckpt-xxx ${image_dir} ${output_dir}```
The script will write jpgs with bounding boxes rendered into output_dir
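
For reference, a minimal sketch of rendering one image's IC15 label file yourself, assuming each line has the form `x1,y1,x2,y2,x3,y3,x4,y4[,text]` (an illustration, not the repo's rendering code):
```
# Minimal sketch, not the repo's rendering code. Assumes IC15 lines of
# the form x1,y1,x2,y2,x3,y3,x4,y4[,text], one detected box per line.
import cv2
import numpy as np

def render_boxes(image_path, ic15_txt_path, out_path):
    image = cv2.imread(image_path)
    for line in open(ic15_txt_path):
        parts = line.strip().split(',')
        pts = np.array([int(v) for v in parts[:8]], np.int32).reshape(4, 2)
        cv2.polylines(image, [pts], True, (0, 255, 0), 2)  # closed green quad
    cv2.imwrite(out_path, image)
```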


# About the metrics saved in tf.summary
* pixel_link_loss = prediction_loss_on_clones + regularization_loss
* clone0/xxx_loss: the prediction loss component on clone 0
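
For orientation, a hedged sketch of how such a total typically decomposes in TF1/slim, using standard loss collections (not code from this repo):
```
# Illustrative only -- standard TF1 collections, not this repo's code.
import tensorflow as tf

prediction_losses = tf.get_collection(tf.GraphKeys.LOSSES)  # per-clone losses
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
pixel_link_loss = tf.add_n(prediction_losses) + tf.add_n(reg_losses)
```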
30 changes: 24 additions & 6 deletions config.py
@@ -8,6 +8,8 @@
import pixel_link
slim = tf.contrib.slim

# optimizer = 'Adam'
optimizer = 'Momentum'
#=====================================================================
#====================Pre-processing params START======================
# VGG mean parameters.
@@ -27,7 +29,7 @@
area_range = [0.1, 1]
flip = False
using_shorter_side_filtering=True
min_shorter_side = 10
min_shorter_side = 6
max_shorter_side = np.infty
#====================Pre-processing params END========================
#=====================================================================
@@ -39,7 +41,7 @@
#====================Post-processing params START=====================
decode_method = pixel_link.DECODE_METHOD_join
min_area = 300
min_height = 10
min_height = 12
#====================Post-processing params END=======================
#=====================================================================

@@ -58,12 +60,28 @@
#pixel_neighbour_type = pixel_link.PIXEL_NEIGHBOUR_TYPE_4


#model_type = pixel_link_symbol.MODEL_TYPE_vgg16
#feat_layers = ['conv2_2', 'conv3_3', 'conv4_3', 'conv5_3', 'fc7']
#strides = [2]
# model_type = pixel_link_symbol.MODEL_TYPE_vgg16
# feat_layers = ['conv2_2', 'conv3_3', 'conv4_3', 'conv5_3', 'fc7']
# strides = [2]

model_type = pixel_link_symbol.MODEL_TYPE_vgg16
# with 512x512 input,
# conv3_3 output is 128x128x256
# conv4_3 output is 64x64x512
# conv5_3 output is 32x32x512
# fc7 output is 32x32x1024
# all outputs are after ReLU
feat_layers = ['conv3_3', 'conv4_3', 'conv5_3', 'fc7']
strides = [4]
#
# model_type = pixel_link_symbol.MODEL_TYPE_mobilenetv2
# # with 512x512 input,
# # layer_4 output is 128x128x24
# # layer_7 output is 64x64x32
# # layer_14 output is 32x32x96
# # layer_19 output is 16x16x1280
# feat_layers = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
# strides = [4]
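
# Illustrative sanity check (not part of the original config): with a
# 512x512 input and strides = [4], the prediction maps are 512 / 4 = 128
# pixels on a side, matching the conv3_3 / layer_4 resolutions noted above.
# assert 512 % strides[0] == 0 and 512 / strides[0] == 128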

pixel_cls_weight_method = pixel_link.PIXEL_CLS_WEIGHT_bbox_balanced
bbox_border_width = 1
@@ -184,7 +202,7 @@ def print_ckpt(path):

if not print_ckpt(flags.train_dir):
print_ckpt(flags.checkpoint_path)

pprint(flags.__flags, stream=stream)

print('\n# =========================================================================== #', file=stream)
92 changes: 60 additions & 32 deletions datasets/dataset_factory.py
@@ -1,60 +1,88 @@
"""A factory-pattern class which returns classification image/label pairs."""
from datasets import dataset_utils


class DatasetConfig():
def __init__(self, file_pattern, split_sizes):
self.file_pattern = file_pattern
self.split_sizes = split_sizes



icdar2013 = DatasetConfig(
file_pattern = '*_%s.tfrecord',
split_sizes = {
'train': 229,
'test': 233
}
file_pattern='*_%s.tfrecord',
split_sizes={
'train': 229,
'test': 233
}
)
icdar2015 = DatasetConfig(
file_pattern = 'icdar2015_%s.tfrecord',
split_sizes = {
'train': 1000,
'test': 500
}
file_pattern='icdar2015_%s.tfrecord',
split_sizes={
'train': 1000,
'test': 500
}
)
td500 = DatasetConfig(
file_pattern = '*_%s.tfrecord',
split_sizes = {
'train': 300,
'test': 200
}
file_pattern='*_%s.tfrecord',
split_sizes={
'train': 300,
'test': 200
}
)
tr400 = DatasetConfig(
file_pattern = 'tr400_%s.tfrecord',
split_sizes = {
'train': 400
}
file_pattern='tr400_%s.tfrecord',
split_sizes={
'train': 400
}
)
scut = DatasetConfig(
file_pattern = 'scut_%s.tfrecord',
split_sizes = {
file_pattern='scut_%s.tfrecord',
split_sizes={
'train': 1715
}
)

synthtext = DatasetConfig(
file_pattern = '*.tfrecord',
# file_pattern = 'SynthText_*.tfrecord',
split_sizes = {
file_pattern='*.tfrecord',
# file_pattern = 'SynthText_*.tfrecord',
split_sizes={
'train': 858750
}
)

street_number = DatasetConfig(
file_pattern='%s.tfrecord',
split_sizes={
'train': 576,
'val': 144,
}
)

synthesized_149k_and_street_number_train = DatasetConfig(
file_pattern='*.tfrecord',
split_sizes={
'train': 149107 + 576,
}
)

synthesized_149k = DatasetConfig(
file_pattern='bbox_clipped_within_image_boundary.tfrecord',
split_sizes={
'train': 149107,
}
)


datasets_map = {
'icdar2013':icdar2013,
'icdar2015':icdar2015,
'scut':scut,
'td500':td500,
'tr400':tr400,
'synthtext':synthtext
'icdar2013': icdar2013,
'icdar2015': icdar2015,
'scut': scut,
'td500': td500,
'tr400': tr400,
'synthtext': synthtext,
'street_number': street_number,
'synthesized_149k_and_street_number_train': synthesized_149k_and_street_number_train,
'synthesized_149k': synthesized_149k,
}


@@ -76,4 +104,4 @@ def get_dataset(dataset_name, split_name, dataset_dir, reader=None):
dataset_config = datasets_map[dataset_name];
file_pattern = dataset_config.file_pattern
num_samples = dataset_config.split_sizes[split_name]
return dataset_utils.get_split(split_name, dataset_dir,file_pattern, num_samples, reader)
return dataset_utils.get_split(split_name, dataset_dir, file_pattern, num_samples, reader)
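
For reference, a hypothetical usage sketch of the factory with the new street_number entry (the dataset_dir path is a placeholder):
```
# Hypothetical usage sketch; the dataset_dir path is a placeholder.
from datasets import dataset_factory

dataset = dataset_factory.get_dataset(
    dataset_name='street_number',
    split_name='train',
    dataset_dir='/path/to/street_number_recognition')
# per the street_number DatasetConfig above, this split has 576 samples
```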
2 changes: 2 additions & 0 deletions datasets/dataset_utils.py
@@ -137,13 +137,15 @@ def get_list(obj, idx):
'image/encoded': bytes_feature(image_data)}))
return example


def get_split(split_name, dataset_dir, file_pattern, num_samples, reader=None):
dataset_dir = util.io.get_absolute_path(dataset_dir)

if util.str.contains(file_pattern, '%'):
file_pattern = util.io.join_path(dataset_dir, file_pattern % split_name)
else:
file_pattern = util.io.join_path(dataset_dir, file_pattern)

# Allowing None in the signature so that dataset_factory can use the default.
if reader is None:
reader = tf.TFRecordReader
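
To illustrate the file_pattern handling in get_split above (illustrative values only):
```
# Illustrative only: how get_split resolves file_pattern per split.
pattern = '%s.tfrecord'                 # street_number's pattern
print(pattern % 'train')                # contains '%': -> 'train.tfrecord'
pattern = 'bbox_clipped_within_image_boundary.tfrecord'
print(pattern)                          # no '%': used as-is for any split
```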
92 changes: 92 additions & 0 deletions datasets/ic15_like_data_to_tfrecords.py
@@ -0,0 +1,92 @@
# encoding=utf-8
import argparse
import os

import numpy as np
import tensorflow as tf
import util
from dataset_utils import int64_feature, float_feature, bytes_feature, convert_to_example
import config


def cvt_to_tfrecords(output_path, data_path, gt_path):
image_names = util.io.ls(data_path, '.jpg') # [0:10]
print "%d images found in %s" % (len(image_names), data_path)
with tf.python_io.TFRecordWriter(output_path) as tfrecord_writer:
for idx, image_name in enumerate(image_names):
oriented_bboxes = []
bboxes = []
labels = [] # a mask over labels_text indicating whether each label is 'ignored' or not
labels_text = []
path = util.io.join_path(data_path, image_name)
print "\tconverting image: %d/%d %s" % (idx, len(image_names), image_name)
image_data = tf.gfile.FastGFile(path, 'r').read()

image = util.img.imread(path, rgb=True)
shape = image.shape
h, w = shape[0:2]
h *= 1.0
w *= 1.0
image_name = util.str.split(image_name, '.')[0]
gt_name = 'gt_' + image_name + '.txt'
gt_filepath = util.io.join_path(gt_path, gt_name)
lines = util.io.read_lines(gt_filepath)

for line in lines:
line = util.str.remove_all(line, '\xef\xbb\xbf')
gt = util.str.split(line, ',')
oriented_box = [int(gt[i]) for i in range(8)]
oriented_box = np.asarray(oriented_box) / ([w, h] * 4)
# note that in the case of synthesized data,
# some bboxes may extend beyond the image boundary
oriented_box = np.clip(oriented_box, 0.0, 1.0)

oriented_bboxes.append(oriented_box)

xs = oriented_box.reshape(4, 2)[:, 0]
ys = oriented_box.reshape(4, 2)[:, 1]
xmin = xs.min()
xmax = xs.max()
ymin = ys.min()
ymax = ys.max()
bboxes.append([xmin, ymin, xmax, ymax])

# the label text might be wrong here, but it doesn't matter because it is not used in detection
labels_text.append(gt[-1])
ignored = util.str.contains(gt[-1], '###')
if ignored:
labels.append(config.ignore_label)
else:
labels.append(config.text_label)
example = convert_to_example(image_data, image_name, labels, labels_text, bboxes, oriented_bboxes, shape)
tfrecord_writer.write(example.SerializeToString())


def main(args):
output_dir = os.path.dirname(args.output_tfrecords_path)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
cvt_to_tfrecords(output_path=args.output_tfrecords_path, data_path=args.image_dir, gt_path=args.label_dir)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--image_dir", default='/home/victor/workspace/datasets/scene_text/street_number_recognition/train')
parser.add_argument("-l", "--label_dir", default='/home/victor/workspace/datasets/scene_text/street_number_recognition/train_ic15_format_label')
parser.add_argument("-t", "--output_tfrecords_path", default='/home/victor/workspace/datasets/scene_text/street_number_recognition/train.tfrecord')
args = parser.parse_args()
main(args)

# root_dir = util.io.get_absolute_path('~/workspace/datasets/scene_text/ICDAR2015/detection/')
# output_dir = util.io.get_absolute_path('~/workspace/pixel_link/tfrecord/icdar2015/')
# util.io.mkdir(output_dir)
# training_data_dir = util.io.join_path(root_dir, 'ch4_training_images')
# training_gt_dir = util.io.join_path(root_dir, 'ch4_training_localization_transcription_gt')
#
# test_data_dir = util.io.join_path(root_dir, 'ch4_test_images')
# test_gt_dir = util.io.join_path(root_dir, 'ch4_test_localization_transcription_gt')
# cvt_to_tfrecords(output_path=util.io.join_path(output_dir, 'icdar2015_test.tfrecord'), data_path=test_data_dir,
# gt_path=test_gt_dir)
#
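
A worked example of the box normalization and clipping in cvt_to_tfrecords, with illustrative numbers:
```
# Worked example of the normalization above; numbers are illustrative.
import numpy as np

w, h = 800.0, 600.0
# [w, h] * 4 is the list [w, h, w, h, w, h, w, h], so the four xs are
# divided by the image width and the four ys by the image height.
box = np.asarray([790, 10, 820, 10, 820, 40, 790, 40]) / ([w, h] * 4)
# x = 820 normalizes to 1.025, i.e. past the right edge, so clip:
box = np.clip(box, 0.0, 1.0)
```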

