Skip to content

Commit

Permalink
Implement batch multiprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
hlgirard committed Apr 26, 2019
1 parent a6fa757 commit e1b6397
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 15 deletions.
71 changes: 59 additions & 12 deletions src/crystal_processing/process_image_folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
os.environ['KMP_DUPLICATE_LIB_OK']='True' # Required to avoid OMP: Error #15
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Tensorflow logging level set to ALL (0, default), WARNING (1), ERROR (2) or NONE (3)

import math
import re
import logging
from joblib import Parallel, delayed

import numpy as np
from tqdm import tqdm
import pandas as pd

from tensorflow.keras.models import model_from_json
Expand All @@ -18,6 +19,7 @@

def load_model(path):
'''Loads model from path and get most recent associated weights'''
# TODO: Move this method to the models utils package

model_name = path.split('/')[-1].split('.')[0]

Expand Down Expand Up @@ -90,27 +92,72 @@ def process_image(image_path, crop_box, model, save_overlay = False):

return (date_taken, num_drops, num_clear, num_crystal)

def process_image_batch(image_list, crop_box, model_path, save_overlay = False):
    '''Process every image of a batch and collect the per-image results

    The model is loaded inside this function from `model_path` instead of
    being received as an object (presumably so that each parallel worker
    builds its own copy -- confirm against the caller).

    Parameters
    ----------
    image_list: list[string]
        Paths to the images to process
    crop_box: (minRow, maxRow, minCol, maxCol)
        Cropping box to select the region of interest
    model_path: string
        Path to the tensorflow model to load
    save_overlay: bool, optional
        Forwarded to `process_image`; save an image with green / red overlays
        for drops containing crystals / empty to `image_path / overlay`

    Returns
    -------
    list[(date_taken: datetime, num_drops: int, num_clear: int, num_crystal: int, image_name: string)]
        One tuple per image, in the order of `image_list`: the tuple returned
        by `process_image` with the base name of the image file appended
    '''

    # One model instance is shared by all images of the batch
    classifier = load_model(model_path)

    # Pair each path with its file name before the image itself is processed
    named_paths = ((path, os.path.basename(path)) for path in image_list)

    return [
        process_image(path, crop_box, classifier, save_overlay=save_overlay) + (name,)
        for path, name in named_paths
    ]

def process_image_folder(directory, crop_box=None, show_plot=False, save_overlay=False):

# List images in directory
image_list = [file for file in os.listdir(directory) if file.endswith('.JPG')]

# Load model
model = load_model("models/cnn-simple-model.json")
image_list = [os.path.join(directory, image_path) for image_path in os.listdir(directory) if image_path.endswith('.JPG')]

# Compute the number of batches necessary
num_images = len(image_list)
print(f"Number of images: {num_images}")
# Leave one core free, but always keep at least one worker:
# os.cpu_count() may return 1 (single core) or None (undetermined)
num_workers = max(1, (os.cpu_count() or 1) - 1)
batch_size = max(1, num_images // num_workers)
print(f"Batch size: {batch_size}")
# True division before the ceiling so the trailing partial batch is kept
# (floor division here would silently drop the last images)
num_batches = math.ceil(num_images / batch_size)
print(f"Number of batches: {num_batches}")

# Define the model path
model_path = "models/cnn-simple-model.json"

# Obtain crop box from user if not passed as argument
if not crop_box:
first_image = open_grey_scale_image(os.path.join(directory, image_list[0]))
first_image = open_grey_scale_image(image_list[0])
crop_box = select_rectangle(first_image)

# Process all images from directory in parallel
if num_batches == 0:
# Process serially
data = [process_image_batch(image_list[i*batch_size:min([(i+1)*batch_size, num_images])], crop_box, model_path, save_overlay)
for i in range(num_batches)]
else:
data = Parallel(n_jobs=-2, verbose=10)(delayed(process_image_batch)(image_list[i*batch_size:min([(i+1)*batch_size, num_images])], crop_box, model_path, save_overlay)
for i in range(num_batches))

# Process all images from directory in series
data = []
for image_name in tqdm(image_list):
data.append(process_image(os.path.join(directory, image_name), crop_box, model, save_overlay = save_overlay) + (image_name,))
flat_data = [item for sublist in data for item in sublist]

# Make a dataframe from the data and save it to disk
df = pd.DataFrame(sorted(data, key = lambda x: x[0]), columns=["DateTime", "Num drops", "Num clear", "Num crystal", "Image Name"])
df = pd.DataFrame(sorted(flat_data, key = lambda x: x[0]), columns=["DateTime", "Num drops", "Num clear", "Num crystal", "Image Name"])
df['RelTime'] = (df['DateTime'] - df['DateTime'][0]).dt.total_seconds()
df.to_csv(os.path.join(directory, "crystalData.csv"))

Expand All @@ -122,4 +169,4 @@ def process_image_folder(directory, crop_box=None, show_plot=False, save_overlay
if __name__ == "__main__":
folder = "notebooks/example_data"

process_image_folder(folder, save_overlay = True)
process_image_folder(folder, save_overlay=False, show_plot=False)
4 changes: 1 addition & 3 deletions src/data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,9 @@ def select_rectangle(img):
Returns
-------
tuple
tuple
Rectangle coordinates following the numpy array convention (minRow, minCol, maxRow, maxCol)
"""

mpl.use('TkAgg')

print('Select the region of interest then press Q/q to confirm selection and exit.')

Expand Down

0 comments on commit e1b6397

Please sign in to comment.