Implement batch multiprocessing

hlgirard · Apr 26, 2019 · e1b6397 · e1b6397
1 parent a6fa757
commit e1b6397
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 15 deletions.
diff --git a/src/crystal_processing/process_image_folder.py b/src/crystal_processing/process_image_folder.py
@@ -2,11 +2,12 @@
 os.environ['KMP_DUPLICATE_LIB_OK']='True' # Required to avoid OMP: Error #15
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Tensorflow logging level set to ALL (0, default), WARNING (1), ERROR (2) or NONE (3)
 
+import math
 import re
 import logging
+from joblib import Parallel, delayed
 
 import numpy as np
-from tqdm import tqdm
 import pandas as pd
 
 from tensorflow.keras.models import model_from_json
@@ -18,6 +19,7 @@
 
 def load_model(path):
     '''Loads model from path and get most recent associated weights'''
+    # TODO: Move this method to the models utils package
 
     model_name = path.split('/')[-1].split('.')[0]
 
@@ -90,27 +92,72 @@ def process_image(image_path, crop_box, model, save_overlay = False):
 
     return (date_taken, num_drops, num_clear, num_crystal)
 
+def process_image_batch(image_list, crop_box, model_path, save_overlay = False):
+    '''Process a batch of images and return a list of results
+
+    Parameters
+    ----------
+    image_list: list[string]
+        List of paths to the images to process
+    crop_box: (minRow, maxRow, minCol, maxCol)
+        Cropping box to select the region of interest (NOTE(review): select_rectangle documents (minRow, minCol, maxRow, maxCol) - confirm the order)
+    model_path: string
+        Path to the tensorflow model to load
+    save_overlay: bool, optional
+        Save an image with green / red overlays for drops containing crystals / empty to `image_path / overlay`
+
+    Returns
+    -------
+    list[(date_taken: datetime, num_drops: int, num_clear: int, num_crystal: int, image_name: string)]
+        List of extracted parameters for each of the images
+        Date from the EXIF data, number of drops, number of clear drops, number of drops containing crystals, name of the image
+    '''
+
+    # Instantiate the model from its path (loaded once per call, i.e. once per batch)
+    model = load_model(model_path)
+
+    # Process each image in turn and append its base file name to the result tuple
+    data = []
+    for image_path in image_list:
+        image_name = os.path.basename(image_path)
+        data.append(process_image(image_path, crop_box, model, save_overlay=save_overlay) + (image_name,))
+
+    return data
+
 def process_image_folder(directory, crop_box=None, show_plot=False, save_overlay=False):
 
     # List images in directory
-    image_list = [file for file in os.listdir(directory) if file.endswith('.JPG')]
-
-    # Load model
-    model = load_model("models/cnn-simple-model.json")
+    image_list = [os.path.join(directory, image_path) for image_path in os.listdir(directory) if image_path.endswith('.JPG')]
+
+    # Compute the number of batches necessary
+    num_images = len(image_list)
+    print(f"Number of images: {num_images}")
+    batch_size = max([1, num_images // (os.cpu_count()-1)])
+    print(f"Batch size: {batch_size}")
+    num_batches = int(math.ceil(num_images // batch_size))
+    print(f"Number of batches: {num_batches}")
+
+    # Define the model path
+    model_path = "models/cnn-simple-model.json"
 
     # Obtain crop box from user if not passed as argument
     if not crop_box:
-        first_image = open_grey_scale_image(os.path.join(directory, image_list[0]))
+        first_image = open_grey_scale_image(image_list[0])
         crop_box = select_rectangle(first_image)
 
+    # Process all images from directory in parallel
+    if num_batches == 0:
+        # Process serially
+        data = [process_image_batch(image_list[i*batch_size:min([(i+1)*batch_size, num_images])], crop_box, model_path, save_overlay)
+                for i in range(num_batches)]
+    else:
+        data = Parallel(n_jobs=-2, verbose=10)(delayed(process_image_batch)(image_list[i*batch_size:min([(i+1)*batch_size, num_images])], crop_box, model_path, save_overlay)
+                                   for i in range(num_batches))
 
-    # Process all images from directory in series
-    data = []
-    for image_name in tqdm(image_list):
-        data.append(process_image(os.path.join(directory, image_name), crop_box, model, save_overlay = save_overlay) + (image_name,)) 
+    flat_data = [item for sublist in data for item in sublist]
 
     # Make a dataframe from the data and save it to disk
-    df = pd.DataFrame(sorted(data, key = lambda x: x[0]), columns=["DateTime", "Num drops", "Num clear", "Num crystal", "Image Name"])
+    df = pd.DataFrame(sorted(flat_data, key = lambda x: x[0]), columns=["DateTime", "Num drops", "Num clear", "Num crystal", "Image Name"])
     df['RelTime'] = (df['DateTime'] - df['DateTime'][0]).dt.total_seconds()
     df.to_csv(os.path.join(directory, "crystalData.csv"))
 
@@ -122,4 +169,4 @@ def process_image_folder(directory, crop_box=None, show_plot=False, save_overlay
 if __name__ == "__main__":
     folder = "notebooks/example_data"
 
-    process_image_folder(folder, save_overlay = True)
+    process_image_folder(folder, save_overlay=False, show_plot=False)
diff --git a/src/data/utils.py b/src/data/utils.py
@@ -55,11 +55,9 @@ def select_rectangle(img):
 
     Returns
     -------
-    tuple 
+    tuple
         Rectangle coordinates following the numpy array convention (minRow, minCol, maxRow, maxCol)
     """
-
-    mpl.use('TkAgg')
 
     print('Select the region of interest then press Q/q to confirm selection and exit.')
-Original file line number
+Diff line change
@@ Expand Up / @@ -55,11 +55,9 @@ def select_rectangle(img): @@
         Returns
         -------
-        tuple
+        tuple
             Rectangle coordinates following the numpy array convention (minRow, minCol, maxRow, maxCol)
         """
-        mpl.use('TkAgg')
         print('Select the region of interest then press Q/q to confirm selection and exit.')
@@ Expand Down @@