Skip to content

2.2 Tutorial ‐ Chaining Liquid Engine methods

Bruno Manuel Santos Saraiva edited this page Sep 26, 2024 · 1 revision

Chaining Liquid Engine methods

Liquid Engines methods can be chained together to achieve different levels of optimization granularity

A working example of how this can be performed can be seen in NanoPyx's implementation of eSRRF.

eSRRF case study

eSRRF is a method developed by our lab capable of generating super-resolution images from conventional microscopes by exploiting the radial fluctuations of fluorophores.
At its core it can be seen as a combination of several three granular methods:

Each of these methods is implemented as a Liquid Engine method with multiple implementations. Users can opt to call each of these methods individually and let the Liquid Engine select the best implementation for each of them:

# imports
import numpy as np
from tifffile import imread
from nanopyx.core.transform._le_interpolation_catmull_rom import ShiftAndMagnify
from nanopyx.core.transform._le_roberts_cross_gradients import GradientRobertsCross
from nanopyx.core.transform._le_radial_gradient_convergence import RadialGradientConvergence

# read the image to be processed
image = imread('path/to/image.tif').astype(np.float32)

# creating the liquid engine object instances
crsm = ShiftAndMagnify()
rbc = GradientRobertsCross()
rgc = RadialGradientConvergence()

# setup the used parameters
magnification = 5
sensitivity = 1
radius = 1.5
doIntensityWeighting = True

# run the eSRRF workflow
magnified_image = crsm.run(image, 0, 0, magnification, magnification)
gradient_col, gradient_row = rbc.run(image)
gradient_col_interp = crsm.run(gradient_col, 0, 0, magnification*2, magnification*2)
radient_row_interp = crsm.run(gradient_row, 0, 0, magnification*2, magnification*2)
radial_gradients = rgc.run(gradient_col_interp, gradient_row_interp, magnified_image, magnification=magnification, radius=radius, sensitivity=sensitivity, doIntensityWeighting=doIntensityWeighting)

Considering eSRRF is a method that generates several large interpolated images as intermediary steps, we noticed for the cases where the GPU is the fastest option that it was better to keep all steps in GPU memory instead of optimizing the selection on each step.
We have achieved this by allowing the eSRRF Liquid Engine implementation to select the run_type for all methods together. This is how it looks for the "threaded" run_type:

class eSRRF(LiquidEngine):
    """
    eSRRF using the NanoPyx Liquid Engine and running as a single task.
    """

    def __init__(self, clear_benchmarks=False, testing=False, verbose=True):
        # init code here

    def run(self, image, magnification: int = 5, radius: float = 1.5, sensitivity: float = 1, doIntensityWeighting: bool = True, run_type=None):
        # run code here

    def benchmark(self, image, magnification: int = 5, radius: float = 1.5, sensitivity: float = 1, doIntensityWeighting: bool = True):
        # benchmark code here

    def _run_threaded(self, image, magnification=5, radius=1.5, sensitivity=1, doIntensityWeighting=True):
        """
        @cpu
        @threaded
        @cython
        """
        runtype = "threaded".capitalize()
        crsm = ShiftAndMagnify(verbose=False)
        rbc = GradientRobertsCross(verbose=False)
        rgc = RadialGradientConvergence(verbose=False)
        
        magnified_image = crsm.run(image, 0, 0, magnification, magnification, run_type=runtype)
        gradient_col, gradient_row = rbc.run(image, run_type=runtype)
        gradient_col_interp = crsm.run(gradient_col, 0, 0, magnification*2, magnification*2, run_type=runtype)
        gradient_row_interp = crsm.run(gradient_row, 0, 0, magnification*2, magnification*2, run_type=runtype)
        radial_gradients = rgc.run(gradient_col_interp, gradient_row_interp, magnified_image, magnification=magnification, radius=radius, sensitivity=sensitivity, doIntensityWeighting=doIntensityWeighting, run_type=runtype)

        return radial_gradients

However, for the GPU case where we want to keep all intermediary steps in GPU memory buffers, instead of calling the more granular liquid engine methods, we have opted to chain the OpenCL kernels and treating it as a single method call.

class eSRRF(LiquidEngine):
    """
    eSRRF using the NanoPyx Liquid Engine and running as a single task.
    """

    def __init__(self, clear_benchmarks=False, testing=False, verbose=True):
        # init code here

    def run(self, image, magnification: int = 5, radius: float = 1.5, sensitivity: float = 1, doIntensityWeighting: bool = True, run_type=None):
        # run code here

    def benchmark(self, image, magnification: int = 5, radius: float = 1.5, sensitivity: float = 1, doIntensityWeighting: bool = True):
        # benchmark code here

    def _run_opencl(self, image, magnification=5, radius=1.5, sensitivity=1, doIntensityWeighting=True, device=None, mem_div=1):
        """
        @gpu
        """
        if device is None:
            device = _fastest_device

        # creating OpenCL context
        cl_ctx = cl.Context([device['device']])
        dc = device['device']
        cl_queue = cl.CommandQueue(cl_ctx)

        output_shape = (image.shape[0], int(image.shape[1]*magnification), int(image.shape[2]*magnification))

        # memory management calculations
        
        total_memory = (3*image[0, :, :].nbytes) + (2*np.zeros((1, output_shape[1], output_shape[2]), dtype=np.float32).nbytes) + (2*np.zeros((1, output_shape[1]*2, output_shape[2]*2), dtype=np.float32).nbytes)
        output_image = np.zeros(output_shape, dtype=np.float32)
        
        max_slices = int((dc.global_mem_size // total_memory)/mem_div)
        max_slices = self._check_max_slices(image, max_slices)

        # creating GPU memory buffers

        mf = cl.mem_flags
        input_cl = cl.Buffer(cl_ctx, mf.READ_ONLY, self._check_max_buffer_size(image[0:max_slices, :, :].nbytes, dc, max_slices))
        output_cl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, self._check_max_buffer_size(np.empty((max_slices, output_shape[1], output_shape[2]), dtype=np.float32).nbytes, dc, max_slices))
        magnified_cl = cl.Buffer(cl_ctx, mf.READ_WRITE, self._check_max_buffer_size(np.empty((max_slices, output_shape[1], output_shape[2]), dtype=np.float32).nbytes, dc, max_slices))
        col_gradients_cl = cl.Buffer(cl_ctx, mf.READ_WRITE, self._check_max_buffer_size(np.empty((max_slices, image.shape[1], image.shape[2]), dtype=np.float32).nbytes, dc, max_slices))
        row_gradients_cl = cl.Buffer(cl_ctx, mf.READ_WRITE, self._check_max_buffer_size(np.empty((max_slices, image.shape[1], image.shape[2]), dtype=np.float32).nbytes, dc, max_slices))
        col_magnified_gradients_cl = cl.Buffer(cl_ctx, mf.READ_WRITE, self._check_max_buffer_size(np.empty((max_slices, image.shape[1]*magnification*2, image.shape[2]*magnification*2), dtype=np.float32).nbytes, dc, max_slices))
        row_magnified_gradients_cl = cl.Buffer(cl_ctx, mf.READ_WRITE, self._check_max_buffer_size(np.empty((max_slices, image.shape[1]*magnification*2, image.shape[2]*magnification*2), dtype=np.float32).nbytes, dc, max_slices))
        cl.enqueue_copy(cl_queue, input_cl, image[0:max_slices,:,:]).wait()

        # setting up OpenCl kernels

        cr_code = self._get_cl_code("_le_interpolation_catmull_rom_.cl", device['DP'])
        cr_prg = cl.Program(cl_ctx, cr_code).build(options=["-cl-mad-enable -cl-fast-relaxed-math"])
        cr_knl = cr_prg.shiftAndMagnify

        rc_code = self._get_cl_code("_le_roberts_cross_gradients.cl", device['DP'])
        rc_prg = cl.Program(cl_ctx, rc_code).build(options=["-cl-mad-enable -cl-fast-relaxed-math"])
        rc_knl = rc_prg.gradient_roberts_cross

        rgc_code = self._get_cl_code("_le_radial_gradient_convergence.cl", device['DP'])
        rgc_prg = cl.Program(cl_ctx, rgc_code).build(options=["-cl-mad-enable -cl-fast-relaxed-math"])
        rgc_knl = rgc_prg.calculate_rgc

        # perform the calculations using the previously created buffers and kernels

        for i in range(0, image.shape[0], max_slices):
            if image.shape[0] - i >= max_slices:
                n_slices = max_slices
            else:
                n_slices = image.shape[0] - i

            cr_knl(cl_queue,
                (n_slices, int(image.shape[1]*magnification), int(image.shape[2]*magnification)),
                self.get_work_group(dc, (n_slices, image.shape[1]*magnification, image.shape[2]*magnification)), 
                input_cl,
                magnified_cl,
                np.float32(0),
                np.float32(0),
                np.float32(magnification),
                np.float32(magnification)).wait()

            rc_knl(cl_queue,
                (n_slices,),
                None,
                input_cl,
                col_gradients_cl,
                row_gradients_cl,
                np.int32(image.shape[1]),
                np.int32(image.shape[2])).wait()

            cr_knl(cl_queue,
                (n_slices, int(image.shape[1]*magnification*2), int(image.shape[2]*magnification*2)), 
                self.get_work_group(dc, (n_slices, image.shape[1]*magnification*2, image.shape[2]*magnification*2)), 
                col_gradients_cl,
                col_magnified_gradients_cl,
                np.float32(0),
                np.float32(0),
                np.float32(magnification*2),
                np.float32(magnification*2)).wait()

            cr_knl(cl_queue,
                (n_slices, int(image.shape[1]*magnification*2), int(image.shape[2]*magnification*2)), 
                self.get_work_group(dc, (n_slices, image.shape[1]*magnification*2, image.shape[2]*magnification*2)), 
                row_gradients_cl,
                row_magnified_gradients_cl,
                np.float32(0),
                np.float32(0),
                np.float32(magnification*2),
                np.float32(magnification*2)).wait()

            rgc_knl(cl_queue,
                (n_slices, (image.shape[1]*magnification-magnification*2) - magnification*2, image.shape[2]*magnification-magnification*2 - magnification*2),
                self.get_work_group(dc, (n_slices, (image.shape[1]*magnification-magnification*2) - magnification*2, image.shape[2]*magnification-magnification*2 - magnification*2)),
                col_magnified_gradients_cl,
                row_magnified_gradients_cl,
                magnified_cl,
                output_cl,
                np.int32(image.shape[2]*magnification),
                np.int32(image.shape[1]*magnification),
                np.int32(magnification),
                np.float32(2),
                np.float32(radius),
                np.float32(2 * (radius / 2.355) + 1),
                np.float32(2 * (radius / 2.355) * (radius / 2.355)),
                np.float32(sensitivity),
                np.int32(doIntensityWeighting)).wait()
            
            cl.enqueue_copy(cl_queue, output_image[i:i+n_slices,:,:], output_cl).wait()

            if i+n_slices<image.shape[0]:
                cl.enqueue_copy(cl_queue, input_cl, image[i+n_slices:i+2*n_slices,:,:]).wait()

            cl_queue.finish()


        return output_image

With this level of modularity, Liquid Engine enables users to achieve maximum performance by choosing where and how to dynamically change implementations.

Clone this wiki locally