diff --git a/sfft/CustomizedPacket.py b/sfft/CustomizedPacket.py
index e4cd3de..fcd011e 100644
--- a/sfft/CustomizedPacket.py
+++ b/sfft/CustomizedPacket.py
@@ -1,3 +1,7 @@
+import cupy as cp
+import tracemalloc
+import logging
+import sys
 import time
 import numpy as np
 import os.path as pa
@@ -13,7 +17,7 @@ class Customized_Packet:
     @staticmethod
     def CP(FITS_REF, FITS_SCI, FITS_mREF, FITS_mSCI, ForceConv, GKerHW, \
         FITS_DIFF=None, FITS_Solution=None, KerPolyOrder=2, BGPolyOrder=2, ConstPhotRatio=True, \
-        BACKEND_4SUBTRACT='Cupy', CUDA_DEVICE_4SUBTRACT='0', NUM_CPU_THREADS_4SUBTRACT=8, VERBOSE_LEVEL=2):
+        BACKEND_4SUBTRACT='Cupy', CUDA_DEVICE_4SUBTRACT='0', NUM_CPU_THREADS_4SUBTRACT=8, VERBOSE_LEVEL=2, logger=None):
         
         """
         * Parameters for Customized SFFT
@@ -85,11 +89,20 @@ def CP(FITS_REF, FITS_SCI, FITS_mREF, FITS_mSCI, ForceConv, GKerHW, \
 
         """
 
+        # Configure logger (Rob)
+        _logger = logging.getLogger(f'sfft.CustomizedPacket.CP')
+        if not _logger.hasHandlers():
+            log_out = logging.StreamHandler(sys.stderr)
+            formatter = logging.Formatter(f'[%(asctime)s - %(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+            log_out.setFormatter(formatter)
+            _logger.addHandler(log_out)
+            _logger.setLevel(logging.DEBUG) # ERROR, WARNING, INFO, or DEBUG (in that order by increasing detail)
+
         # * Read input images
-        PixA_REF = fits.getdata(FITS_REF, ext=0).T
-        PixA_SCI = fits.getdata(FITS_SCI, ext=0).T
-        PixA_mREF = fits.getdata(FITS_mREF, ext=0).T
-        PixA_mSCI = fits.getdata(FITS_mSCI, ext=0).T
+        PixA_REF = fits.getdata(FITS_REF, memmap=False, ext=0).T
+        PixA_SCI = fits.getdata(FITS_SCI, memmap=False, ext=0).T
+        PixA_mREF = fits.getdata(FITS_mREF, memmap=False, ext=0).T
+        PixA_mSCI = fits.getdata(FITS_mSCI, memmap=False, ext=0).T
 
         if not PixA_REF.flags['C_CONTIGUOUS']:
             PixA_REF = np.ascontiguousarray(PixA_REF, np.float64)
@@ -131,15 +144,29 @@ def CP(FITS_REF, FITS_SCI, FITS_mREF, FITS_mSCI, ForceConv, GKerHW, \
             print('MeLOn CheckPoint: TRIGGER Function Compilations of SFFT-SUBTRACTION!')
 
         Tcomp_start = time.time()
+
+        ### I ADDED DEBUG MEM TRACING STATEMENTS
+        # tracemalloc.start()
         SFFTConfig = SingleSFFTConfigure.SSC(NX=PixA_REF.shape[0], NY=PixA_REF.shape[1], KerHW=KerHW, \
             KerPolyOrder=KerPolyOrder, BGPolyOrder=BGPolyOrder, ConstPhotRatio=ConstPhotRatio, \
             BACKEND_4SUBTRACT=BACKEND_4SUBTRACT, NUM_CPU_THREADS_4SUBTRACT=NUM_CPU_THREADS_4SUBTRACT, \
             VERBOSE_LEVEL=VERBOSE_LEVEL)
 
+        # size, peak = tracemalloc.get_traced_memory()
+        # logger.debug(f'MEMORY IN CUSTOMIZEDPACKET FROM SFFTCONFIG size={size}, peak={peak}')
+        # tracemalloc.clear_traces()
+        # tracemalloc.stop()
+
         if VERBOSE_LEVEL in [1, 2]:
             _message = 'Function Compilations of SFFT-SUBTRACTION TAKES [%.3f s]' %(time.time() - Tcomp_start)
             print('\nMeLOn Report: %s' %_message)
 
+        mempool = cp.get_default_memory_pool()
+        pinned_mempool = cp.get_default_pinned_memory_pool()
+        # print('MEMPOOL USED BYTES', mempool.used_bytes())
+        # print('MEMPOOL TOTAL BYTES', mempool.total_bytes())
+        # print('PINNED MEMPOOL FREE BLOCKS', pinned_mempool.n_free_blocks())
+
         # * Perform SFFT Subtraction
         if ConvdSide == 'REF':
             PixA_mI, PixA_mJ = PixA_mREF, PixA_mSCI
@@ -160,11 +187,26 @@ def CP(FITS_REF, FITS_SCI, FITS_mREF, FITS_mSCI, ForceConv, GKerHW, \
         if VERBOSE_LEVEL in [0, 1, 2]:
             print('MeLOn CheckPoint: TRIGGER SFFT-SUBTRACTION!')
 
+        # print('MEMPOOL USED BYTES', mempool.used_bytes())
+        # print('MEMPOOL TOTAL BYTES', mempool.total_bytes())
+        # print('PINNED MEMPOOL FREE BLOCKS', pinned_mempool.n_free_blocks())
+
         Tsub_start = time.time()
+
+        # tracemalloc.start()
+
         _tmp = GeneralSFFTSubtract.GSS(PixA_I=PixA_I, PixA_J=PixA_J, PixA_mI=PixA_mI, PixA_mJ=PixA_mJ, \
             SFFTConfig=SFFTConfig, ContamMask_I=None, BACKEND_4SUBTRACT=BACKEND_4SUBTRACT, \
             NUM_CPU_THREADS_4SUBTRACT=NUM_CPU_THREADS_4SUBTRACT, VERBOSE_LEVEL=VERBOSE_LEVEL)
 
+        # print('MEMPOOL USED BYTES', mempool.used_bytes())
+        # print('MEMPOOL TOTAL BYTES', mempool.total_bytes())
+        # print('PINNED MEMPOOL FREE BLOCKS', pinned_mempool.n_free_blocks())
+
+        # size, peak = tracemalloc.get_traced_memory()
+        # logger.debug(f'MEMORY IN CUSTOMIZEDPACKET FROM GENERALSFFTSUBTRACT size={size}, peak={peak}')
+        # tracemalloc.clear_traces()
+
         Solution, PixA_DIFF = _tmp[:2]
         if VERBOSE_LEVEL in [1, 2]:
             _message = 'SFFT-SUBTRACTION TAKES [%.3f s]' %(time.time() - Tsub_start)
diff --git a/sfft/utils/ConvKernelConvertion.py b/sfft/utils/ConvKernelConvertion.py
index fe59b51..832ea83 100644
--- a/sfft/utils/ConvKernelConvertion.py
+++ b/sfft/utils/ConvKernelConvertion.py
@@ -1,4 +1,5 @@
 import numpy as np
+import cupy
 
 __author__ = "Lei Hu <leihu@andrew.cmu.edu>"
 __version__ = "v1.0"
@@ -17,16 +18,16 @@ def CSZ(ConvKernel, N0, N1):
         L0, L1 = ConvKernel.shape  
         w0, w1 = (L0-1) // 2, (L1-1) // 2      
         pd0, pd1 = N0 - L0, N1 - L1
-        TailZP = np.lib.pad(ConvKernel, ((0, pd0), (0, pd1)), 'constant', constant_values=(0, 0))    # Tail-Zero-Padding
-        KIMG_CSZ = np.roll(np.roll(TailZP, -w0, axis=0), -w1, axis=1)    # Circular-Shift
+        TailZP = cupy.pad(ConvKernel, ((0, pd0), (0, pd1)), 'constant', constant_values=(0, 0))    # Tail-Zero-Padding
+        KIMG_CSZ = cupy.roll(np.roll(TailZP, -w0, axis=0), -w1, axis=1)    # Circular-Shift
         return KIMG_CSZ
 
     def iCSZ(KIMG, L0, L1):
         N0, N1 = KIMG.shape
         w0, w1 = (L0-1) // 2, (L1-1) // 2
-        KIMG_iCSZ = np.roll(np.roll(KIMG, w1, axis=1), w0, axis=0)    # inverse Circular-Shift
+        KIMG_iCSZ = cupy.roll(cupy.roll(KIMG, w1, axis=1), w0, axis=0)    # inverse Circular-Shift
         ConvKernel = KIMG_iCSZ[:L0, :L1]    # Tail-Truncation 
-        lost_weight = 1.0 - np.sum(np.abs(ConvKernel)) / np.sum(np.abs(KIMG_iCSZ))
+        lost_weight = 1.0 - cupy.sum(cupy.abs(ConvKernel)) / cupy.sum(cupy.abs(KIMG_iCSZ))
         print('MeLOn CheckPoint: Tail-Truncation Lost-Weight [%.4f %s] (Absolute Percentage Error) ' %(lost_weight*100, '%'))
         return ConvKernel
 
diff --git a/sfft/utils/CudaResampling.py b/sfft/utils/CudaResampling.py
new file mode 100644
index 0000000..20a738b
--- /dev/null
+++ b/sfft/utils/CudaResampling.py
@@ -0,0 +1,410 @@
+import time
+import warnings
+import cupy as cp
+import numpy as np
+from math import floor
+from astropy.wcs import WCS, FITSFixedWarning
+from sfft.utils.CupyWCSTransform import Cupy_WCS_Transform
+
+__last_update__ = "2024-09-19"
+__author__ = "Lei Hu <leihu@andrew.cmu.edu>"
+
+class Cuda_Resampling:
+    def __init__(self, RESAMP_METHOD="BILINEAR", VERBOSE_LEVEL=2):
+        """
+        Image resampling using CUDA.
+        """
+        self.RESAMP_METHOD = RESAMP_METHOD
+        self.VERBOSE_LEVEL = VERBOSE_LEVEL
+        return None
+    
+    def projection_cd(self, hdr_obj, hdr_targ, CDKEY="CD"):
+        """Mapping the target pixel centers to the object frame using Cupy for CD WCS (NO distortion)"""
+        NTX = int(hdr_targ["NAXIS1"]) 
+        NTY = int(hdr_targ["NAXIS2"])
+
+        XX_targ_GPU, YY_targ_GPU = cp.meshgrid(
+            cp.arange(0, NTX) + 1., 
+            cp.arange(0, NTY) + 1., 
+            indexing='ij'
+        )
+        
+        CWT = Cupy_WCS_Transform()
+        # * perform CD transformation through target WCS
+        if True:
+            # read header [target]
+            KEYDICT, CD_GPU = CWT.read_cd_wcs(hdr_wcs=hdr_targ, CDKEY=CDKEY)
+            CRPIX1_targ, CRPIX2_targ = KEYDICT["CRPIX1"], KEYDICT["CRPIX2"]
+            
+            # relative to reference point [target]
+            u_GPU, v_GPU = XX_targ_GPU.flatten() - CRPIX1_targ, YY_targ_GPU.flatten() - CRPIX2_targ
+            
+            # CD transformation [target]
+            x_GPU, y_GPU = CWT.cd_transform(IMAGE_X_GPU=u_GPU, IMAGE_Y_GPU=v_GPU, CD_GPU=CD_GPU)
+
+        # * perform CD^-1 transformation through object WCS
+        if True:
+            # read header [object]
+            KEYDICT, CD_GPU = CWT.read_cd_wcs(hdr_wcs=hdr_obj, CDKEY=CDKEY)
+            CRPIX1_obj, CRPIX2_obj = KEYDICT["CRPIX1"], KEYDICT["CRPIX2"]
+
+            # relative to reference point, consider offset between the WCS reference points [object]
+            # WARNING: the offset calculations may be wrong when the WCS reference points are very close to the N/S Poles!
+            x_GPU += (CRPIX1_targ - CRPIX1_obj + 180.0) % 360.0 - 180.0   # from using target reference point to object
+            y_GPU += CRPIX2_targ - CRPIX2_obj                             # ~
+
+            # inverse CD transformation [object]
+            u_GPU, v_GPU = CWT.cd_transform_inv(WORLD_X_GPU=x_GPU, WORLD_Y_GPU=y_GPU, CD_GPU=CD_GPU)
+            
+            # relative to image origin [object]
+            XX_proj_GPU = CRPIX1_obj + u_GPU.reshape((NTX, NTY))
+            YY_proj_GPU = CRPIX2_obj + v_GPU.reshape((NTX, NTY))
+        
+        return XX_proj_GPU, YY_proj_GPU
+
+    def projection_sip(self, hdr_obj, hdr_targ, Nsamp=1024, RANDOM_SEED=10086):
+        """Mapping the target pixel centers to the object frame using Cupy for SIP WCS"""
+        NTX = int(hdr_targ["NAXIS1"]) 
+        NTY = int(hdr_targ["NAXIS2"])
+
+        XX_targ_GPU, YY_targ_GPU = cp.meshgrid(
+            cp.arange(0, NTX) + 1., 
+            cp.arange(0, NTY) + 1., 
+            indexing='ij'
+        )
+        
+        CWT = Cupy_WCS_Transform()
+        # * perform forward transformation (+CD) through target WCS
+        if True:
+            # read header [target]
+            KEYDICT, CD_GPU, A_SIP_GPU, B_SIP_GPU = CWT.read_sip_wcs(hdr_wcs=hdr_targ)
+            CRPIX1_targ, CRPIX2_targ = KEYDICT["CRPIX1"], KEYDICT["CRPIX2"]
+            CRVAL1_targ, CRVAL2_targ = KEYDICT["CRVAL1"], KEYDICT["CRVAL2"]
+            
+            # relative to reference point [target]
+            u_GPU, v_GPU = XX_targ_GPU.flatten() - CRPIX1_targ, YY_targ_GPU.flatten() - CRPIX2_targ
+            
+            # forward transformation for target grid of pixel centers [target]
+            U_GPU, V_GPU = CWT.sip_forward_transform(u_GPU=u_GPU, v_GPU=v_GPU, A_SIP_GPU=A_SIP_GPU, B_SIP_GPU=B_SIP_GPU)
+            
+            # plus CD transformation [target]
+            x_GPU, y_GPU = CWT.cd_transform(IMAGE_X_GPU=U_GPU, IMAGE_Y_GPU=V_GPU, CD_GPU=CD_GPU)
+        
+        # * perform backward transformation (+CD^-1) through object WCS
+        if True:
+            # read header [object]
+            KEYDICT, CD_GPU, A_SIP_GPU, B_SIP_GPU = CWT.read_sip_wcs(hdr_wcs=hdr_obj)
+            CRPIX1_obj, CRPIX2_obj = KEYDICT["CRPIX1"], KEYDICT["CRPIX2"]
+            CRVAL1_obj, CRVAL2_obj = KEYDICT["CRVAL1"], KEYDICT["CRVAL2"]
+
+            # sampling random coordinates [object]
+            u_GPU, v_GPU = CWT.random_coord_sampling(N0=KEYDICT["N0"], N1=KEYDICT["N1"], 
+                CRPIX1=CRPIX1_obj, CRPIX2=CRPIX2_obj, Nsamp=Nsamp, RANDOM_SEED=RANDOM_SEED)
+            
+            # fit polynomial form backward transformation [object]
+            U_GPU, V_GPU = CWT.sip_forward_transform(u_GPU=u_GPU, v_GPU=v_GPU, A_SIP_GPU=A_SIP_GPU, B_SIP_GPU=B_SIP_GPU)
+            AP_lstsq_GPU, BP_lstsq_GPU = CWT.lstsq_sip_backward_transform(u_GPU=u_GPU, v_GPU=v_GPU, 
+                U_GPU=U_GPU, V_GPU=V_GPU, A_ORDER=KEYDICT["A_ORDER"], B_ORDER=KEYDICT["B_ORDER"])[2:4]
+
+            # relative to reference point, consider offset between the WCS reference points [object]
+            # WARNING: the offset calculations may be wrong when the WCS reference points are very close to the N/S Poles!
+            # TODO: this is not accruate, ~ 1pix.
+            offset1 = ((CRVAL1_targ - CRVAL1_obj + 180.0) % 360.0 - 180.0) * cp.cos(cp.deg2rad((CRVAL2_targ + CRVAL2_obj)/2.))
+            offset2 = CRVAL2_targ - CRVAL2_obj
+            x_GPU += offset1
+            y_GPU += offset2
+
+            # inverse CD transformation [object]
+            U_GPU, V_GPU = CWT.cd_transform_inv(WORLD_X_GPU=x_GPU, WORLD_Y_GPU=y_GPU, CD_GPU=CD_GPU)
+            
+            # backward transformation [object]
+            FP_UV_GPU = CWT.sip_backward_matrix(U_GPU=U_GPU, V_GPU=V_GPU, ORDER=KEYDICT["A_ORDER"])
+            GP_UV_GPU = CWT.sip_backward_matrix(U_GPU=U_GPU, V_GPU=V_GPU, ORDER=KEYDICT["B_ORDER"])
+
+            u_GPU, v_GPU = CWT.sip_backward_transform(U_GPU=U_GPU, V_GPU=V_GPU, 
+                FP_UV_GPU=FP_UV_GPU, GP_UV_GPU=GP_UV_GPU, AP_GPU=AP_lstsq_GPU, BP_GPU=BP_lstsq_GPU)
+            
+            # relative to image origin [object]
+            XX_proj_GPU = CRPIX1_obj + u_GPU.reshape((NTX, NTY))
+            YY_proj_GPU = CRPIX2_obj + v_GPU.reshape((NTX, NTY))
+
+        return XX_proj_GPU, YY_proj_GPU
+
+    def projection_astropy(self, hdr_obj, hdr_targ):
+        """Mapping the target pixel centers to the object frame using Astropy"""
+        # * read object WCS and target WCS
+        def _readWCS(hdr, VERBOSE_LEVEL=2):
+            with warnings.catch_warnings():
+                if VERBOSE_LEVEL in [0, 1]: behavior = 'ignore'
+                if VERBOSE_LEVEL in [2]: behavior = 'default'
+                warnings.filterwarnings(behavior, category=FITSFixedWarning)
+
+                if hdr['CTYPE1'] == 'RA---TAN' and 'PV1_0' in hdr:
+                    _hdr = hdr.copy()
+                    _hdr['CTYPE1'] = 'RA---TPV'
+                    _hdr['CTYPE2'] = 'DEC--TPV'
+                else: _hdr = hdr
+                w = WCS(_hdr)
+            return w
+
+        w_obj = _readWCS(hdr=hdr_obj, VERBOSE_LEVEL=self.VERBOSE_LEVEL)
+        w_targ = _readWCS(hdr=hdr_targ, VERBOSE_LEVEL=self.VERBOSE_LEVEL)
+
+        # * maaping target pixel centers to the object frame
+        NTX = int(hdr_targ["NAXIS1"]) 
+        NTY = int(hdr_targ["NAXIS2"])
+        
+        XX_targ, YY_targ = np.meshgrid(np.arange(0, NTX)+1., np.arange(0, NTY)+1., indexing='ij')
+        XY_targ = np.array([XX_targ.flatten(), YY_targ.flatten()]).T
+        XY_proj = w_obj.all_world2pix(w_targ.all_pix2world(XY_targ, 1), 1)
+
+        XX_proj = XY_proj[:, 0].reshape((NTX, NTY))
+        YY_proj = XY_proj[:, 1].reshape((NTX, NTY))
+
+        XX_proj_GPU = cp.array(XX_proj, dtype=np.float64)
+        YY_proj_GPU = cp.array(YY_proj, dtype=np.float64)
+
+        return XX_proj_GPU, YY_proj_GPU
+
+    def frame_extension(self, XX_proj_GPU, YY_proj_GPU, PixA_obj_GPU):
+        """Extend the object frame for resampling"""
+        NTX, NTY = XX_proj_GPU.shape
+        NOX, NOY = PixA_obj_GPU.shape
+
+        # * padding the object frame
+        if self.RESAMP_METHOD == 'BILINEAR':
+            KERHW = (1, 1)
+
+        if self.RESAMP_METHOD == 'LANCZOS3':
+            KERHW = (3, 3)
+
+        # find the root index and shift with maximal kernel halfwidth (KERHW)
+        # Note: the index ranges (RMIN --- RMAX) and (CMIN --- CMAX) can cover
+        #       all pixels that interpolation may be use.
+
+        RMIN = (floor(XX_proj_GPU.min().item()) - 1) - KERHW[0]
+        RMAX = (floor(XX_proj_GPU.max().item()) - 1) + KERHW[0]
+        RPAD = (-np.min([RMIN, 0]), np.max([RMAX - (NOX - 1), 0]))
+
+        CMIN = (floor(YY_proj_GPU.min().item()) - 1) - KERHW[1]
+        CMAX = (floor(YY_proj_GPU.max().item()) - 1) + KERHW[1]
+        CPAD = (-np.min([CMIN, 0]), np.max([CMAX - (NOY - 1), 0]))
+
+        PAD_WIDTH = (RPAD, CPAD)    
+        PixA_Eobj_GPU = cp.pad(PixA_obj_GPU, PAD_WIDTH, mode='constant', constant_values=0.)
+        NEOX, NEOY = PixA_Eobj_GPU.shape
+
+        XX_Eproj_GPU = XX_proj_GPU + PAD_WIDTH[0][0]
+        YY_Eproj_GPU = YY_proj_GPU + PAD_WIDTH[1][0]
+
+        RMIN_E = (floor(XX_Eproj_GPU.min().item()) - 1) - KERHW[0]
+        RMAX_E = (floor(XX_Eproj_GPU.max().item()) - 1) + KERHW[0]
+
+        CMIN_E = (floor(YY_Eproj_GPU.min().item()) - 1) - KERHW[1]
+        CMAX_E = (floor(YY_Eproj_GPU.max().item()) - 1) + KERHW[1]
+
+        assert RMIN_E >= 0 and CMIN_E >= 0 
+        assert RMAX_E < NEOX and CMAX_E < NEOY
+
+        EProjDict = {}
+        EProjDict['NTX'] = NTX
+        EProjDict['NTY'] = NTY
+
+        EProjDict['NOX'] = NOX
+        EProjDict['NOY'] = NOY
+
+        EProjDict['NEOX'] = NEOX
+        EProjDict['NEOY'] = NEOY
+
+        EProjDict['XX_Eproj_GPU'] = XX_Eproj_GPU
+        EProjDict['YY_Eproj_GPU'] = YY_Eproj_GPU
+
+        return PixA_Eobj_GPU, EProjDict
+    
+    def resampling(self, PixA_Eobj_GPU, EProjDict):
+        """Resampling the object frame to the target frame using CUDA"""
+        NTX = EProjDict['NTX']
+        NTY = EProjDict['NTY']
+
+        NEOX = EProjDict['NEOX']
+        NEOY = EProjDict['NEOY']
+
+        XX_Eproj_GPU = EProjDict['XX_Eproj_GPU']
+        YY_Eproj_GPU = EProjDict['YY_Eproj_GPU']
+
+        # * Cupy configuration
+        MaxThreadPerB = 8
+        GPUManage = lambda NT: ((NT-1)//MaxThreadPerB + 1, min(NT, MaxThreadPerB))
+        BpG_PIX0, TpB_PIX0 = GPUManage(NTX)
+        BpG_PIX1, TpB_PIX1 = GPUManage(NTY)
+        BpG_PIX, TpB_PIX = (BpG_PIX0, BpG_PIX1), (TpB_PIX0, TpB_PIX1, 1)
+
+        PixA_resamp_GPU = cp.zeros((NTX, NTY), dtype=np.float64)
+
+        if self.RESAMP_METHOD == "BILINEAR":
+        
+            # * perform bilinear resampling using CUDA
+            # input: PixA_Eobj | (NEOX, NEOY)
+            # input: XX_Eproj, YY_Eproj | (NTX, NTY)
+            # output: PixA_resamp | (NTX, NTY)
+            
+            _refdict = {'NTX': NTX, 'NTY': NTY, 'NEOX': NEOX, 'NEOY': NEOY}
+            _funcstr = r"""
+            extern "C" __global__ void kmain(double XX_Eproj_GPU[%(NTX)s][%(NTY)s], double YY_Eproj_GPU[%(NTX)s][%(NTY)s], 
+                double PixA_Eobj_GPU[%(NEOX)s][%(NEOY)s], double PixA_resamp_GPU[%(NTX)s][%(NTY)s])
+            {
+                int ROW = blockIdx.x*blockDim.x+threadIdx.x;
+                int COL = blockIdx.y*blockDim.y+threadIdx.y;
+
+                int NTX = %(NTX)s;
+                int NTY = %(NTY)s;
+                int NEOX = %(NEOX)s;
+                int NEOY = %(NEOY)s;
+
+                if (ROW < NTX && COL < NTY) {
+                
+                    double x = XX_Eproj_GPU[ROW][COL];
+                    double y = YY_Eproj_GPU[ROW][COL];
+
+                    int r1 = floor(x) - 1;
+                    int c1 = floor(y) - 1;
+                    int r2 = r1 + 1;
+                    int c2 = c1 + 1;
+
+                    double dx = x - floor(x); 
+                    double dy = y - floor(y);
+
+                    double w11 = (1-dx) * (1-dy);
+                    double w12 = (1-dx) * dy;
+                    double w21 = dx * (1-dy);
+                    double w22 = dx * dy;
+
+                    PixA_resamp_GPU[ROW][COL] = w11 * PixA_Eobj_GPU[r1][c1] + w12 * PixA_Eobj_GPU[r1][c2] + 
+                        w21 * PixA_Eobj_GPU[r2][c1] + w22 * PixA_Eobj_GPU[r2][c2];
+                }
+            }
+            """
+            _code = _funcstr % _refdict
+            _module = cp.RawModule(code=_code, backend=u'nvcc', translate_cucomplex=False)
+            resamp_func = _module.get_function('kmain')
+            
+            t0 = time.time()
+            resamp_func(args=(XX_Eproj_GPU, YY_Eproj_GPU, PixA_Eobj_GPU, PixA_resamp_GPU), block=TpB_PIX, grid=BpG_PIX)
+            if self.VERBOSE_LEVEL in [1, 2]:
+                print('MeLOn CheckPoint: Cuda resampling takes [%.6f s]' %(time.time() - t0))
+            
+        if self.RESAMP_METHOD == "LANCZOS3":
+            
+            # * perform LANCZOS-3 resampling using CUDA
+            # input: XX_Eproj, YY_Eproj | (NTX, NTY)
+            # output: PixA_resamp | (NTX, NTY)
+            
+            _refdict = {'NTX': NTX, 'NTY': NTY}
+            _funcstr = r"""
+            extern "C" __global__ void kmain(double XX_Eproj_GPU[%(NTX)s][%(NTY)s], double YY_Eproj_GPU[%(NTX)s][%(NTY)s], 
+                double LKERNEL_X_GPU[6][%(NTX)s][%(NTY)s], double LKERNEL_Y_GPU[6][%(NTX)s][%(NTY)s])
+            {
+                int ROW = blockIdx.x*blockDim.x+threadIdx.x;
+                int COL = blockIdx.y*blockDim.y+threadIdx.y;
+
+                int NTX = %(NTX)s;
+                int NTY = %(NTY)s;
+                
+                double PI = 3.141592653589793;
+                double PIS = 9.869604401089358;
+                
+                if (ROW < NTX && COL < NTY) {
+                    
+                    double x = XX_Eproj_GPU[ROW][COL];
+                    double y = YY_Eproj_GPU[ROW][COL];
+                    
+                    double dx = x - floor(x); 
+                    double dy = y - floor(y);
+                    
+                    // LANCZOS3 weights in x axis
+                    double wx0 = 3.0 * sin(PI*(-2.0 - dx)) * sin(PI*(-2.0 - dx)/3.0) / (PIS*(-2.0 - dx) * (-2.0 - dx));
+                    double wx1 = 3.0 * sin(PI*(-1.0 - dx)) * sin(PI*(-1.0 - dx)/3.0) / (PIS*(-1.0 - dx) * (-1.0 - dx));
+                    double wx2 = 1.0;
+                    if (fabs(dx) > 1e-4) {
+                        wx2 = 3.0 * sin(PI*(-dx)) * sin(PI*(-dx)/3.0) / (PIS*(-dx) * (-dx));
+                    }
+                    double wx3 = 3.0 * sin(PI*(1.0 - dx)) * sin(PI*(1.0 - dx)/3.0) / (PIS*(1.0 - dx) * (1.0 - dx));
+                    double wx4 = 3.0 * sin(PI*(2.0 - dx)) * sin(PI*(2.0 - dx)/3.0) / (PIS*(2.0 - dx) * (2.0 - dx));
+                    double wx5 = 3.0 * sin(PI*(3.0 - dx)) * sin(PI*(3.0 - dx)/3.0) / (PIS*(3.0 - dx) * (3.0 - dx));
+                    
+                    LKERNEL_X_GPU[0][ROW][COL] = wx0;
+                    LKERNEL_X_GPU[1][ROW][COL] = wx1;
+                    LKERNEL_X_GPU[2][ROW][COL] = wx2;
+                    LKERNEL_X_GPU[3][ROW][COL] = wx3;
+                    LKERNEL_X_GPU[4][ROW][COL] = wx4;
+                    LKERNEL_X_GPU[5][ROW][COL] = wx5;
+                    
+                    // LANCZOS3 weights in y axis
+                    double wy0 = 3.0 * sin(PI*(-2.0 - dy)) * sin(PI*(-2.0 - dy)/3.0) / (PIS*(-2.0 - dy) * (-2.0 - dy));
+                    double wy1 = 3.0 * sin(PI*(-1.0 - dy)) * sin(PI*(-1.0 - dy)/3.0) / (PIS*(-1.0 - dy) * (-1.0 - dy));
+                    double wy2 = 1.0;
+                    if (fabs(dy) > 1e-4) {
+                        wy2 = 3.0 * sin(PI*(-dy)) * sin(PI*(-dy)/3.0) / (PIS*(-dy) * (-dy));
+                    }
+                    double wy3 = 3.0 * sin(PI*(1.0 - dy)) * sin(PI*(1.0 - dy)/3.0) / (PIS*(1.0 - dy) * (1.0 - dy));
+                    double wy4 = 3.0 * sin(PI*(2.0 - dy)) * sin(PI*(2.0 - dy)/3.0) / (PIS*(2.0 - dy) * (2.0 - dy));
+                    double wy5 = 3.0 * sin(PI*(3.0 - dy)) * sin(PI*(3.0 - dy)/3.0) / (PIS*(3.0 - dy) * (3.0 - dy));
+                    
+                    LKERNEL_Y_GPU[0][ROW][COL] = wy0;
+                    LKERNEL_Y_GPU[1][ROW][COL] = wy1;
+                    LKERNEL_Y_GPU[2][ROW][COL] = wy2;
+                    LKERNEL_Y_GPU[3][ROW][COL] = wy3;
+                    LKERNEL_Y_GPU[4][ROW][COL] = wy4;
+                    LKERNEL_Y_GPU[5][ROW][COL] = wy5;
+                }
+            }
+            """
+            _code = _funcstr % _refdict
+            _module = cp.RawModule(code=_code, backend=u'nvcc', translate_cucomplex=False)
+            weightkernel_func = _module.get_function('kmain')
+            
+            _refdict = {'NTX': NTX, 'NTY': NTY, 'NEOX': NEOX, 'NEOY': NEOY}
+            _funcstr = r"""
+            extern "C" __global__ void kmain(double XX_Eproj_GPU[%(NTX)s][%(NTY)s], double YY_Eproj_GPU[%(NTX)s][%(NTY)s], 
+                double LKERNEL_X_GPU[6][%(NTX)s][%(NTY)s], double LKERNEL_Y_GPU[6][%(NTX)s][%(NTY)s], 
+                double PixA_Eobj_GPU[%(NEOX)s][%(NEOY)s], double PixA_resamp_GPU[%(NTX)s][%(NTY)s])
+            {
+                int ROW = blockIdx.x*blockDim.x+threadIdx.x;
+                int COL = blockIdx.y*blockDim.y+threadIdx.y;
+
+                int NTX = %(NTX)s;
+                int NTY = %(NTY)s;
+                int NEOX = %(NEOX)s;
+                int NEOY = %(NEOY)s;
+
+                if (ROW < NTX && COL < NTY) {
+                
+                    double x = XX_Eproj_GPU[ROW][COL];
+                    double y = YY_Eproj_GPU[ROW][COL];
+                    
+                    int r0 = floor(x) - 3;
+                    int c0 = floor(y) - 3;
+                    
+                    for(int i = 0; i < 6; ++i){
+                        for(int j = 0; j < 6; ++j){
+                            double w = LKERNEL_X_GPU[i][ROW][COL] * LKERNEL_Y_GPU[j][ROW][COL];
+                            PixA_resamp_GPU[ROW][COL] += w * PixA_Eobj_GPU[r0 + i][c0 + j];
+                        }
+                    }
+                }
+            }
+            """
+            _code = _funcstr % _refdict
+            _module = cp.RawModule(code=_code, backend=u'nvcc', translate_cucomplex=False)
+            resamp_func = _module.get_function('kmain')
+            
+            t0 = time.time()
+            LKERNEL_X_GPU = cp.zeros((6, NTX, NTY), dtype=np.float64)
+            LKERNEL_Y_GPU = cp.zeros((6, NTX, NTY), dtype=np.float64)
+            weightkernel_func(args=(XX_Eproj_GPU, YY_Eproj_GPU, LKERNEL_X_GPU, LKERNEL_Y_GPU), block=TpB_PIX, grid=BpG_PIX)
+            resamp_func(args=(XX_Eproj_GPU, YY_Eproj_GPU, LKERNEL_X_GPU, LKERNEL_Y_GPU, 
+                PixA_Eobj_GPU, PixA_resamp_GPU), block=TpB_PIX, grid=BpG_PIX)
+            if self.VERBOSE_LEVEL in [1, 2]:
+                print('MeLOn CheckPoint: Cuda resampling takes [%.6f s]' %(time.time() - t0))
+
+        return PixA_resamp_GPU
diff --git a/sfft/utils/CupyWCSTransform.py b/sfft/utils/CupyWCSTransform.py
new file mode 100644
index 0000000..ceb47b0
--- /dev/null
+++ b/sfft/utils/CupyWCSTransform.py
@@ -0,0 +1,241 @@
+import cupy as cp
+
+__last_update__ = "2024-09-19"
+__author__ = "Shu Liu <shl159@pitt.edu> & Lei Hu <leihu@andrew.cmu.edu>"
+
+class Cupy_WCS_Transform:
+    def __init__(self):
+        return None
+
+    def read_cd_wcs(self, hdr_wcs, CDKEY="CD"):
+        """
+        # * Note on the CD matrix transformation:
+        The sky coordinate (x, y) relative to reference point can be connected with 
+        the image coordinate (u, v) relative to reference point by the CD matrix:
+            [x]   [CD1_1 CD1_2] [u]
+            [y] = [CD2_1 CD2_2] [v]
+        where CD1_1, CD1_2, CD2_1, CD2_2 are stored in the FITS header.
+        
+        """
+        assert hdr_wcs["CTYPE1"] == "RA---TAN"
+        assert hdr_wcs["CTYPE2"] == "DEC--TAN"
+
+        N0 = int(hdr_wcs["NAXIS1"])
+        N1 = int(hdr_wcs["NAXIS2"])
+
+        CRPIX1 = float(hdr_wcs["CRPIX1"])
+        CRPIX2 = float(hdr_wcs["CRPIX2"])
+
+        CRVAL1 = float(hdr_wcs["CRVAL1"])
+        CRVAL2 = float(hdr_wcs["CRVAL2"])
+
+        KEYDICT = {
+            "N0": N0, "N1": N1, 
+            "CRPIX1": CRPIX1, "CRPIX2": CRPIX2,
+            "CRVAL1": CRVAL1, "CRVAL2": CRVAL2
+        }
+
+        CD1_1 = hdr_wcs[f"{CDKEY}1_1"]
+        CD1_2 = hdr_wcs[f"{CDKEY}1_2"] if f"{CDKEY}1_2" in hdr_wcs else 0.
+        CD2_1 = hdr_wcs[f"{CDKEY}2_1"] if f"{CDKEY}2_1" in hdr_wcs else 0.
+        CD2_2 = hdr_wcs[f"{CDKEY}2_2"]
+
+        CD_GPU = cp.array([
+            [CD1_1, CD1_2], 
+            [CD2_1, CD2_2]
+        ], dtype=cp.float64)
+        
+        return KEYDICT, CD_GPU
+
+    def read_sip_wcs(self, hdr_wcs):
+        """
+        # * Note on the SIP transformation:
+        The image coordinate (u, v) relative to reference point and we have undistorted image coordinate (U, V):
+            U = u + f(u, v)
+            V = v + g(u, v)
+        where f(u, v) and g(u, v) are the SIP distortion functions with polynomial form:
+            f(u, v) = sum_{p, q} A_{pq} u^p v^q, p + q <= A_ORDER
+            g(u, v) = sum_{p, q} B_{pq} u^p v^q, p + q <= B_ORDER
+        These coefficients are stored in the FITS header and define a foward transformation from (u, v) to (U, V). 
+        
+        The sky coordinate (x, y) relative to reference point can be connected with (U, V) by the reversible CD matrix:
+            [x]   [CD1_1 CD1_2] [U]
+            [y] = [CD2_1 CD2_2] [V]
+        So the forward transformation & CD matrix (with considering the reference point) is equivalent to a pix2world function.
+        
+        The reverse question is how to obtain the backward transformation from undistorted (U, V) to distorted (u, v).
+        Once we have the backward transformation, we can combine a reversed CD matrix 
+        (with considering the reference point) to get the world2pix function.
+
+        A simple approach (not accurate) is to assume the forward transformation also has a polynomial form:
+            u = U + fp(U, V)
+            v = V + gp(U, V)
+        where 
+            fp(U, V) = sum_{p, q} AP_{pq} U^p V^q, p + q <= A_ORDER
+            gp(U, V) = sum_{p, q} BP_{pq} U^p V^q, p + q <= B_ORDER
+        
+        The coefficients AP_{pq} and BP_{pq} can be obtained by solving the linear equations separately.
+            [u - U] = [U^p*V^q] [AP_{pq}]
+            
+            shapes: 
+            b: [u - U]   | (N, 1), 
+            A: [U^p*V^q] | (N, (A_ORDER+1)*(A_ORDER+2)/2)
+            x: [AP_{pq}] | ((A_ORDER+1)*(A_ORDER+2)/2, 1)
+            
+            [v - V] = [U^p*V^q] [BP_{pq}]
+            b: [v - V]   | (N, 1),
+            A: [U^p*V^q] | (N, (B_ORDER+1)*(B_ORDER+2)/2)
+            x: [BP_{pq}] | ((B_ORDER+1)*(B_ORDER+2)/2, 1)
+
+            The two matrices encode the "backward" transformation in fp(U, V) and gp(U, V).
+            Let's denote the two matrices as FP_UV and GP_UV, respectively.
+
+        # * A WCS example with SIP distortion
+        CTYPE1  = 'RA---TAN-SIP'
+        CTYPE2  = 'DEC--TAN-SIP'
+        
+        CRPIX1  =               2044.0 
+        CRPIX2  =               2044.0 
+        
+        CD1_1   = 2.65458074767927E-05
+        CD1_2   = -1.2630331175158E-05 
+        CD2_1   = 1.38917634264203E-05 
+        CD2_2   = 2.57785917553667E-05
+        
+        CRVAL1  =    9.299707734492053 
+        CRVAL2  =  -43.984663860136145
+
+        A_ORDER =                    4 
+        A_0_2   =     -4.774423702E-10 
+        A_0_3   =     -2.462744787E-14 
+        A_0_4   =       2.28913156E-17 
+        A_1_1   =      1.076615801E-09 
+        A_1_2   =     -9.939904553E-14 
+        A_1_3   =     -5.845336863E-17 
+        A_2_0   =     -3.717865118E-10 
+        A_2_1   =     -4.966118494E-15 
+        A_2_2   =      6.615859199E-17 
+        A_3_0   =     -3.817793356E-15 
+        A_3_1   =      3.310020049E-17 
+        A_4_0   =     -5.166048303E-19 
+        
+        B_ORDER =                    4 
+        B_0_2   =      1.504792792E-09 
+        B_0_3   =     -2.662833066E-15 
+        B_0_4   =     -8.383877471E-20 
+        B_1_1   =      1.204553316E-10 
+        B_1_2   =       6.57748238E-14 
+        B_1_3   =     -6.998508554E-18 
+        B_2_0   =      4.136013664E-10 
+        B_2_1   =      5.339208582E-14 
+        B_2_2   =      6.063403412E-17 
+        B_3_0   =      1.486316541E-14 
+        B_3_1   =     -9.102668806E-17 
+        B_4_0   =     -5.174323631E-17
+
+        """
+        assert hdr_wcs["CTYPE1"] == "RA---TAN-SIP"
+        assert hdr_wcs["CTYPE2"] == "DEC--TAN-SIP"
+
+        N0 = int(hdr_wcs["NAXIS1"])
+        N1 = int(hdr_wcs["NAXIS2"])
+
+        CRPIX1 = float(hdr_wcs["CRPIX1"])
+        CRPIX2 = float(hdr_wcs["CRPIX2"])
+
+        CRVAL1 = float(hdr_wcs["CRVAL1"])
+        CRVAL2 = float(hdr_wcs["CRVAL2"])
+
+        A_ORDER = int(hdr_wcs["A_ORDER"])
+        B_ORDER = int(hdr_wcs["B_ORDER"])
+
+        KEYDICT = {
+            "N0": N0, "N1": N1, 
+            "CRPIX1": CRPIX1, "CRPIX2": CRPIX2,
+            "CRVAL1": CRVAL1, "CRVAL2": CRVAL2, 
+            "A_ORDER": A_ORDER, "B_ORDER": B_ORDER
+        }
+
+        CD_GPU = cp.array([
+            [hdr_wcs["CD1_1"], hdr_wcs["CD1_2"]], 
+            [hdr_wcs["CD2_1"], hdr_wcs["CD2_2"]]
+        ], dtype=cp.float64)
+
+        A_SIP_GPU = cp.zeros((A_ORDER+1, A_ORDER+1), dtype=cp.float64)
+        B_SIP_GPU = cp.zeros((B_ORDER+1, B_ORDER+1), dtype=cp.float64)
+
+        for p in range(A_ORDER + 1):
+            for q in range(0, A_ORDER - p + 1):
+                keyword = f"A_{p}_{q}"
+                if keyword in hdr_wcs:
+                    A_SIP_GPU[p, q] = hdr_wcs[keyword]
+        
+        for p in range(B_ORDER + 1):
+            for q in range(0, B_ORDER - p + 1):
+                keyword = f"B_{p}_{q}"
+                if keyword in hdr_wcs:
+                    B_SIP_GPU[p, q] = hdr_wcs[keyword]
+        
+        return KEYDICT, CD_GPU, A_SIP_GPU, B_SIP_GPU
+
+    def cd_transform(self, IMAGE_X_GPU, IMAGE_Y_GPU, CD_GPU):
+        """CD matrix transformation from image to world"""
+        WORLD_X_GPU, WORLD_Y_GPU = cp.matmul(CD_GPU, cp.array([IMAGE_X_GPU, IMAGE_Y_GPU]))
+        return WORLD_X_GPU, WORLD_Y_GPU
+    
+    def cd_transform_inv(self, WORLD_X_GPU, WORLD_Y_GPU, CD_GPU):
+        """CD matrix transformation from world to image"""
+        # CD_inv_GPU = cp.linalg.inv(CD_GPU)
+        CD_inv_GPU = 1. / (CD_GPU[0, 0] * CD_GPU[1, 1] - CD_GPU[0, 1] * CD_GPU[1, 0]) * \
+            cp.array([[CD_GPU[1, 1], -CD_GPU[0, 1]], [-CD_GPU[1, 0], CD_GPU[0, 0]]])
+        IMAGE_X_GPU, IMAGE_Y_GPU = cp.matmul(CD_inv_GPU, cp.array([WORLD_X_GPU, WORLD_Y_GPU]))
+        return IMAGE_X_GPU, IMAGE_Y_GPU
+    
+    def sip_forward_transform(self, u_GPU, v_GPU, A_SIP_GPU, B_SIP_GPU):
+        """Forward SIP transformation from (u, v) to (U, V)"""
+        A_ORDER = A_SIP_GPU.shape[0] - 1
+        U_GPU = u_GPU + cp.zeros_like(u_GPU)
+        for p in range(A_ORDER + 1):
+            for q in range(A_ORDER - p + 1):
+                U_GPU += A_SIP_GPU[p, q] * u_GPU**p * v_GPU**q
+            
+        B_ORDER = B_SIP_GPU.shape[0] - 1
+        V_GPU = v_GPU + cp.zeros_like(v_GPU)
+        for p in range(B_ORDER + 1):
+            for q in range(B_ORDER - p + 1):
+                V_GPU += B_SIP_GPU[p, q] * u_GPU**p * v_GPU**q
+        return U_GPU, V_GPU
+
+    def sip_backward_matrix(self, U_GPU, V_GPU, ORDER):
+        """Compute the backward matrix P_UV (FP_UV or GP_UV)"""
+        Nsamp = len(U_GPU)
+        P_UV_GPU = cp.zeros((Nsamp, (ORDER+1)*(ORDER+2)//2), dtype=cp.float64)
+        idx = 0
+        for p in range(ORDER + 1):
+            for q in range(ORDER - p + 1):
+                P_UV_GPU[:, idx] = U_GPU**p * V_GPU**q
+                idx += 1
+        return P_UV_GPU
+
+    def lstsq_sip_backward_transform(self, u_GPU, v_GPU, U_GPU, V_GPU, A_ORDER, B_ORDER):
+        """Compute the backward transformation from (U, V) to (u, v)"""
+        FP_UV_GPU = self.sip_backward_matrix(U_GPU=U_GPU, V_GPU=V_GPU, ORDER=A_ORDER)
+        GP_UV_GPU = self.sip_backward_matrix(U_GPU=U_GPU, V_GPU=V_GPU, ORDER=B_ORDER)
+
+        AP_lstsq_GPU = cp.linalg.lstsq(FP_UV_GPU, (u_GPU - U_GPU).reshape(-1, 1), rcond=None)[0]
+        BP_lstsq_GPU = cp.linalg.lstsq(GP_UV_GPU, (v_GPU - V_GPU).reshape(-1, 1), rcond=None)[0]
+        return FP_UV_GPU, GP_UV_GPU, AP_lstsq_GPU, BP_lstsq_GPU
+
+    def sip_backward_transform(self, U_GPU, V_GPU, FP_UV_GPU, GP_UV_GPU, AP_GPU, BP_GPU):
+        """Backward transformation from (U, V) to (u, v)"""
+        u_GPU = U_GPU + cp.matmul(FP_UV_GPU, AP_GPU)[:, 0]
+        v_GPU = V_GPU + cp.matmul(GP_UV_GPU, BP_GPU)[:, 0]
+        return u_GPU, v_GPU
+
+    def random_coord_sampling(self, N0, N1, CRPIX1, CRPIX2, Nsamp=1024, RANDOM_SEED=10086):
+        """Random sampling of coordinates"""
+        if RANDOM_SEED is not None:
+            cp.random.seed(RANDOM_SEED)
+        u_GPU = cp.random.uniform(0.5, N0+0.5, Nsamp, dtype=cp.float64) - CRPIX1
+        v_GPU = cp.random.uniform(0.5, N1+0.5, Nsamp, dtype=cp.float64) - CRPIX2
+        return u_GPU, v_GPU
diff --git a/sfft/utils/DeCorrelationCalculator.py b/sfft/utils/DeCorrelationCalculator.py
index a0592a0..aa70d70 100644
--- a/sfft/utils/DeCorrelationCalculator.py
+++ b/sfft/utils/DeCorrelationCalculator.py
@@ -1,5 +1,6 @@
 import math
 import numpy as np
+import cupy
 from sfft.utils.ConvKernelConvertion import ConvKernel_Convertion
 # version: Mar 10, 2023
 
@@ -69,23 +70,24 @@ def DCC(MK_JLst, SkySig_JLst, MK_ILst=[], SkySig_ILst=[], MK_Fin=None, KERatio=2
         # construct the DeNonimator (a real-positive map) in Fourier Space
         uMK = np.array([[0, 0, 0], [0, 1, 0], [0, 0, 0]]).astype(float)
         def Get_JTerm(MKj, skysig):
-            if MKj is not None: KIMG_CSZ = ConvKernel_Convertion.CSZ(MKj, N0, N1)
-            if MKj is None: KIMG_CSZ = ConvKernel_Convertion.CSZ(uMK, N0, N1)
-            kft = np.fft.fft2(KIMG_CSZ)
-            kft2 = (np.conj(kft) * kft).real
-            term = (skysig**2 * kft2) / NumJ**2
+            if MKj is not None: KIMG_CSZ = ConvKernel_Convertion.CSZ(cupy.array( MKj ), N0, N1)
+            if MKj is None: KIMG_CSZ = ConvKernel_Convertion.CSZ(cupy.array( uMK ), N0, N1)
+            kft = cupy.fft.fft2(KIMG_CSZ)
+            kft2 = (cupy.conj(kft) * kft).real
+            term = (skysig**2 * kft2) / NumJ**2 
             return term
         
-        if MK_Fin is not None: KIMG_CSZ = ConvKernel_Convertion.CSZ(MK_Fin, N0, N1)
-        if MK_Fin is None: KIMG_CSZ = ConvKernel_Convertion.CSZ(uMK, N0, N1)
-        kft = np.fft.fft2(KIMG_CSZ)
-        kft2_Fin = (np.conj(kft) * kft).real
+        if MK_Fin is not None: KIMG_CSZ = ConvKernel_Convertion.CSZ(cupy.array( MK_Fin ), N0, N1)
+        if MK_Fin is None: KIMG_CSZ = ConvKernel_Convertion.CSZ(cupy.array( uMK ), N0, N1)
+        cu_KIMG_CSZ = cupy.array( KIMG_CSZ )
+        kft = cupy.fft.fft2(cu_KIMG_CSZ)
+        kft2_Fin = (cupy.conj(kft) * kft).real
 
         def Get_ITerm(MKi, skysig):
-            if MKi is not None: KIMG_CSZ = ConvKernel_Convertion.CSZ(MKi, N0, N1)
-            if MKi is None: KIMG_CSZ = ConvKernel_Convertion.CSZ(uMK, N0, N1)
-            kft = np.fft.fft2(KIMG_CSZ)
-            kft2 = (np.conj(kft) * kft).real
+            if MKi is not None: KIMG_CSZ = ConvKernel_Convertion.CSZ(cupy.array( MKi ), N0, N1) 
+            if MKi is None: KIMG_CSZ = ConvKernel_Convertion.CSZ(cupy.array( uMK ), N0, N1)
+            kft = cupy.fft.fft2(KIMG_CSZ)
+            kft2 = (cupy.conj(kft) * kft).real
             term = (skysig**2 * kft2 * kft2_Fin) / NumI**2 
             return term
         
@@ -96,9 +98,12 @@ def Get_ITerm(MKi, skysig):
             for MKi, skysig in zip(MK_ILst, SkySig_ILst):
                 DeNo += Get_ITerm(MKi, skysig)
 
-        FDeCo = np.sqrt(1.0 / DeNo)        # real & conjugate-symmetric
-        DeCo = np.fft.ifft2(FDeCo).real    # no imaginary part
+        FDeCo = cupy.sqrt(1.0 / DeNo)        # real & conjugate-symmetric
+        DeCo = cupy.fft.ifft2(FDeCo).real    # no imaginary part
+        
+        # made this cupy:
         KDeCo = ConvKernel_Convertion.iCSZ(DeCo, L0_KDeCo, L1_KDeCo)
-        KDeCo = KDeCo / np.sum(KDeCo)      # rescale to have Unit kernel sum
+        KDeCo = KDeCo / cupy.sum(KDeCo)      # rescale to have Unit kernel sum
+        KDeCo = cupy.asnumpy( KDeCo )
 
         return KDeCo
diff --git a/sfft/utils/ImageZoomRotate.py b/sfft/utils/ImageZoomRotate.py
index 712f94d..48efa3e 100644
--- a/sfft/utils/ImageZoomRotate.py
+++ b/sfft/utils/ImageZoomRotate.py
@@ -1,19 +1,19 @@
 import os
 import math
+import cupy as cp
 import numpy as np
 from astropy.io import fits
 from astropy.wcs import WCS
 from tempfile import mkdtemp
-from sfft.utils.pyAstroMatic.PYSWarp import PY_SWarp
-# version: Apr 22, 2024
+from sfft.utils.CudaResampling import Cuda_Resampling
 
+__last_update__ = "2024-09-19"
 __author__ = "Lei Hu <leihu@andrew.cmu.edu>"
-__version__ = "v1.4"
 
 class Image_ZoomRotate:
     @staticmethod
     def IZR(PixA_obj, ZOOM_SCALE_X=1.0, ZOOM_SCALE_Y=1.0, OUTSIZE_PARIRY_X='UNCHANGED', OUTSIZE_PARIRY_Y='UNCHANGED', \
-        PATTERN_ROTATE_ANGLE=0.0, RESAMPLING_TYPE='LANCZOS4', FILL_VALUE=0.0, VERBOSE_LEVEL=2):
+        PATTERN_ROTATE_ANGLE=0.0, RESAMPLING_TYPE='LANCZOS3', FILL_VALUE=0.0, VERBOSE_LEVEL=2):
         
         """
         # Remarks on Image Zoom & Rotate
@@ -164,19 +164,57 @@ def func_coordtrans(X_ORI, Y_ORI):
         PixA_empty = np.zeros((wcsinfo_TARG['NAXIS1'], wcsinfo_TARG['NAXIS2'])).astype(float)
         hdl_TARG = fits.HDUList([fits.PrimaryHDU(data=PixA_empty.T, header=header_TARG)])
         
+        """
+        # DEPRECATED
         hdl_TARG[0].header['GAIN'] = (1.0, 'MeLOn: PlaceHolder')    
         hdl_TARG[0].header['SATURATE'] = (50000.0, 'MeLOn: PlaceHolder')   
         hdl_TARG[0].header['EXPTIME'] = (60.0, 'MeLOn: PlaceHolder')
         hdl_TARG[0].header['MJD-OBS'] = (58849.0, 'MeLOn: PlaceHolder')
+        
+        """
 
         FITS_TARG = TDIR + '/target_empty_image.fits'
         hdl_TARG.writeto(FITS_TARG, overwrite=True)
 
         # * run SWarp to perform the image resampling
         FITS_resamp = TDIR + '/resampled_image.fits'
-        PY_SWarp.PS(FITS_obj=FITS_ORI, FITS_ref=FITS_TARG, FITS_resamp=FITS_resamp, \
-            GAIN_KEY='GAIN', SATUR_KEY='SATURATE', OVERSAMPLING=1, RESAMPLING_TYPE=RESAMPLING_TYPE, \
-            SUBTRACT_BACK='N', FILL_VALUE=FILL_VALUE, VERBOSE_LEVEL=VERBOSE_LEVEL)
+        """
+        # DEPRECATED
+        CR = Cuda_Resampling(FITS_obj=FITS_ORI, FITS_targ=FITS_TARG, 
+            METHOD=RESAMPLING_TYPE, VERBOSE_LEVEL=VERBOSE_LEVEL)
+        PixA_Eobj, MappingDICT = CR.mapping()
+        PixA_resamp = CR.resampling(PixA_Eobj=PixA_Eobj, MappingDICT=MappingDICT)
+        
+        """
+        def run_resample(FITS_obj, FITS_targ):
+            """run resampling using CUDA"""
+            hdr_obj = fits.getheader(FITS_obj, ext=0)
+            hdr_targ = fits.getheader(FITS_targ, ext=0)
+
+            PixA_obj = fits.getdata(FITS_obj, ext=0).T
+            PixA_targ = fits.getdata(FITS_targ, ext=0).T
+
+            if not PixA_obj.flags['C_CONTIGUOUS']:
+                PixA_obj = np.ascontiguousarray(PixA_obj, np.float64)
+                PixA_obj_GPU = cp.array(PixA_obj)
+            else: PixA_obj_GPU = cp.array(PixA_obj.astype(np.float64))
+
+            if not PixA_targ.flags['C_CONTIGUOUS']:
+                PixA_targ = np.ascontiguousarray(PixA_targ, np.float64)
+                PixA_targ_GPU = cp.array(PixA_targ)
+            else: PixA_targ_GPU = cp.array(PixA_targ.astype(np.float64))
+
+            CR = Cuda_Resampling(RESAMP_METHOD='BILINEAR', VERBOSE_LEVEL=1)
+            XX_proj_GPU, YY_proj_GPU = CR.projection_cd(hdr_obj=hdr_obj, hdr_targ=hdr_targ, CDKEY="PC")
+            PixA_Eobj_GPU, EProjDict = CR.frame_extension(XX_proj_GPU=XX_proj_GPU, YY_proj_GPU=YY_proj_GPU, PixA_obj_GPU=PixA_obj_GPU)
+            PixA_resamp = cp.asnumpy(CR.resampling(PixA_Eobj_GPU=PixA_Eobj_GPU, EProjDict=EProjDict))
+            return PixA_resamp
+
+        PixA_resamp = run_resample(FITS_obj=FITS_ORI, FITS_targ=FITS_TARG)
+        PixA_resamp[np.isnan(PixA_resamp)] = FILL_VALUE
+        with fits.open(FITS_TARG) as hdl:
+            hdl[0].data[:, :] = PixA_resamp.T
+            hdl.writeto(FITS_resamp, overwrite=True)
         
         PixA_resamp = fits.getdata(FITS_resamp, ext=0).T
         os.system('rm -rf %s' %TDIR)
diff --git a/sfft/utils/SExSkySubtract.py b/sfft/utils/SExSkySubtract.py
index ad423c6..fa0e439 100644
--- a/sfft/utils/SExSkySubtract.py
+++ b/sfft/utils/SExSkySubtract.py
@@ -119,4 +119,4 @@ def SSS(FITS_obj, FITS_skysub=None, FITS_sky=None, FITS_skyrms=None, SATUR_KEY='
                 hdl[0].data[:, :] = PixA_skyrms.T
                 hdl.writeto(FITS_skyrms, overwrite=True)
 
-        return SKYDIP, SKYPEAK, PixA_skysub, PixA_sky, PixA_skyrms
+        return SKYDIP, SKYPEAK, PixA_skysub, PixA_sky, PixA_skyrms, DETECT_MASK
diff --git a/sfft/utils/pyAstroMatic/PYSEx.py b/sfft/utils/pyAstroMatic/PYSEx.py
index c58af2b..758473c 100644
--- a/sfft/utils/pyAstroMatic/PYSEx.py
+++ b/sfft/utils/pyAstroMatic/PYSEx.py
@@ -19,11 +19,11 @@
 class PY_SEx:
     @staticmethod
     def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_LDAC', \
-        GAIN_KEY='GAIN', SATUR_KEY='SATURATE', PIXEL_SCALE=1.0, SEEING_FWHM=1.2, BACK_TYPE='AUTO', \
-        BACK_VALUE=0.0, BACK_SIZE=64, BACK_FILTERSIZE=3, USE_FILT=True, DETECT_THRESH=1.5, ANALYSIS_THRESH=1.5, \
-        DETECT_MINAREA=5, DETECT_MAXAREA=0, DEBLEND_NTHRESH=32, DEBLEND_MINCONT=0.005, CLEAN='Y', \
-        BACKPHOTO_TYPE='LOCAL', PHOT_APERTURES=5.0, NegativeCorr=True, CHECKIMAGE_TYPE='NONE', \
-        VIGNET=None, STAMP_IMGSIZE=None, AddRD=False, ONLY_FLAGS=None, XBoundary=0.0, YBoundary=0.0, \
+        GAIN_KEY='GAIN', SATUR_KEY='SATURATE', DEFAULT_GAIN=0., DEFAULT_SATUR=50000., PIXEL_SCALE=1.0, 
+        SEEING_FWHM=1.2, BACK_TYPE='AUTO', BACK_VALUE=0.0, BACK_SIZE=64, BACK_FILTERSIZE=3, USE_FILT=True, \
+        DETECT_THRESH=1.5, ANALYSIS_THRESH=1.5, DETECT_MINAREA=5, DETECT_MAXAREA=0, DEBLEND_NTHRESH=32, \
+        DEBLEND_MINCONT=0.005, CLEAN='Y', BACKPHOTO_TYPE='LOCAL', PHOT_APERTURES=5.0, NegativeCorr=True, \
+        CHECKIMAGE_TYPE='NONE', VIGNET=None, STAMP_IMGSIZE=None, AddRD=False, ONLY_FLAGS=None, XBoundary=0.0, YBoundary=0.0, \
         Coor4Match='XY_', XY_Quest=None, Match_xytol=2.0, RD_Quest=None, Match_rdtol=1.0, \
         Preserve_NoMatch=False, MDIR=None, VERBOSE_TYPE='QUIET', VERBOSE_LEVEL=2):
 
@@ -40,7 +40,7 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
                                         # (b) -FITS_ref != None mean dual image mode:
                                         #     SEx detection on -FITS_ref & SEx photometry on -FITS_obj
 
-        -SExParam [None]               # Parameter List (Python list here) of SExtractor output catalog
+        -SExParam [None]                # Parameter List (Python list here) of SExtractor output catalog
                                         # one can use command line 'sex -dp' to find all available parameters
 
         # Configurations for SExtractor:
@@ -55,6 +55,10 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         -SATUR_KEY ['SATURATE']         # SExtractor Parameter SATUR_KEY
                                         # i.e., keyword of the saturation level in the FITS image header
 
+        -DEFAULT_GAIN [0.]              # Set default gain if GAIN_KEY not found
+
+        -DEFAULT_SATUR [50000.]           # Set default saturation if SATUR_KEY not found
+
         -PIXEL_SCALE [1.0]              # SExtractor Parameter PIXEL_SCALE
                                         # size of pixel in arcsec (0=use FITS WCS info)
                                         # P.S. it only works for surface brightness parameters, 
@@ -487,9 +491,9 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
                 _message = 'SExtractor uses GAIN = [%s] from keyword [%s]!' %(GAIN, GAIN_KEY)
                 print('MeLOn CheckPoint [%s]: %s' %(objname, _message))
         else: 
-            GAIN = 0.0  # infinite GAIN, Poission noise ignored
+            GAIN = DEFAULT_GAIN  # infinite GAIN, Poission noise ignored
             if VERBOSE_LEVEL in [0, 1, 2]:
-                _warn_message = 'SExtractor has to use default GAIN = 0!'
+                _warn_message = f'SExtractor has to use default GAIN = {DEFAULT_GAIN}!'
                 warnings.warn('MeLOn WARNING [%s]: %s' %(objname, _warn_message))
         
         if SATUR_KEY in phr_obj:
@@ -498,9 +502,9 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
                 _message = 'SExtractor uses SATURATION = [%s] from keyword [%s]!' %(SATURATION, SATUR_KEY)
                 print('MeLOn CheckPoint [%s]: %s' %(objname, _message))
         else: 
-            SATURATION = 50000.0
+            SATURATION = DEFAULT_SATUR
             if VERBOSE_LEVEL in [0, 1, 2]:
-                _warn_message = 'SExtractor has to use default SATURATION = 50000.0!'
+                _warn_message = f'SExtractor has to use default SATURATION = {DEFAULT_SATUR}!'
                 warnings.warn('MeLOn WARNING [%s]: %s' %(objname, _warn_message))
         
         """
diff --git a/sfft/utils/pyAstroMatic/PYSWarp.py b/sfft/utils/pyAstroMatic/PYSWarp.py
index bb38afd..5fce77c 100644
--- a/sfft/utils/pyAstroMatic/PYSWarp.py
+++ b/sfft/utils/pyAstroMatic/PYSWarp.py
@@ -14,20 +14,13 @@
 
 class PY_SWarp:
     @staticmethod
-    def PS(FITS_obj, FITS_ref, FITS_resamp=None, \
-        GAIN_KEY='GAIN', SATUR_KEY='SATURATE', GAIN_DEFAULT=0.0, SATLEV_DEFAULT=50000.0, \
-        OVERSAMPLING=1, RESAMPLING_TYPE='LANCZOS3', SUBTRACT_BACK='N', FILL_VALUE=None, \
-        TMPDIR_ROOT=None, VERBOSE_TYPE='NORMAL', VERBOSE_LEVEL=2):
-        
-        """
-        # Inputs & Outputs:
-
-        -FITS_obj []                        # FITS file path of the input image to be resampled
-
-        -FITS_ref []                        # FITS file path of the input image as resampling reference
-
-        -FITS_resamp [None]                 # FITS file path of the output image of resampled -FITS_obj
+    def Mk_ConfigDict(GAIN_KEY='GAIN', SATUR_KEY='SATURATE', GAIN_DEFAULT=0.0, SATLEV_DEFAULT=50000.0, OVERSAMPLING=1, \
+                      RESAMPLE='Y', RESAMPLING_TYPE='LANCZOS3', SUBTRACT_BACK='N', PROJECTION_TYPE='TAN', RESCALE_WEIGHTS='Y', 
+                      WEIGHT_TYPE='NONE', COMBINE='Y', CENTER_TYPE='ALL', CENTER = 0.0, CELESTIAL_TYPE='NATIVE', 
+                      PIXELSCALE_TYPE='MEDIAN', PIXEL_SCALE=0.0, NTHREADS=0, \
+                      COMBINE_TYPE='MEDIAN', IMAGE_SIZE=0, WEIGHT_SUFFIX='.weight.fits', WRITE_XML='N', VERBOSE_TYPE='NORMAL'):
 
+        """
         # SWarp parameters:
 
         -GAIN_KEY ['GAIN']                  # SWarp Parameter GAIN_KEYWORD
@@ -46,6 +39,7 @@ def PS(FITS_obj, FITS_ref, FITS_resamp=None, \
                                             # P.S. Here I changed the default value from 0 to 1
                                             # NOTE: large OVERSAMPLING may cause higher pixel correlation
 
+        -RESAMPLE ['Y']                     # SWarp parameter RESAMPLE
         -RESAMPLING_TYPE ['LANCZOS3']       # SWarp Parameter RESAMPLING_TYPE
                                             # NEAREST,BILINEAR,LANCZOS2,LANCZOS3
                                             # LANCZOS4 (1 per axis) or FLAGS
@@ -54,9 +48,77 @@ def PS(FITS_obj, FITS_ref, FITS_resamp=None, \
         -SUBTRACT_BACK ['N']                # SWarp Parameter SUBTRACT_BACK 
                                             # Subtraction sky background (Y/N)? (all or for each image)
                                             # P.S. Here I changed the default value from 'Y' to 'N'
+
+        -PROJECTION_TYPE ['TAN']            # SWarp Parameter PROJECTION_TYPE
+
+        -CENTER_TYPE ['ALL']                # SWarp parameter CENTER_TYPE
+        -CENTER [0.0]                       # SWarp parameter CENTER
+        -CELESTIAL_TYPE ['NATIVE']          # SWarp parameter CELESTIAL_TYPE
+
+        -PIXELSCALE_TYPE ['MEDIAN']         # SWarp parameter PIXELSCALE_TYPE
+        -PIXEL_SCALE [0.0]                  # SWarp parameter PIXEL_SCALE
+
+        -NTHREADS [0]                       # SWarp parameter NTHREADS
+
+        -RESCALE_WEIGHTS ['Y']              # SWarp Parameter RESCALE_WEIGHTS
+        -WEIGHT_TYPE ['NONE']               # SWarp Parameter WEIGHT_TYPE
+
+        -IMAGE_SIZE [0]                     # SWarp parameter IMAGE_SIZE
                                        
         -VERBOSE_TYPE ['NORMAL']            # SWarp Parameter VERBOSE_TYPE
-                                            # QUIET,LOG,NORMAL, or FULL
+                                            # QUIET, LOG, NORMAL, or FULL
+
+        """
+
+        ConfigDict = {}
+        ConfigDict['GAIN_KEYWORD'] = '%s' %GAIN_KEY
+        ConfigDict['SATLEV_KEYWORD'] = '%s' %SATUR_KEY
+
+        ConfigDict['GAIN_DEFAULT'] = '%s' %GAIN_DEFAULT
+        ConfigDict['SATLEV_DEFAULT'] = '%s' %SATLEV_DEFAULT
+
+        ConfigDict['OVERSAMPLING'] = '%d' %OVERSAMPLING
+        ConfigDict['RESAMPLE'] = '%s' %RESAMPLE
+        ConfigDict['RESAMPLING_TYPE'] = '%s' %RESAMPLING_TYPE
+        ConfigDict['SUBTRACT_BACK'] = '%s' %SUBTRACT_BACK
+
+        ConfigDict['PROJECTION_TYPE'] = '%s' %PROJECTION_TYPE
+        ConfigDict['CENTER_TYPE'] = '%s' %CENTER_TYPE
+        ConfigDict['CENTER'] = '%s' %CENTER
+        ConfigDict['CELESTIAL_TYPE'] = '%s' %CELESTIAL_TYPE
+
+        ConfigDict['PIXELSCALE_TYPE'] = '%s' %PIXELSCALE_TYPE
+        ConfigDict['PIXEL_SCALE'] = '%d' %PIXEL_SCALE
+
+        ConfigDict['NTHREADS'] = '%s' %NTHREADS
+
+        ConfigDict['WEIGHT_TYPE'] = '%s' %WEIGHT_TYPE
+        ConfigDict['RESCALE_WEIGHTS'] = '%s' %RESCALE_WEIGHTS
+        ConfigDict['IMAGE_SIZE'] = '%d' %IMAGE_SIZE
+
+        ConfigDict['COMBINE'] = '%s' %COMBINE  # trivial here
+        ConfigDict['COMBINE_TYPE'] = '%s' %COMBINE_TYPE  # trivial here
+        
+        ConfigDict['WEIGHT_SUFFIX'] = '%s' %WEIGHT_SUFFIX  # trivial here
+        ConfigDict['WRITE_XML'] = 'N'
+        ConfigDict['VERBOSE_TYPE'] = '%s' %VERBOSE_TYPE
+        
+        return ConfigDict
+
+    @staticmethod
+    def PS(FITS_obj, FITS_ref, ConfigDict, FITS_resamp=None, \
+        FILL_VALUE=None, TMPDIR_ROOT=None, VERBOSE_LEVEL=2):
+        
+        """
+        # Inputs & Outputs:
+
+        -FITS_obj []                        # FITS file path of the input image to be resampled
+
+        -FITS_ref []                        # FITS file path of the input image as resampling reference
+
+        -ConfigDict []                      # Dictionary for config file, input from PY_SWarp.Mk_ConfigDict(). 
+
+        -FITS_resamp [None]                 # FITS file path of the output image of resampled -FITS_obj
 
         # Other parameters:
 
@@ -125,25 +187,7 @@ def PS(FITS_obj, FITS_ref, FITS_resamp=None, \
         # * make directory as a workplace
         TDIR = mkdtemp(suffix=None, prefix='PYSWarp_', dir=TMPDIR_ROOT)
 
-        # * create SWarp configuration file in TDIR
-        ConfigDict = {}
-        ConfigDict['GAIN_KEYWORD'] = '%s' %GAIN_KEY
-        ConfigDict['SATLEV_KEYWORD'] = '%s' %SATUR_KEY
-
-        ConfigDict['GAIN_DEFAULT'] = '%s' %GAIN_DEFAULT
-        ConfigDict['SATLEV_DEFAULT'] = '%s' %SATLEV_DEFAULT
-
-        ConfigDict['OVERSAMPLING'] = '%d' %OVERSAMPLING
-        ConfigDict['RESAMPLING_TYPE'] = '%s' %RESAMPLING_TYPE
-        ConfigDict['SUBTRACT_BACK'] = '%s' %SUBTRACT_BACK
-
-        ConfigDict['COMBINE'] = 'Y'  # trivial here
-        ConfigDict['COMBINE_TYPE'] = 'MEDIAN'  # trivial here
-        
-        ConfigDict['WEIGHT_SUFFIX'] = '.weight.fits'  # trivial here
-        ConfigDict['WRITE_XML'] = 'N'
-        ConfigDict['VERBOSE_TYPE'] = '%s' %VERBOSE_TYPE
-        
+        # * create SWarp configuration file in TDIR        
         swarp_config_path = AMConfig_Maker.AMCM(MDIR=TDIR, \
             AstroMatic_KEY='swarp', ConfigDict=ConfigDict, tag='PYSWarp')
 
@@ -178,8 +222,8 @@ def PS(FITS_obj, FITS_ref, FITS_resamp=None, \
         
         # * update header by the SWarp generated saturation level
         NEW_SATUR = fits.getheader(tFITS_resamp, ext=0)['SATURATE']
-        if SATUR_KEY in hdr_op:
-            hdr_op[SATUR_KEY] = (NEW_SATUR, 'MeLOn: PYSWarp')
+        if ConfigDict['SATLEV_KEYWORD'] in hdr_op:
+            hdr_op[ConfigDict['SATLEV_KEYWORD']] = (NEW_SATUR, 'MeLOn: PYSWarp')
 
         # * add history
         hdr_op['SWARP_O'] = (pa.basename(FITS_obj), 'MeLOn: PYSWarp')
@@ -208,3 +252,108 @@ def PS(FITS_obj, FITS_ref, FITS_resamp=None, \
         os.system('rm -rf %s'%TDIR)
 
         return PixA_resamp, MissingMask
+
+    @staticmethod
+    def Coadd(FITS_obj, FITS_ref, ConfigDict, OUT_path, \
+        FILL_VALUE=None, TMPDIR_ROOT=None, VERBOSE_LEVEL=2):
+
+        """
+        # Inputs & Outputs:
+
+        -FITS_obj []                        # List of FITS file paths of the input images to be resampled
+
+        -FITS_ref []                        # FITS file path of the input image as resampling reference
+
+        -ConfigDict []                      # Dictionary for config file, input from PY_SWarp.Mk_ConfigDict(). 
+
+        -FITS_resamp [None]                 # FITS file path of the output image of resampled, coadded -FITS_obj
+
+        # Other parameters:
+
+        -FILL_VALUE [None]                  # How to fill the invalid (boundary) pixels in -FITS_resamp and -FITS_resamp_weight
+                                            # e.g., -FILL_VALUE = 0.0, -FILL_VALUE = np.nan
+
+        -VERBOSE_LEVEL [2]                  # The level of verbosity, can be [0, 1, 2]
+                                            # 0/1/2: QUIET/NORMAL/FULL mode
+                                            # NOTE: it only controls the verbosity out of SWarp.
+
+        -TMPDIR_ROOT [None]                 # Specify root directory of temporary working directory. 
+
+        # Returns:
+
+            PixA_resamp                     # Pixel array of the resampled, coadded image
+                                            # P.S. PixA_resamp = fits.getdata(OUT_path, ext=0).T
+            
+            PixA_resamp_weight              # Pixel array of the weight map
+        
+        # * Remarks on PYSWarp
+        #       
+        #   [1] In SWarp, one need provide a target WCS-frame stored in fname.head, with name coincident with FITS_resamp.
+        #       If no target WCS-frame is available, the output image will have a canonical N-E orientation.
+        #       However, the current python wrapper PYSWarp must read a target WCS-frame from a given FITS_ref.
+        #
+        #   [2] SWarp allows users to feed a corresponding weight map as input.
+        #       However, the current python wrapper PYSWarp does not support this feature.
+        #
+        #   [3] SWarp will automatically calculate the following keywords and set them into the output image.
+        #       > EXPTIME: sum of exposure times in the part of coadd with the most overlaps.
+        #       > GAIN: effective gain with flux and weighting applied.
+        #         Note that any change in pixel scale only redistribute the flux with total flux conservative,
+        #         therefore, the image resampling does not alter the GAIN value.
+        #       > SATURATE: Minimum of all input saturation levels with flux scaling applied.
+        #         Note that a change in pixel scale can alter the saturation level.
+        #       > MJD-OBS: MJD of earliest start of exposures 
+        #
+        """
+
+        # * check FITS_obj headers
+        for obj in FITS_obj:
+            phr_obj = fits.getheader(obj, ext=0)
+
+            # a mild warning
+            if VERBOSE_LEVEL in [1, 2]:
+                if 'EXPTIME' not in phr_obj:
+                    _warn_message = 'SWarp cannot find keyword EXPTIME in FITS header of [%s]!' %obj
+                    warnings.warn('MeLOn WARNING: %s' %_warn_message)
+            
+            # a mild warning
+            if VERBOSE_LEVEL in [1, 2]:
+                if 'MJD-OBS' not in phr_obj:
+                    _warn_message = 'SWarp cannot find keyword MJD-OBS in FITS header of [%s]!' %obj
+                    warnings.warn('MeLOn WARNING: %s' %_warn_message)
+
+        # * make directory as a workplace
+        TDIR = mkdtemp(suffix=None, prefix='PYSWarp_', dir=TMPDIR_ROOT)
+
+        # * create SWarp configuration file in TDIR        
+        swarp_config_path = AMConfig_Maker.AMCM(MDIR=TDIR, \
+            AstroMatic_KEY='swarp', ConfigDict=ConfigDict, tag='PYSWarp')
+
+        FITS_obj_str = ' '.join(FITS_obj)
+        weightOUT_path = OUT_path[:-5] + '.weight.fits'
+
+        # * trigger SWarp 
+        command = "cd %s && swarp %s -IMAGEOUT_NAME %s -WEIGHTOUT_NAME %s -c %s"  \
+        %(TDIR, FITS_obj_str, OUT_path, weightOUT_path, swarp_config_path)
+        print('MeLOn CheckPoint: Run SWarp Command ... \n %s' %command)
+        os.system(command)
+        os.system('rm -rf %s'%TDIR)
+
+        # * fill the missing data in resampled image [SWarp default 0]
+        PixA_resamp, MissingMask = None, None
+        try: 
+            PixA_resamp = fits.getdata(OUT_path, ext=0).T
+            PixA_resamp_weight = fits.getdata(weightOUT_path, ext=0).T
+            MissingMask = PixA_resamp_weight == 0
+            if FILL_VALUE is not None:
+                PixA_resamp[MissingMask] = FILL_VALUE
+            if OUT_path is not None:
+                hdr_op = fits.getheader(OUT_path, ext=0)
+                hdl_op = fits.HDUList(fits.PrimaryHDU(PixA_resamp.T, header=hdr_op))
+                hdl_op.writeto(OUT_path, overwrite=True)
+        except:
+            if VERBOSE_LEVEL in [0, 1, 2]:
+                _warn_message = 'SWarp FAILED on [%s]!' %FITS_obj
+                warnings.warn('MeLOn WARNING: %s' %_warn_message)
+
+        return PixA_resamp, PixA_resamp_weight
\ No newline at end of file