From 52e02c694150cc928e5d087ea3b2d104cf0a3218 Mon Sep 17 00:00:00 2001 From: royinx Date: Mon, 7 Aug 2023 00:24:55 -0400 Subject: [PATCH 1/8] feat: add cupy encoder --- samples/SampleCupy.py | 243 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 samples/SampleCupy.py diff --git a/samples/SampleCupy.py b/samples/SampleCupy.py new file mode 100644 index 00000000..f3ffec60 --- /dev/null +++ b/samples/SampleCupy.py @@ -0,0 +1,243 @@ +# +# Copyright 2023 @royinx + +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Starting from Python 3.8 DLL search policy has changed. +# We need to add path to CUDA DLLs explicitly. +import sys +import os +from typing import Any +import PyNvCodec as nvc +import numpy as np +import cupy as cp + +class cconverter: + """ + Colorspace conversion chain. + """ + + def __init__(self, width: int, height: int, gpu_id: int): + self.gpu_id = gpu_id + self.w = width + self.h = height + self.chain = [] + + def add(self, src_fmt: nvc.PixelFormat, dst_fmt: nvc.PixelFormat) -> None: + self.chain.append( + nvc.PySurfaceConverter(self.w, self.h, src_fmt, dst_fmt, self.gpu_id) + ) + + def run(self, src_surface: nvc.Surface) -> nvc.Surface: + surf = src_surface + cc = nvc.ColorspaceConversionContext(nvc.ColorSpace.BT_601, nvc.ColorRange.MPEG) + + for cvt in self.chain: + surf = cvt.Execute(surf, cc) + if surf.Empty(): + raise RuntimeError("Failed to perform color conversion") + + return surf.Clone(self.gpu_id) + +class CupyNVC: + def get_memptr(self, surface: nvc.Surface) -> int: + return surface.PlanePtr().GpuMem() + + def SurfaceToArray(self, surface: nvc.Surface) -> cp.array: + """ + Converts surface to cupy unit8 tensor. 
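+        The conversion is zero-copy: the returned array wraps the surface's GPU memory
+        with cp.cuda.UnownedMemory, so no device-to-device copy is made.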
+ + - surface: nvc.Surface + - return: cp.array (height, width, 3) + """ + if surface.Format() != nvc.PixelFormat.RGB: + raise RuntimeError("Surface shall be of RGB PLANAR format , got {}".format(surface.Format())) + plane = surface.PlanePtr() + # cuPy array zero copy non ownned + height, width, pitch = (plane.Height(), plane.Width(), plane.Pitch()) + cupy_mem = cp.cuda.UnownedMemory(self.get_memptr(surface), height * width * 1, surface) + cupy_memptr = cp.cuda.MemoryPointer(cupy_mem, 0) + # cupy_frame = cp.ndarray((height//3, width, 3), cp.uint8, cupy_memptr, strides=(pitch,1,int(pitch*height/width))) # RGB_PLANAR + cupy_frame = cp.ndarray((height, width // 3, 3), cp.uint8, cupy_memptr, strides=(pitch, 3, 1)) # RGB + + return cupy_frame + + # def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: + # ker_string = """ + # extern "C"{ + # __global__ void memcpyKer(unsigned char *dst, unsigned char *src, int len) + # { + # int idx = blockIdx.x * blockDim.x + threadIdx.x; + # for (; idx < len; idx += blockDim.x) memcpy(&dst[idx], &src[idx], sizeof(unsigned char)); + # } + # } + # """ + # module = cp.RawModule(code=ker_string) + # memcpyKer = module.get_function("memcpyKer") + # with cp.cuda.stream.Stream() as stream: + # memcpyKer((1,), (1024,), #(int(img_array.size//1024)+1,1) + # (int(self.get_memptr(surface)), img_array.data.ptr, img_array.size) + # ) + # stream.synchronize() + # return + def _validate(self, surface: nvc.Surface, img_array: cp.array) -> None: + validate_string = """ + extern "C"{ + __global__ void validate_value(unsigned char *dst, unsigned char *src, int len) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (; idx < len; idx += blockDim.x) + printf("%d %d %d\\n", idx, src[idx], dst[idx]); + } + } + """ + module = cp.RawModule(code=validate_string) + validateKer = module.get_function("validate_value") + print(img_array.size) + with cp.cuda.stream.Stream(null=True) as stream: + validateKer((1,), (1024,), #(int(img_array.size//1024)+1,1) + (int(self.get_memptr(surface)), img_array.data.ptr, img_array.size) + ) + stream.synchronize() + return + + def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: + cp.cuda.runtime.memcpy(dst = self.get_memptr(surface), # dst_ptr + src = img_array.data.ptr, # src_ptr + size=img_array.nbytes, + kind=4) # + # with cp.cuda.stream.Stream(null=True) as stream: + # cp.cuda.runtime.memcpy2D(self.get_memptr(surface), + # surface.Pitch(), + # img_array.data.ptr, + # surface.Width(), + # surface.Width(), + # surface.Height(), + # cp.cuda.runtime.memcpyDeviceToDevice) # stream.ptr: 0 + # stream.synchronize() + # self._validate(surface, img_array) + return + + def ArrayToSurface(self, img_array: cp.array, gpu_id: int) -> nvc.Surface: + """ + Converts cupy ndarray to rgb surface. 
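+        The input is expected to be a packed (height, width, 3) RGB array; it is cast to
+        uint8, transposed to planar CHW layout and copied into a new RGB_PLANAR surface.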
+ - surface: cp.array + - return: nvc.Surface + """ + img_array = img_array.astype(cp.uint8) + img_array = cp.transpose(img_array, (2,0,1)) # HWC + img_array = cp.ascontiguousarray(img_array) + _ , tensor_h , tensor_w = img_array.shape + surface = nvc.Surface.Make(nvc.PixelFormat.RGB_PLANAR, tensor_w, tensor_h, gpu_id) # HWC + self._memcpy(surface, img_array) + return surface + +def to_grayscale(img_array: cp.array) -> cp.array: + img_array = cp.matmul(img_array, cp.array([0.299, 0.587, 0.114]).T) + img_array = cp.expand_dims(img_array, axis=-1) + img_array = cp.tile(img_array, (1,1,3)) # view as 3 channel image (packed RGB: HWC) + return img_array + +def contrast_boost(img_array: cp.array) -> cp.array: + """ + histogram equalization + """ + channel_min = cp.quantile(img_array, 0.05, axis=(0,1)) + channel_max = cp.quantile(img_array, 0.95, axis=(0,1)) + img_array = img_array.astype(cp.float32) + for c, (cmin, cmax) in enumerate(zip(channel_min, channel_max)): + img_array[c] = cp.clip(img_array[c], cmin, cmax) + img_array = img_array- channel_min.reshape(1,1,-1) + img_array /= (channel_max - channel_min).reshape(1,1,-1) + img_array = cp.multiply(img_array, 255.0) + return img_array + +def main(gpu_id, encFilePath, dstFilePath): + dstFile = open(dstFilePath, "wb") + nvDec = nvc.PyNvDecoder(encFilePath, gpu_id) + cpnvc = CupyNVC() + + w = nvDec.Width() + h = nvDec.Height() + res = str(w) + "x" + str(h) + nvEnc = nvc.PyNvEncoder( + {"preset": "P4", "codec": "h264", "s": res, "bitrate": "10M"}, gpu_id + ) + + # Surface converters + to_rgb = cconverter(w, h, gpu_id) + to_rgb.add(nvc.PixelFormat.NV12, nvc.PixelFormat.YUV420) + to_rgb.add(nvc.PixelFormat.YUV420, nvc.PixelFormat.RGB) + + to_nv12 = cconverter(w, h, gpu_id) + to_nv12.add(nvc.PixelFormat.RGB_PLANAR, nvc.PixelFormat.RGB) + to_nv12.add(nvc.PixelFormat.RGB, nvc.PixelFormat.YUV420) + to_nv12.add(nvc.PixelFormat.YUV420, nvc.PixelFormat.NV12) + + # Encoded video frame + encFrame = np.ndarray(shape=(0), dtype=np.uint8) + while True: + # Decode NV12 surface + src_surface = nvDec.DecodeSingleSurface() + if src_surface.Empty(): + break + + # Convert to packed RGB: HWC , planar CHW + rgb_sur = to_rgb.run(src_surface) + if rgb_sur.Empty(): + break + + # PROCESS YOUR TENSOR HERE. + # THIS DUMMY PROCESSING JUST ADDS RANDOM ROTATION. 
+ src_array = cpnvc.SurfaceToArray(rgb_sur) + dst_array = contrast_boost(src_array) + dst_array = to_grayscale(dst_array) + surface_rgb = cpnvc.ArrayToSurface(dst_array, gpu_id) + + # Convert back to NV12 + dst_surface = to_nv12.run(surface_rgb) + if src_surface.Empty(): + break + + # Encode + success = nvEnc.EncodeSingleSurface(dst_surface, encFrame) + if success: + byteArray = bytearray(encFrame) + dstFile.write(byteArray) + + # Encoder is asynchronous, so we need to flush it + while True: + success = nvEnc.FlushSinglePacket(encFrame) + if success: + byteArray = bytearray(encFrame) + dstFile.write(byteArray) + else: + break + + +if __name__ == "__main__": + + + if len(sys.argv) < 4: + print("This sample transcode and process with pytorch an input video on given GPU.") + print("Provide gpu ID, path to input and output files") + print("Usage: SamplePyTorch.py $gpu_id $input_file $output_file.") + print("Example: \npython3 samples/SampleCupy.py 0 tests/test.mp4 tests/dec_test.mp4") + exit(1) + + gpu_id = int(sys.argv[1]) + encFilePath = sys.argv[2] + decFilePath = sys.argv[3] + main(gpu_id, encFilePath, decFilePath) From 3fe49440e7b949697e4ae568fc0664e2c6811ae0 Mon Sep 17 00:00:00 2001 From: royinx Date: Wed, 9 Aug 2023 16:37:49 -0400 Subject: [PATCH 2/8] feat: support cupy pointer --- samples/SampleCupy.py | 73 +++++++++---------------------------------- 1 file changed, 14 insertions(+), 59 deletions(-) diff --git a/samples/SampleCupy.py b/samples/SampleCupy.py index f3ffec60..f108fe6b 100644 --- a/samples/SampleCupy.py +++ b/samples/SampleCupy.py @@ -69,67 +69,22 @@ def SurfaceToArray(self, surface: nvc.Surface) -> cp.array: height, width, pitch = (plane.Height(), plane.Width(), plane.Pitch()) cupy_mem = cp.cuda.UnownedMemory(self.get_memptr(surface), height * width * 1, surface) cupy_memptr = cp.cuda.MemoryPointer(cupy_mem, 0) - # cupy_frame = cp.ndarray((height//3, width, 3), cp.uint8, cupy_memptr, strides=(pitch,1,int(pitch*height/width))) # RGB_PLANAR cupy_frame = cp.ndarray((height, width // 3, 3), cp.uint8, cupy_memptr, strides=(pitch, 3, 1)) # RGB return cupy_frame - # def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: - # ker_string = """ - # extern "C"{ - # __global__ void memcpyKer(unsigned char *dst, unsigned char *src, int len) - # { - # int idx = blockIdx.x * blockDim.x + threadIdx.x; - # for (; idx < len; idx += blockDim.x) memcpy(&dst[idx], &src[idx], sizeof(unsigned char)); - # } - # } - # """ - # module = cp.RawModule(code=ker_string) - # memcpyKer = module.get_function("memcpyKer") - # with cp.cuda.stream.Stream() as stream: - # memcpyKer((1,), (1024,), #(int(img_array.size//1024)+1,1) - # (int(self.get_memptr(surface)), img_array.data.ptr, img_array.size) - # ) - # stream.synchronize() - # return - def _validate(self, surface: nvc.Surface, img_array: cp.array) -> None: - validate_string = """ - extern "C"{ - __global__ void validate_value(unsigned char *dst, unsigned char *src, int len) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - for (; idx < len; idx += blockDim.x) - printf("%d %d %d\\n", idx, src[idx], dst[idx]); - } - } - """ - module = cp.RawModule(code=validate_string) - validateKer = module.get_function("validate_value") - print(img_array.size) + def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: with cp.cuda.stream.Stream(null=True) as stream: - validateKer((1,), (1024,), #(int(img_array.size//1024)+1,1) - (int(self.get_memptr(surface)), img_array.data.ptr, img_array.size) - ) + 
cp.cuda.runtime.memcpy2D(self.get_memptr(surface), + surface.Pitch(), + img_array.data.ptr, + surface.Width(), + surface.Width(), + surface.Height()*3, + cp.cuda.runtime.memcpyDeviceToDevice) # stream.ptr: 0 stream.synchronize() return - def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: - cp.cuda.runtime.memcpy(dst = self.get_memptr(surface), # dst_ptr - src = img_array.data.ptr, # src_ptr - size=img_array.nbytes, - kind=4) # - # with cp.cuda.stream.Stream(null=True) as stream: - # cp.cuda.runtime.memcpy2D(self.get_memptr(surface), - # surface.Pitch(), - # img_array.data.ptr, - # surface.Width(), - # surface.Width(), - # surface.Height(), - # cp.cuda.runtime.memcpyDeviceToDevice) # stream.ptr: 0 - # stream.synchronize() - # self._validate(surface, img_array) - return - def ArrayToSurface(self, img_array: cp.array, gpu_id: int) -> nvc.Surface: """ Converts cupy ndarray to rgb surface. @@ -137,14 +92,14 @@ def ArrayToSurface(self, img_array: cp.array, gpu_id: int) -> nvc.Surface: - return: nvc.Surface """ img_array = img_array.astype(cp.uint8) - img_array = cp.transpose(img_array, (2,0,1)) # HWC + img_array = cp.transpose(img_array, (2,0,1)) # HWC to CHW img_array = cp.ascontiguousarray(img_array) - _ , tensor_h , tensor_w = img_array.shape - surface = nvc.Surface.Make(nvc.PixelFormat.RGB_PLANAR, tensor_w, tensor_h, gpu_id) # HWC + _ ,tensor_h , tensor_w= img_array.shape + surface = nvc.Surface.Make(nvc.PixelFormat.RGB_PLANAR, tensor_w, tensor_h, gpu_id) self._memcpy(surface, img_array) return surface -def to_grayscale(img_array: cp.array) -> cp.array: +def grayscale(img_array: cp.array) -> cp.array: img_array = cp.matmul(img_array, cp.array([0.299, 0.587, 0.114]).T) img_array = cp.expand_dims(img_array, axis=-1) img_array = cp.tile(img_array, (1,1,3)) # view as 3 channel image (packed RGB: HWC) @@ -164,7 +119,7 @@ def contrast_boost(img_array: cp.array) -> cp.array: img_array = cp.multiply(img_array, 255.0) return img_array -def main(gpu_id, encFilePath, dstFilePath): +def main(gpu_id: int, encFilePath: str, dstFilePath: str): dstFile = open(dstFilePath, "wb") nvDec = nvc.PyNvDecoder(encFilePath, gpu_id) cpnvc = CupyNVC() @@ -203,7 +158,7 @@ def main(gpu_id, encFilePath, dstFilePath): # THIS DUMMY PROCESSING JUST ADDS RANDOM ROTATION. 
src_array = cpnvc.SurfaceToArray(rgb_sur) dst_array = contrast_boost(src_array) - dst_array = to_grayscale(dst_array) + dst_array = grayscale(dst_array) surface_rgb = cpnvc.ArrayToSurface(dst_array, gpu_id) # Convert back to NV12 From 7e01549e43558e2c81fb9ca1b54df9c0b92446fa Mon Sep 17 00:00:00 2001 From: royinx Date: Wed, 9 Aug 2023 16:43:39 -0400 Subject: [PATCH 3/8] feat: support add cupy example --- samples/SampleCupy.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/samples/SampleCupy.py b/samples/SampleCupy.py index f108fe6b..09c095a0 100644 --- a/samples/SampleCupy.py +++ b/samples/SampleCupy.py @@ -74,15 +74,14 @@ def SurfaceToArray(self, surface: nvc.Surface) -> cp.array: return cupy_frame def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: - with cp.cuda.stream.Stream(null=True) as stream: - cp.cuda.runtime.memcpy2D(self.get_memptr(surface), - surface.Pitch(), - img_array.data.ptr, - surface.Width(), - surface.Width(), - surface.Height()*3, - cp.cuda.runtime.memcpyDeviceToDevice) # stream.ptr: 0 - stream.synchronize() + cp.cuda.runtime.memcpy2DAsync(self.get_memptr(surface), + surface.Pitch(), + img_array.data.ptr, + surface.Width(), + surface.Width(), + surface.Height()*3, + cp.cuda.runtime.memcpyDeviceToDevice, + 0) # null_stream.ptr: 0 return def ArrayToSurface(self, img_array: cp.array, gpu_id: int) -> nvc.Surface: From 37ca2d21b4a27c59d56aed8cd6e69f91a1f4efa0 Mon Sep 17 00:00:00 2001 From: royinx Date: Mon, 14 Aug 2023 19:10:59 -0400 Subject: [PATCH 4/8] chore: update error msg --- samples/SampleCupy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/SampleCupy.py b/samples/SampleCupy.py index 09c095a0..994cdc6a 100644 --- a/samples/SampleCupy.py +++ b/samples/SampleCupy.py @@ -185,9 +185,9 @@ def main(gpu_id: int, encFilePath: str, dstFilePath: str): if len(sys.argv) < 4: - print("This sample transcode and process with pytorch an input video on given GPU.") + print("This sample transcode and process with cupy an input video on given GPU.") print("Provide gpu ID, path to input and output files") - print("Usage: SamplePyTorch.py $gpu_id $input_file $output_file.") + print("Usage: SampleCupy.py $gpu_id $input_file $output_file.") print("Example: \npython3 samples/SampleCupy.py 0 tests/test.mp4 tests/dec_test.mp4") exit(1) From 4dee59659b49f4f809dcc039fe59a6fb722faca4 Mon Sep 17 00:00:00 2001 From: royinx Date: Tue, 15 Aug 2023 11:27:10 -0400 Subject: [PATCH 5/8] chore: update memos --- samples/SampleCupy.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/samples/SampleCupy.py b/samples/SampleCupy.py index 994cdc6a..d7dd172f 100644 --- a/samples/SampleCupy.py +++ b/samples/SampleCupy.py @@ -154,7 +154,7 @@ def main(gpu_id: int, encFilePath: str, dstFilePath: str): break # PROCESS YOUR TENSOR HERE. - # THIS DUMMY PROCESSING JUST ADDS RANDOM ROTATION. + # THIS DUMMY PROCESSING JUST ADDS GRAYSCALE AND ENCHANCE CONTRAST. 
src_array = cpnvc.SurfaceToArray(rgb_sur) dst_array = contrast_boost(src_array) dst_array = grayscale(dst_array) @@ -182,13 +182,9 @@ def main(gpu_id: int, encFilePath: str, dstFilePath: str): if __name__ == "__main__": - - if len(sys.argv) < 4: print("This sample transcode and process with cupy an input video on given GPU.") - print("Provide gpu ID, path to input and output files") - print("Usage: SampleCupy.py $gpu_id $input_file $output_file.") - print("Example: \npython3 samples/SampleCupy.py 0 tests/test.mp4 tests/dec_test.mp4") + print("[Usage]: python3 samples/SampleCupy.py ") exit(1) gpu_id = int(sys.argv[1]) From c3380a80cd44866b9ba3add46e7319d181a73d75 Mon Sep 17 00:00:00 2001 From: royinx Date: Tue, 15 Aug 2023 11:27:51 -0400 Subject: [PATCH 6/8] feat: add Cupy TensorRT --- samples/SampleCupyTensorRT.py | 229 ++++++++++++++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 samples/SampleCupyTensorRT.py diff --git a/samples/SampleCupyTensorRT.py b/samples/SampleCupyTensorRT.py new file mode 100644 index 00000000..fb07b3b6 --- /dev/null +++ b/samples/SampleCupyTensorRT.py @@ -0,0 +1,229 @@ +# +# Copyright 2023 @royinx + +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Starting from Python 3.8 DLL search policy has changed. +# We need to add path to CUDA DLLs explicitly. +import sys +sys.path.append(".") +import os +from typing import Any +import PyNvCodec as nvc +import tensorrt as trt +import numpy as np +import cupy as cp +from samples.SampleTensorRTResnet import resnet_categories + +class TensorRT: + def __init__(self,engine_file): + super().__init__() + self.TRT_LOGGER = trt.Logger() + self.engine = self.get_engine(engine_file) + self.context = self.engine.create_execution_context() + self.allocate_buffers() + + def get_engine(self, engine_file_path): + if not os.path.exists(engine_file_path): + raise "run ./samples/SampleTensorRTResnet.py to generate engine file" + print("Reading engine from file {}".format(engine_file_path)) + with open(engine_file_path, "rb") as f, \ + trt.Runtime(self.TRT_LOGGER) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def allocate_buffers(self): + """ + In this Application, we use cupy for in and out + + trt use gpu array to run inference. + while bindings store the gpu array ptr , via the method : + cupy.ndarray.data.ptr + cupu.cuda.alloc_pinned_memory + cupy.cuda.runtime.malloc.mem_alloc + """ + self.inputs = [] + self.outputs = [] + self.bindings = [] + self.stream = cp.cuda.Stream(non_blocking=False) + + for binding in self.engine: + shape = self.engine.get_tensor_shape(binding) + dtype = trt.nptype(self.engine.get_tensor_dtype(binding)) + device_array = cp.empty(shape, dtype) + self.bindings.append(device_array.data.ptr) # cupy array ptr + # Append to the appropriate list. 
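+            # Keeping the cupy arrays in self.inputs / self.outputs also keeps the
+            # allocations referenced, so the raw pointers stored in self.bindings stay valid.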
+ if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT: + self.inputs.append(device_array) + elif self.engine.get_tensor_mode(binding) == trt.TensorIOMode.OUTPUT: + self.outputs.append(device_array) + + def inference(self,inputs:cp.ndarray) -> list: + inputs = cp.ascontiguousarray(inputs) + cp.cuda.runtime.memcpyAsync(dst = self.inputs[0].data.ptr, + src = inputs.data.ptr, + size= inputs.nbytes, + kind = cp.cuda.runtime.memcpyDeviceToDevice, + stream = self.stream.ptr) + self.context.execute_async_v2(bindings=self.bindings, + stream_handle=self.stream.ptr) + self.stream.synchronize() + return [out for out in self.outputs] + + +class cconverter: + """ + Colorspace conversion chain. + """ + + def __init__(self, width: int, height: int, gpu_id: int): + self.gpu_id = gpu_id + self.w = width + self.h = height + self.chain = [] + + def add(self, src_fmt: nvc.PixelFormat, dst_fmt: nvc.PixelFormat) -> None: + self.chain.append( + nvc.PySurfaceConverter(self.w, self.h, src_fmt, dst_fmt, self.gpu_id) + ) + def resize(self, width: int, height: int, src_fmt: nvc.PixelFormat) -> None: + self.chain.append( + nvc.PySurfaceResizer(width, height, src_fmt, self.gpu_id) + ) + self.h = height + self.w = width + + def run(self, src_surface: nvc.Surface) -> nvc.Surface: + surf = src_surface + cc = nvc.ColorspaceConversionContext(nvc.ColorSpace.BT_601, nvc.ColorRange.MPEG) + + for cvt in self.chain: + if isinstance(cvt, nvc.PySurfaceResizer): + surf = cvt.Execute(surf) + else: + surf = cvt.Execute(surf, cc) + if surf.Empty(): + raise RuntimeError("Failed to perform color conversion") + + return surf.Clone(self.gpu_id) + +class CupyNVC: + def get_memptr(self, surface: nvc.Surface) -> int: + return surface.PlanePtr().GpuMem() + + def SurfaceToArray(self, surface: nvc.Surface) -> cp.array: + """ + Converts surface to cupy unit8 tensor. + + - surface: nvc.Surface + - return: cp.array (height, width, 3) + """ + if surface.Format() != nvc.PixelFormat.RGB: + raise RuntimeError("Surface shall be of RGB PLANAR format , got {}".format(surface.Format())) + plane = surface.PlanePtr() + # cuPy array zero copy non ownned + height, width, pitch = (plane.Height(), plane.Width(), plane.Pitch()) + cupy_mem = cp.cuda.UnownedMemory(self.get_memptr(surface), height * width * 1, surface) + cupy_memptr = cp.cuda.MemoryPointer(cupy_mem, 0) + cupy_frame = cp.ndarray((height, width // 3, 3), cp.uint8, cupy_memptr, strides=(pitch, 3, 1)) # RGB + + return cupy_frame + + def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: + cp.cuda.runtime.memcpy2DAsync(self.get_memptr(surface), + surface.Pitch(), + img_array.data.ptr, + surface.Width(), + surface.Width(), + surface.Height()*3, + cp.cuda.runtime.memcpyDeviceToDevice, + 0) # null_stream.ptr: 0 + return + + def ArrayToSurface(self, img_array: cp.array, gpu_id: int) -> nvc.Surface: + """ + Converts cupy ndarray to rgb surface. 
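+        The input is expected to be a packed (height, width, 3) array; it is cast to uint8,
+        transposed to planar CHW layout and copied into a new RGB_PLANAR surface.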
+ - surface: cp.array + - return: nvc.Surface + """ + img_array = img_array.astype(cp.uint8) + img_array = cp.transpose(img_array, (2,0,1)) # HWC to CHW + img_array = cp.ascontiguousarray(img_array) + _ ,tensor_h , tensor_w= img_array.shape + surface = nvc.Surface.Make(nvc.PixelFormat.RGB_PLANAR, tensor_w, tensor_h, gpu_id) + self._memcpy(surface, img_array) + return surface + +def normalize(tensor: cp.array, mean:list , std:list) -> cp.array: + """ + normalize along the last axis + """ + tensor -= cp.array(mean).reshape(1,1,-1) + tensor /= cp.array(std).reshape(1,1,-1) + return tensor + +def main(gpu_id: int, encFilePath: str): + engine = TensorRT("resnet50.trt") + nvDec = nvc.PyNvDecoder(encFilePath, gpu_id) + cpnvc = CupyNVC() + + w = nvDec.Width() + h = nvDec.Height() + + # Surface converters + to_rgb = cconverter(w, h, gpu_id) + to_rgb.add(nvc.PixelFormat.NV12, nvc.PixelFormat.YUV420) + to_rgb.resize(224,224, nvc.PixelFormat.YUV420) + to_rgb.add(nvc.PixelFormat.YUV420, nvc.PixelFormat.RGB) + + # Encoded video frame + while True: + # Decode NV12 surface + src_surface = nvDec.DecodeSingleSurface() + if src_surface.Empty(): + break + + # Convert to packed RGB: HWC , planar CHW + rgb_sur = to_rgb.run(src_surface) + if rgb_sur.Empty(): + break + + # PROCESS YOUR TENSOR HERE. + src_array = cpnvc.SurfaceToArray(rgb_sur) + src_array = src_array.astype(cp.float32) + + # preprocess + src_array /= 255.0 + src_array = normalize(src_array, + mean= [0.485, 0.456, 0.406], + std = [0.229, 0.224, 0.225]) + src_array = cp.transpose(src_array, (2,0,1)) + src_array = cp.expand_dims(src_array, axis=0) # NCHW + + pred = engine.inference(src_array) + pred = pred[0] # extract first output layer + + idx = cp.argmax(pred) + print("Image type: ", resnet_categories[cp.asnumpy(idx)]) + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print("This sample decode and inference an input video with cupy on given GPU.") + print("[Usage]: python3 samples/SampleCupyTensorRT.py ") + exit(1) + + gpu_id = int(sys.argv[1]) + encFilePath = sys.argv[2] + main(gpu_id, encFilePath) From 0d764354fc19cb0a96751d19623b3a4b2b1e2402 Mon Sep 17 00:00:00 2001 From: royinx Date: Tue, 15 Aug 2023 11:29:05 -0400 Subject: [PATCH 7/8] debug: fix the first batch output unsync issue --- samples/SampleTensorRTResnet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/SampleTensorRTResnet.py b/samples/SampleTensorRTResnet.py index f91b24e1..a5a71fba 100644 --- a/samples/SampleTensorRTResnet.py +++ b/samples/SampleTensorRTResnet.py @@ -1169,6 +1169,7 @@ def run_inference(self, tensor_image) -> str: for out in self.outputs: cuda.memcpy_dtoh_async(out.host, out.device, self.stream) + self.stream.synchronize() # Find most probable image type and return resnet categoy description [result] = [out.host for out in self.outputs] return resnet_categories[np.argmax(result)] @@ -1311,7 +1312,7 @@ def infer_on_video(gpu_id: int, input_video: str, trt_nn_file: str): if __name__ == "__main__": if len(sys.argv) < 3: print("Provide gpu id and path to input video file.") - exit + exit(1) gpu_id = int(sys.argv[1]) input_video = sys.argv[2] From 52a269ce22d711e61da7e47f8081f9612a740eb2 Mon Sep 17 00:00:00 2001 From: royinx Date: Tue, 15 Aug 2023 11:31:14 -0400 Subject: [PATCH 8/8] feat: add cupy for build --- setup.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6912485e..a8294d6a 100644 --- a/setup.py +++ b/setup.py @@ -13,10 +13,18 @@ print("Error: version of setuptools is 
too old (<42)!")
         sys.exit(1)
 
+def get_cupy() -> str:
+    CUDA_VERSION = os.environ.get("CUDA_VERSION", None)
+    if CUDA_VERSION is None:
+        raise RuntimeError("CUDA_VERSION environment variable must be set to pick the matching cupy wheel")
+    if CUDA_VERSION >= "11.2":  # CUDA 11.2+ wheels use the per-major-version name, e.g. cupy-cuda11x / cupy-cuda12x
+        cupy_pack = f"cupy-cuda{CUDA_VERSION[:2]}x"
+    else:  # older releases use the exact-version name, e.g. cupy-cuda110
+        cupy_pack = f"cupy-cuda{CUDA_VERSION[:4].replace('.','')}"
+    return cupy_pack
 
 if __name__ == "__main__":
     import skbuild
 
+    cupy = get_cupy()
     PytorchNvCodec = "PytorchNvCodec @ git+https://github.com/NVIDIA/VideoProcessingFramework.git#subdirectory=src/PytorchNvCodec/"
     skbuild.setup(
         name="PyNvCodec",
@@ -28,7 +36,7 @@
         extras_require={
             # , "PyOpenGL-accelerate"  # does not compile on 3.10
            "dev": ["pycuda", "pyopengl", "torch", "torchvision", "opencv-python", "onnx", "tensorrt", f"PytorchNvCodec @ file://{os.getcwd()}/src/PytorchNvCodec/"],
-            "samples": ["pycuda", "pyopengl", "torch", "torchvision", "opencv-python", "onnx", "tensorrt", "tqdm", PytorchNvCodec],
+            "samples": ["pycuda", "pyopengl", "torch", "torchvision", "opencv-python", "onnx", "tensorrt", "tqdm", cupy, PytorchNvCodec],
             "tests": ["pycuda", "pyopengl", "torch", "torchvision", "opencv-python", PytorchNvCodec],
             "torch": ["torch", "torchvision", "opencv-python", PytorchNvCodec],
             "tensorrt": ["torch", "torchvision", PytorchNvCodec],
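
For reference, a minimal usage sketch once the patches are applied and the "samples" extra (which now pulls in the matching cupy wheel) is installed; the file paths are illustrative:

    # GPU transcode with the CuPy grayscale/contrast processing from SampleCupy.py
    python3 samples/SampleCupy.py 0 tests/test.mp4 tests/dec_test.mp4
    # decode and classify frames with the resnet50.trt engine built by SampleTensorRTResnet.py
    python3 samples/SampleCupyTensorRT.py 0 tests/test.mp4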