From 52e02c694150cc928e5d087ea3b2d104cf0a3218 Mon Sep 17 00:00:00 2001 From: royinx Date: Mon, 7 Aug 2023 00:24:55 -0400 Subject: [PATCH 1/8] feat: add cupy encoder --- samples/SampleCupy.py | 243 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 samples/SampleCupy.py diff --git a/samples/SampleCupy.py b/samples/SampleCupy.py new file mode 100644 index 00000000..f3ffec60 --- /dev/null +++ b/samples/SampleCupy.py @@ -0,0 +1,243 @@ +# +# Copyright 2023 @royinx + +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Starting from Python 3.8 DLL search policy has changed. +# We need to add path to CUDA DLLs explicitly. +import sys +import os +from typing import Any +import PyNvCodec as nvc +import numpy as np +import cupy as cp + +class cconverter: + """ + Colorspace conversion chain. + """ + + def __init__(self, width: int, height: int, gpu_id: int): + self.gpu_id = gpu_id + self.w = width + self.h = height + self.chain = [] + + def add(self, src_fmt: nvc.PixelFormat, dst_fmt: nvc.PixelFormat) -> None: + self.chain.append( + nvc.PySurfaceConverter(self.w, self.h, src_fmt, dst_fmt, self.gpu_id) + ) + + def run(self, src_surface: nvc.Surface) -> nvc.Surface: + surf = src_surface + cc = nvc.ColorspaceConversionContext(nvc.ColorSpace.BT_601, nvc.ColorRange.MPEG) + + for cvt in self.chain: + surf = cvt.Execute(surf, cc) + if surf.Empty(): + raise RuntimeError("Failed to perform color conversion") + + return surf.Clone(self.gpu_id) + +class CupyNVC: + def get_memptr(self, surface: nvc.Surface) -> int: + return surface.PlanePtr().GpuMem() + + def SurfaceToArray(self, surface: nvc.Surface) -> cp.array: + """ + Converts surface to cupy unit8 tensor. 
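+        The conversion is zero-copy: the returned array wraps the surface's GPU memory
+        with cp.cuda.UnownedMemory, so no device-to-device copy is made.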
+ + - surface: nvc.Surface + - return: cp.array (height, width, 3) + """ + if surface.Format() != nvc.PixelFormat.RGB: + raise RuntimeError("Surface shall be of RGB PLANAR format , got {}".format(surface.Format())) + plane = surface.PlanePtr() + # cuPy array zero copy non ownned + height, width, pitch = (plane.Height(), plane.Width(), plane.Pitch()) + cupy_mem = cp.cuda.UnownedMemory(self.get_memptr(surface), height * width * 1, surface) + cupy_memptr = cp.cuda.MemoryPointer(cupy_mem, 0) + # cupy_frame = cp.ndarray((height//3, width, 3), cp.uint8, cupy_memptr, strides=(pitch,1,int(pitch*height/width))) # RGB_PLANAR + cupy_frame = cp.ndarray((height, width // 3, 3), cp.uint8, cupy_memptr, strides=(pitch, 3, 1)) # RGB + + return cupy_frame + + # def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: + # ker_string = """ + # extern "C"{ + # __global__ void memcpyKer(unsigned char *dst, unsigned char *src, int len) + # { + # int idx = blockIdx.x * blockDim.x + threadIdx.x; + # for (; idx < len; idx += blockDim.x) memcpy(&dst[idx], &src[idx], sizeof(unsigned char)); + # } + # } + # """ + # module = cp.RawModule(code=ker_string) + # memcpyKer = module.get_function("memcpyKer") + # with cp.cuda.stream.Stream() as stream: + # memcpyKer((1,), (1024,), #(int(img_array.size//1024)+1,1) + # (int(self.get_memptr(surface)), img_array.data.ptr, img_array.size) + # ) + # stream.synchronize() + # return + def _validate(self, surface: nvc.Surface, img_array: cp.array) -> None: + validate_string = """ + extern "C"{ + __global__ void validate_value(unsigned char *dst, unsigned char *src, int len) + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (; idx < len; idx += blockDim.x) + printf("%d %d %d\\n", idx, src[idx], dst[idx]); + } + } + """ + module = cp.RawModule(code=validate_string) + validateKer = module.get_function("validate_value") + print(img_array.size) + with cp.cuda.stream.Stream(null=True) as stream: + validateKer((1,), (1024,), #(int(img_array.size//1024)+1,1) + (int(self.get_memptr(surface)), img_array.data.ptr, img_array.size) + ) + stream.synchronize() + return + + def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: + cp.cuda.runtime.memcpy(dst = self.get_memptr(surface), # dst_ptr + src = img_array.data.ptr, # src_ptr + size=img_array.nbytes, + kind=4) # + # with cp.cuda.stream.Stream(null=True) as stream: + # cp.cuda.runtime.memcpy2D(self.get_memptr(surface), + # surface.Pitch(), + # img_array.data.ptr, + # surface.Width(), + # surface.Width(), + # surface.Height(), + # cp.cuda.runtime.memcpyDeviceToDevice) # stream.ptr: 0 + # stream.synchronize() + # self._validate(surface, img_array) + return + + def ArrayToSurface(self, img_array: cp.array, gpu_id: int) -> nvc.Surface: + """ + Converts cupy ndarray to rgb surface. 
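+        The input is expected to be a packed (height, width, 3) RGB array; it is cast to
+        uint8, transposed to planar CHW layout and copied into a new RGB_PLANAR surface.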
+ - surface: cp.array + - return: nvc.Surface + """ + img_array = img_array.astype(cp.uint8) + img_array = cp.transpose(img_array, (2,0,1)) # HWC + img_array = cp.ascontiguousarray(img_array) + _ , tensor_h , tensor_w = img_array.shape + surface = nvc.Surface.Make(nvc.PixelFormat.RGB_PLANAR, tensor_w, tensor_h, gpu_id) # HWC + self._memcpy(surface, img_array) + return surface + +def to_grayscale(img_array: cp.array) -> cp.array: + img_array = cp.matmul(img_array, cp.array([0.299, 0.587, 0.114]).T) + img_array = cp.expand_dims(img_array, axis=-1) + img_array = cp.tile(img_array, (1,1,3)) # view as 3 channel image (packed RGB: HWC) + return img_array + +def contrast_boost(img_array: cp.array) -> cp.array: + """ + histogram equalization + """ + channel_min = cp.quantile(img_array, 0.05, axis=(0,1)) + channel_max = cp.quantile(img_array, 0.95, axis=(0,1)) + img_array = img_array.astype(cp.float32) + for c, (cmin, cmax) in enumerate(zip(channel_min, channel_max)): + img_array[c] = cp.clip(img_array[c], cmin, cmax) + img_array = img_array- channel_min.reshape(1,1,-1) + img_array /= (channel_max - channel_min).reshape(1,1,-1) + img_array = cp.multiply(img_array, 255.0) + return img_array + +def main(gpu_id, encFilePath, dstFilePath): + dstFile = open(dstFilePath, "wb") + nvDec = nvc.PyNvDecoder(encFilePath, gpu_id) + cpnvc = CupyNVC() + + w = nvDec.Width() + h = nvDec.Height() + res = str(w) + "x" + str(h) + nvEnc = nvc.PyNvEncoder( + {"preset": "P4", "codec": "h264", "s": res, "bitrate": "10M"}, gpu_id + ) + + # Surface converters + to_rgb = cconverter(w, h, gpu_id) + to_rgb.add(nvc.PixelFormat.NV12, nvc.PixelFormat.YUV420) + to_rgb.add(nvc.PixelFormat.YUV420, nvc.PixelFormat.RGB) + + to_nv12 = cconverter(w, h, gpu_id) + to_nv12.add(nvc.PixelFormat.RGB_PLANAR, nvc.PixelFormat.RGB) + to_nv12.add(nvc.PixelFormat.RGB, nvc.PixelFormat.YUV420) + to_nv12.add(nvc.PixelFormat.YUV420, nvc.PixelFormat.NV12) + + # Encoded video frame + encFrame = np.ndarray(shape=(0), dtype=np.uint8) + while True: + # Decode NV12 surface + src_surface = nvDec.DecodeSingleSurface() + if src_surface.Empty(): + break + + # Convert to packed RGB: HWC , planar CHW + rgb_sur = to_rgb.run(src_surface) + if rgb_sur.Empty(): + break + + # PROCESS YOUR TENSOR HERE. + # THIS DUMMY PROCESSING JUST ADDS RANDOM ROTATION. 
+ src_array = cpnvc.SurfaceToArray(rgb_sur) + dst_array = contrast_boost(src_array) + dst_array = to_grayscale(dst_array) + surface_rgb = cpnvc.ArrayToSurface(dst_array, gpu_id) + + # Convert back to NV12 + dst_surface = to_nv12.run(surface_rgb) + if src_surface.Empty(): + break + + # Encode + success = nvEnc.EncodeSingleSurface(dst_surface, encFrame) + if success: + byteArray = bytearray(encFrame) + dstFile.write(byteArray) + + # Encoder is asynchronous, so we need to flush it + while True: + success = nvEnc.FlushSinglePacket(encFrame) + if success: + byteArray = bytearray(encFrame) + dstFile.write(byteArray) + else: + break + + +if __name__ == "__main__": + + + if len(sys.argv) < 4: + print("This sample transcode and process with pytorch an input video on given GPU.") + print("Provide gpu ID, path to input and output files") + print("Usage: SamplePyTorch.py $gpu_id $input_file $output_file.") + print("Example: \npython3 samples/SampleCupy.py 0 tests/test.mp4 tests/dec_test.mp4") + exit(1) + + gpu_id = int(sys.argv[1]) + encFilePath = sys.argv[2] + decFilePath = sys.argv[3] + main(gpu_id, encFilePath, decFilePath) From 3fe49440e7b949697e4ae568fc0664e2c6811ae0 Mon Sep 17 00:00:00 2001 From: royinx Date: Wed, 9 Aug 2023 16:37:49 -0400 Subject: [PATCH 2/8] feat: support cupy pointer --- samples/SampleCupy.py | 73 +++++++++---------------------------------- 1 file changed, 14 insertions(+), 59 deletions(-) diff --git a/samples/SampleCupy.py b/samples/SampleCupy.py index f3ffec60..f108fe6b 100644 --- a/samples/SampleCupy.py +++ b/samples/SampleCupy.py @@ -69,67 +69,22 @@ def SurfaceToArray(self, surface: nvc.Surface) -> cp.array: height, width, pitch = (plane.Height(), plane.Width(), plane.Pitch()) cupy_mem = cp.cuda.UnownedMemory(self.get_memptr(surface), height * width * 1, surface) cupy_memptr = cp.cuda.MemoryPointer(cupy_mem, 0) - # cupy_frame = cp.ndarray((height//3, width, 3), cp.uint8, cupy_memptr, strides=(pitch,1,int(pitch*height/width))) # RGB_PLANAR cupy_frame = cp.ndarray((height, width // 3, 3), cp.uint8, cupy_memptr, strides=(pitch, 3, 1)) # RGB return cupy_frame - # def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: - # ker_string = """ - # extern "C"{ - # __global__ void memcpyKer(unsigned char *dst, unsigned char *src, int len) - # { - # int idx = blockIdx.x * blockDim.x + threadIdx.x; - # for (; idx < len; idx += blockDim.x) memcpy(&dst[idx], &src[idx], sizeof(unsigned char)); - # } - # } - # """ - # module = cp.RawModule(code=ker_string) - # memcpyKer = module.get_function("memcpyKer") - # with cp.cuda.stream.Stream() as stream: - # memcpyKer((1,), (1024,), #(int(img_array.size//1024)+1,1) - # (int(self.get_memptr(surface)), img_array.data.ptr, img_array.size) - # ) - # stream.synchronize() - # return - def _validate(self, surface: nvc.Surface, img_array: cp.array) -> None: - validate_string = """ - extern "C"{ - __global__ void validate_value(unsigned char *dst, unsigned char *src, int len) - { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - for (; idx < len; idx += blockDim.x) - printf("%d %d %d\\n", idx, src[idx], dst[idx]); - } - } - """ - module = cp.RawModule(code=validate_string) - validateKer = module.get_function("validate_value") - print(img_array.size) + def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: with cp.cuda.stream.Stream(null=True) as stream: - validateKer((1,), (1024,), #(int(img_array.size//1024)+1,1) - (int(self.get_memptr(surface)), img_array.data.ptr, img_array.size) - ) + 
cp.cuda.runtime.memcpy2D(self.get_memptr(surface), + surface.Pitch(), + img_array.data.ptr, + surface.Width(), + surface.Width(), + surface.Height()*3, + cp.cuda.runtime.memcpyDeviceToDevice) # stream.ptr: 0 stream.synchronize() return - def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: - cp.cuda.runtime.memcpy(dst = self.get_memptr(surface), # dst_ptr - src = img_array.data.ptr, # src_ptr - size=img_array.nbytes, - kind=4) # - # with cp.cuda.stream.Stream(null=True) as stream: - # cp.cuda.runtime.memcpy2D(self.get_memptr(surface), - # surface.Pitch(), - # img_array.data.ptr, - # surface.Width(), - # surface.Width(), - # surface.Height(), - # cp.cuda.runtime.memcpyDeviceToDevice) # stream.ptr: 0 - # stream.synchronize() - # self._validate(surface, img_array) - return - def ArrayToSurface(self, img_array: cp.array, gpu_id: int) -> nvc.Surface: """ Converts cupy ndarray to rgb surface. @@ -137,14 +92,14 @@ def ArrayToSurface(self, img_array: cp.array, gpu_id: int) -> nvc.Surface: - return: nvc.Surface """ img_array = img_array.astype(cp.uint8) - img_array = cp.transpose(img_array, (2,0,1)) # HWC + img_array = cp.transpose(img_array, (2,0,1)) # HWC to CHW img_array = cp.ascontiguousarray(img_array) - _ , tensor_h , tensor_w = img_array.shape - surface = nvc.Surface.Make(nvc.PixelFormat.RGB_PLANAR, tensor_w, tensor_h, gpu_id) # HWC + _ ,tensor_h , tensor_w= img_array.shape + surface = nvc.Surface.Make(nvc.PixelFormat.RGB_PLANAR, tensor_w, tensor_h, gpu_id) self._memcpy(surface, img_array) return surface -def to_grayscale(img_array: cp.array) -> cp.array: +def grayscale(img_array: cp.array) -> cp.array: img_array = cp.matmul(img_array, cp.array([0.299, 0.587, 0.114]).T) img_array = cp.expand_dims(img_array, axis=-1) img_array = cp.tile(img_array, (1,1,3)) # view as 3 channel image (packed RGB: HWC) @@ -164,7 +119,7 @@ def contrast_boost(img_array: cp.array) -> cp.array: img_array = cp.multiply(img_array, 255.0) return img_array -def main(gpu_id, encFilePath, dstFilePath): +def main(gpu_id: int, encFilePath: str, dstFilePath: str): dstFile = open(dstFilePath, "wb") nvDec = nvc.PyNvDecoder(encFilePath, gpu_id) cpnvc = CupyNVC() @@ -203,7 +158,7 @@ def main(gpu_id, encFilePath, dstFilePath): # THIS DUMMY PROCESSING JUST ADDS RANDOM ROTATION. 
src_array = cpnvc.SurfaceToArray(rgb_sur) dst_array = contrast_boost(src_array) - dst_array = to_grayscale(dst_array) + dst_array = grayscale(dst_array) surface_rgb = cpnvc.ArrayToSurface(dst_array, gpu_id) # Convert back to NV12 From 7e01549e43558e2c81fb9ca1b54df9c0b92446fa Mon Sep 17 00:00:00 2001 From: royinx Date: Wed, 9 Aug 2023 16:43:39 -0400 Subject: [PATCH 3/8] feat: support add cupy example --- samples/SampleCupy.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/samples/SampleCupy.py b/samples/SampleCupy.py index f108fe6b..09c095a0 100644 --- a/samples/SampleCupy.py +++ b/samples/SampleCupy.py @@ -74,15 +74,14 @@ def SurfaceToArray(self, surface: nvc.Surface) -> cp.array: return cupy_frame def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: - with cp.cuda.stream.Stream(null=True) as stream: - cp.cuda.runtime.memcpy2D(self.get_memptr(surface), - surface.Pitch(), - img_array.data.ptr, - surface.Width(), - surface.Width(), - surface.Height()*3, - cp.cuda.runtime.memcpyDeviceToDevice) # stream.ptr: 0 - stream.synchronize() + cp.cuda.runtime.memcpy2DAsync(self.get_memptr(surface), + surface.Pitch(), + img_array.data.ptr, + surface.Width(), + surface.Width(), + surface.Height()*3, + cp.cuda.runtime.memcpyDeviceToDevice, + 0) # null_stream.ptr: 0 return def ArrayToSurface(self, img_array: cp.array, gpu_id: int) -> nvc.Surface: From 37ca2d21b4a27c59d56aed8cd6e69f91a1f4efa0 Mon Sep 17 00:00:00 2001 From: royinx Date: Mon, 14 Aug 2023 19:10:59 -0400 Subject: [PATCH 4/8] chore: update error msg --- samples/SampleCupy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/SampleCupy.py b/samples/SampleCupy.py index 09c095a0..994cdc6a 100644 --- a/samples/SampleCupy.py +++ b/samples/SampleCupy.py @@ -185,9 +185,9 @@ def main(gpu_id: int, encFilePath: str, dstFilePath: str): if len(sys.argv) < 4: - print("This sample transcode and process with pytorch an input video on given GPU.") + print("This sample transcode and process with cupy an input video on given GPU.") print("Provide gpu ID, path to input and output files") - print("Usage: SamplePyTorch.py $gpu_id $input_file $output_file.") + print("Usage: SampleCupy.py $gpu_id $input_file $output_file.") print("Example: \npython3 samples/SampleCupy.py 0 tests/test.mp4 tests/dec_test.mp4") exit(1) From 4dee59659b49f4f809dcc039fe59a6fb722faca4 Mon Sep 17 00:00:00 2001 From: royinx Date: Tue, 15 Aug 2023 11:27:10 -0400 Subject: [PATCH 5/8] chore: update memos --- samples/SampleCupy.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/samples/SampleCupy.py b/samples/SampleCupy.py index 994cdc6a..d7dd172f 100644 --- a/samples/SampleCupy.py +++ b/samples/SampleCupy.py @@ -154,7 +154,7 @@ def main(gpu_id: int, encFilePath: str, dstFilePath: str): break # PROCESS YOUR TENSOR HERE. - # THIS DUMMY PROCESSING JUST ADDS RANDOM ROTATION. + # THIS DUMMY PROCESSING JUST ADDS GRAYSCALE AND ENCHANCE CONTRAST. 
src_array = cpnvc.SurfaceToArray(rgb_sur) dst_array = contrast_boost(src_array) dst_array = grayscale(dst_array) @@ -182,13 +182,9 @@ def main(gpu_id: int, encFilePath: str, dstFilePath: str): if __name__ == "__main__": - - if len(sys.argv) < 4: print("This sample transcode and process with cupy an input video on given GPU.") - print("Provide gpu ID, path to input and output files") - print("Usage: SampleCupy.py $gpu_id $input_file $output_file.") - print("Example: \npython3 samples/SampleCupy.py 0 tests/test.mp4 tests/dec_test.mp4") + print("[Usage]: python3 samples/SampleCupy.py ") exit(1) gpu_id = int(sys.argv[1]) From c3380a80cd44866b9ba3add46e7319d181a73d75 Mon Sep 17 00:00:00 2001 From: royinx Date: Tue, 15 Aug 2023 11:27:51 -0400 Subject: [PATCH 6/8] feat: add Cupy TensorRT --- samples/SampleCupyTensorRT.py | 229 ++++++++++++++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 samples/SampleCupyTensorRT.py diff --git a/samples/SampleCupyTensorRT.py b/samples/SampleCupyTensorRT.py new file mode 100644 index 00000000..fb07b3b6 --- /dev/null +++ b/samples/SampleCupyTensorRT.py @@ -0,0 +1,229 @@ +# +# Copyright 2023 @royinx + +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Starting from Python 3.8 DLL search policy has changed. +# We need to add path to CUDA DLLs explicitly. +import sys +sys.path.append(".") +import os +from typing import Any +import PyNvCodec as nvc +import tensorrt as trt +import numpy as np +import cupy as cp +from samples.SampleTensorRTResnet import resnet_categories + +class TensorRT: + def __init__(self,engine_file): + super().__init__() + self.TRT_LOGGER = trt.Logger() + self.engine = self.get_engine(engine_file) + self.context = self.engine.create_execution_context() + self.allocate_buffers() + + def get_engine(self, engine_file_path): + if not os.path.exists(engine_file_path): + raise "run ./samples/SampleTensorRTResnet.py to generate engine file" + print("Reading engine from file {}".format(engine_file_path)) + with open(engine_file_path, "rb") as f, \ + trt.Runtime(self.TRT_LOGGER) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def allocate_buffers(self): + """ + In this Application, we use cupy for in and out + + trt use gpu array to run inference. + while bindings store the gpu array ptr , via the method : + cupy.ndarray.data.ptr + cupu.cuda.alloc_pinned_memory + cupy.cuda.runtime.malloc.mem_alloc + """ + self.inputs = [] + self.outputs = [] + self.bindings = [] + self.stream = cp.cuda.Stream(non_blocking=False) + + for binding in self.engine: + shape = self.engine.get_tensor_shape(binding) + dtype = trt.nptype(self.engine.get_tensor_dtype(binding)) + device_array = cp.empty(shape, dtype) + self.bindings.append(device_array.data.ptr) # cupy array ptr + # Append to the appropriate list. 
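+            # Keeping the cupy arrays in self.inputs / self.outputs also keeps the
+            # allocations referenced, so the raw pointers stored in self.bindings stay valid.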
+ if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT: + self.inputs.append(device_array) + elif self.engine.get_tensor_mode(binding) == trt.TensorIOMode.OUTPUT: + self.outputs.append(device_array) + + def inference(self,inputs:cp.ndarray) -> list: + inputs = cp.ascontiguousarray(inputs) + cp.cuda.runtime.memcpyAsync(dst = self.inputs[0].data.ptr, + src = inputs.data.ptr, + size= inputs.nbytes, + kind = cp.cuda.runtime.memcpyDeviceToDevice, + stream = self.stream.ptr) + self.context.execute_async_v2(bindings=self.bindings, + stream_handle=self.stream.ptr) + self.stream.synchronize() + return [out for out in self.outputs] + + +class cconverter: + """ + Colorspace conversion chain. + """ + + def __init__(self, width: int, height: int, gpu_id: int): + self.gpu_id = gpu_id + self.w = width + self.h = height + self.chain = [] + + def add(self, src_fmt: nvc.PixelFormat, dst_fmt: nvc.PixelFormat) -> None: + self.chain.append( + nvc.PySurfaceConverter(self.w, self.h, src_fmt, dst_fmt, self.gpu_id) + ) + def resize(self, width: int, height: int, src_fmt: nvc.PixelFormat) -> None: + self.chain.append( + nvc.PySurfaceResizer(width, height, src_fmt, self.gpu_id) + ) + self.h = height + self.w = width + + def run(self, src_surface: nvc.Surface) -> nvc.Surface: + surf = src_surface + cc = nvc.ColorspaceConversionContext(nvc.ColorSpace.BT_601, nvc.ColorRange.MPEG) + + for cvt in self.chain: + if isinstance(cvt, nvc.PySurfaceResizer): + surf = cvt.Execute(surf) + else: + surf = cvt.Execute(surf, cc) + if surf.Empty(): + raise RuntimeError("Failed to perform color conversion") + + return surf.Clone(self.gpu_id) + +class CupyNVC: + def get_memptr(self, surface: nvc.Surface) -> int: + return surface.PlanePtr().GpuMem() + + def SurfaceToArray(self, surface: nvc.Surface) -> cp.array: + """ + Converts surface to cupy unit8 tensor. + + - surface: nvc.Surface + - return: cp.array (height, width, 3) + """ + if surface.Format() != nvc.PixelFormat.RGB: + raise RuntimeError("Surface shall be of RGB PLANAR format , got {}".format(surface.Format())) + plane = surface.PlanePtr() + # cuPy array zero copy non ownned + height, width, pitch = (plane.Height(), plane.Width(), plane.Pitch()) + cupy_mem = cp.cuda.UnownedMemory(self.get_memptr(surface), height * width * 1, surface) + cupy_memptr = cp.cuda.MemoryPointer(cupy_mem, 0) + cupy_frame = cp.ndarray((height, width // 3, 3), cp.uint8, cupy_memptr, strides=(pitch, 3, 1)) # RGB + + return cupy_frame + + def _memcpy(self, surface: nvc.Surface, img_array: cp.array) -> None: + cp.cuda.runtime.memcpy2DAsync(self.get_memptr(surface), + surface.Pitch(), + img_array.data.ptr, + surface.Width(), + surface.Width(), + surface.Height()*3, + cp.cuda.runtime.memcpyDeviceToDevice, + 0) # null_stream.ptr: 0 + return + + def ArrayToSurface(self, img_array: cp.array, gpu_id: int) -> nvc.Surface: + """ + Converts cupy ndarray to rgb surface. 
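+        The input is expected to be a packed (height, width, 3) array; it is cast to uint8,
+        transposed to planar CHW layout and copied into a new RGB_PLANAR surface.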
+ - surface: cp.array + - return: nvc.Surface + """ + img_array = img_array.astype(cp.uint8) + img_array = cp.transpose(img_array, (2,0,1)) # HWC to CHW + img_array = cp.ascontiguousarray(img_array) + _ ,tensor_h , tensor_w= img_array.shape + surface = nvc.Surface.Make(nvc.PixelFormat.RGB_PLANAR, tensor_w, tensor_h, gpu_id) + self._memcpy(surface, img_array) + return surface + +def normalize(tensor: cp.array, mean:list , std:list) -> cp.array: + """ + normalize along the last axis + """ + tensor -= cp.array(mean).reshape(1,1,-1) + tensor /= cp.array(std).reshape(1,1,-1) + return tensor + +def main(gpu_id: int, encFilePath: str): + engine = TensorRT("resnet50.trt") + nvDec = nvc.PyNvDecoder(encFilePath, gpu_id) + cpnvc = CupyNVC() + + w = nvDec.Width() + h = nvDec.Height() + + # Surface converters + to_rgb = cconverter(w, h, gpu_id) + to_rgb.add(nvc.PixelFormat.NV12, nvc.PixelFormat.YUV420) + to_rgb.resize(224,224, nvc.PixelFormat.YUV420) + to_rgb.add(nvc.PixelFormat.YUV420, nvc.PixelFormat.RGB) + + # Encoded video frame + while True: + # Decode NV12 surface + src_surface = nvDec.DecodeSingleSurface() + if src_surface.Empty(): + break + + # Convert to packed RGB: HWC , planar CHW + rgb_sur = to_rgb.run(src_surface) + if rgb_sur.Empty(): + break + + # PROCESS YOUR TENSOR HERE. + src_array = cpnvc.SurfaceToArray(rgb_sur) + src_array = src_array.astype(cp.float32) + + # preprocess + src_array /= 255.0 + src_array = normalize(src_array, + mean= [0.485, 0.456, 0.406], + std = [0.229, 0.224, 0.225]) + src_array = cp.transpose(src_array, (2,0,1)) + src_array = cp.expand_dims(src_array, axis=0) # NCHW + + pred = engine.inference(src_array) + pred = pred[0] # extract first output layer + + idx = cp.argmax(pred) + print("Image type: ", resnet_categories[cp.asnumpy(idx)]) + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print("This sample decode and inference an input video with cupy on given GPU.") + print("[Usage]: python3 samples/SampleCupyTensorRT.py ") + exit(1) + + gpu_id = int(sys.argv[1]) + encFilePath = sys.argv[2] + main(gpu_id, encFilePath) From 0d764354fc19cb0a96751d19623b3a4b2b1e2402 Mon Sep 17 00:00:00 2001 From: royinx Date: Tue, 15 Aug 2023 11:29:05 -0400 Subject: [PATCH 7/8] debug: fix the first batch output unsync issue --- samples/SampleTensorRTResnet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/SampleTensorRTResnet.py b/samples/SampleTensorRTResnet.py index f91b24e1..a5a71fba 100644 --- a/samples/SampleTensorRTResnet.py +++ b/samples/SampleTensorRTResnet.py @@ -1169,6 +1169,7 @@ def run_inference(self, tensor_image) -> str: for out in self.outputs: cuda.memcpy_dtoh_async(out.host, out.device, self.stream) + self.stream.synchronize() # Find most probable image type and return resnet categoy description [result] = [out.host for out in self.outputs] return resnet_categories[np.argmax(result)] @@ -1311,7 +1312,7 @@ def infer_on_video(gpu_id: int, input_video: str, trt_nn_file: str): if __name__ == "__main__": if len(sys.argv) < 3: print("Provide gpu id and path to input video file.") - exit + exit(1) gpu_id = int(sys.argv[1]) input_video = sys.argv[2] From 52a269ce22d711e61da7e47f8081f9612a740eb2 Mon Sep 17 00:00:00 2001 From: royinx Date: Tue, 15 Aug 2023 11:31:14 -0400 Subject: [PATCH 8/8] feat: add cupy for build --- setup.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6912485e..a8294d6a 100644 --- a/setup.py +++ b/setup.py @@ -13,10 +13,18 @@ print("Error: version of setuptools is 
too old (<42)!")
         sys.exit(1)
 
+def get_cupy() -> str:
+    CUDA_VERSION = os.environ.get("CUDA_VERSION", None)
+    if CUDA_VERSION is None:
+        raise RuntimeError("CUDA_VERSION environment variable must be set to pick the matching cupy wheel")
+    if CUDA_VERSION >= "11.2":  # CUDA 11.2+ wheels use the per-major-version name, e.g. cupy-cuda11x / cupy-cuda12x
+        cupy_pack = f"cupy-cuda{CUDA_VERSION[:2]}x"
+    else:  # older releases use the exact-version name, e.g. cupy-cuda110
+        cupy_pack = f"cupy-cuda{CUDA_VERSION[:4].replace('.','')}"
+    return cupy_pack
 
 if __name__ == "__main__":
     import skbuild
 
+    cupy = get_cupy()
     PytorchNvCodec = "PytorchNvCodec @ git+https://github.com/NVIDIA/VideoProcessingFramework.git#subdirectory=src/PytorchNvCodec/"
     skbuild.setup(
         name="PyNvCodec",
@@ -28,7 +36,7 @@
         extras_require={
             # , "PyOpenGL-accelerate"  # does not compile on 3.10
            "dev": ["pycuda", "pyopengl", "torch", "torchvision", "opencv-python", "onnx", "tensorrt", f"PytorchNvCodec @ file://{os.getcwd()}/src/PytorchNvCodec/"],
-            "samples": ["pycuda", "pyopengl", "torch", "torchvision", "opencv-python", "onnx", "tensorrt", "tqdm", PytorchNvCodec],
+            "samples": ["pycuda", "pyopengl", "torch", "torchvision", "opencv-python", "onnx", "tensorrt", "tqdm", cupy, PytorchNvCodec],
             "tests": ["pycuda", "pyopengl", "torch", "torchvision", "opencv-python", PytorchNvCodec],
             "torch": ["torch", "torchvision", "opencv-python", PytorchNvCodec],
             "tensorrt": ["torch", "torchvision", PytorchNvCodec],
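
For reference, a minimal usage sketch once the patches are applied and the "samples" extra (which now pulls in the matching cupy wheel) is installed; the file paths are illustrative:

    # GPU transcode with the CuPy grayscale/contrast processing from SampleCupy.py
    python3 samples/SampleCupy.py 0 tests/test.mp4 tests/dec_test.mp4
    # decode and classify frames with the resnet50.trt engine built by SampleTensorRTResnet.py
    python3 samples/SampleCupyTensorRT.py 0 tests/test.mp4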