update README (wang-xinyu#673)

freedenS · web-flow · commit ccdf0e6316b1 · 2021-08-16T10:59:57.000+08:00
add latency

improve nms
diff --git a/rcnn/BatchedNms.cu b/rcnn/BatchedNms.cu
@@ -16,34 +16,32 @@
 namespace nvinfer1 {
 
 __global__ void batched_nms_kernel(
-    const int num_per_thread, const float threshold, const int num_detections,
+    const float threshold, const int num_detections,
     const int *indices, float *scores, const float *classes, const float4 *boxes) {
 
     // Go through detections by descending score
     for (int m = 0; m < num_detections; m++) {
-        for (int n = 0; n < num_per_thread; n++) {
-            int i = threadIdx.x * num_per_thread + n;
-            if (i < num_detections && m < i && scores[m] > 0.0f) {
-                int idx = indices[i];
-                int max_idx = indices[m];
-                int icls = classes[idx];
-                int mcls = classes[max_idx];
-                if (mcls == icls) {
-                    float4 ibox = boxes[idx];
-                    float4 mbox = boxes[max_idx];
-                    float x1 = max(ibox.x, mbox.x);
-                    float y1 = max(ibox.y, mbox.y);
-                    float x2 = min(ibox.z, mbox.z);
-                    float y2 = min(ibox.w, mbox.w);
-                    float w = max(0.0f, x2 - x1);
-                    float h = max(0.0f, y2 - y1);
-                    float iarea = (ibox.z - ibox.x) * (ibox.w - ibox.y);
-                    float marea = (mbox.z - mbox.x) * (mbox.w - mbox.y);
-                    float inter = w * h;
-                    float overlap = inter / (iarea + marea - inter);
-                    if (overlap > threshold) {
-                        scores[i] = 0.0f;
-                    }
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i < num_detections && m < i && scores[m] > 0.0f) {
+            int idx = indices[i];
+            int max_idx = indices[m];
+            int icls = classes[idx];
+            int mcls = classes[max_idx];
+            if (mcls == icls) {
+                float4 ibox = boxes[idx];
+                float4 mbox = boxes[max_idx];
+                float x1 = max(ibox.x, mbox.x);
+                float y1 = max(ibox.y, mbox.y);
+                float x2 = min(ibox.z, mbox.z);
+                float y2 = min(ibox.w, mbox.w);
+                float w = max(0.0f, x2 - x1);
+                float h = max(0.0f, y2 - y1);
+                float iarea = (ibox.z - ibox.x) * (ibox.w - ibox.y);
+                float marea = (mbox.z - mbox.x) * (mbox.w - mbox.y);
+                float inter = w * h;
+                float overlap = inter / (iarea + marea - inter);
+                if (overlap > threshold) {
+                    scores[i] = 0.0f;
                 }
             }
         }
@@ -104,7 +102,7 @@ int batchedNms(int batch_size,
         // TODO: different device has differnet max threads
         const int max_threads = 1024;
         int num_per_thread = ceil(static_cast<float>(num_detections) / max_threads);
-        batched_nms_kernel << <1, max_threads, 0, stream >> > (num_per_thread, nms_thresh, num_detections,
+        batched_nms_kernel << <num_per_thread, max_threads, 0, stream >> > (nms_thresh, num_detections,
             indices_sorted, scores_sorted, in_classes, in_boxes);
 
         // Re-sort with updated scores
diff --git a/rcnn/README.md b/rcnn/README.md
@@ -101,7 +101,7 @@ sudo ./rcnn -d faster.engine ../samples
   R101-faster: ./configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml
   R50-mask: ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml
   R101-mask: ./configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml
-3.set BACKBONE_RESNETTYPE = R50(R101) rcnn.cpp line 13
+3.set BACKBONE_RESNETTYPE = R50(R101) in rcnn.cpp line 14
 4.set STRIDE_IN_1X1=true in backbone.hpp
 5.follow how to run
 ```
@@ -130,7 +130,18 @@ sudo ./rcnn -d faster.engine ../samples
 
 1. quantizationType:fp32,fp16,int8. see BuildRcnnModel(rcnn.cpp line 345) for detail.
 
-2. the usage of int8 is same with [tensorrtx/yolov5](../yolov5/README.md), but it has no improvement comparing to fp16.
+2. the usage of int8 is the same as in [tensorrtx/yolov5](../yolov5/README.md).
+
+## Latency
+
+Average cost of doInference (in rcnn.cpp), measured from the second run onward, with batch=1 under the Ubuntu environment above; input size: 640(w)*480(h).
+
+|               | fp32  | fp16 | int8 |
+| ------------- | ----- | ---- | ---- |
+| Faster-R50C4  | 138ms | 36ms | 30ms |
+| Faster-R101C4 | 146ms | 38ms | 32ms |
+| Mask-R50C4    | 153ms | 44ms | 33ms |
+| Mask-R101C4   | 168ms | 45ms | 35ms |
 
 ## Plugins
 
diff --git a/rcnn/RpnNms.cu b/rcnn/RpnNms.cu
@@ -16,31 +16,29 @@
 namespace nvinfer1 {
 
     __global__ void rpn_nms_kernel(
-        const int num_per_thread, const float threshold, const int num_detections,
+        const float threshold, const int num_detections,
         const int *indices, float *scores, const float4 *boxes) {
         // Go through detections by descending score
         for (int m = 0; m < num_detections; m++) {
-            for (int n = 0; n < num_per_thread; n++) {
-                int i = threadIdx.x * num_per_thread + n;
-                if (i < num_detections && m < i && scores[m] > -FLT_MAX) {
-                    int idx = indices[i];
-                    int max_idx = indices[m];
-
-                    float4 ibox = boxes[idx];
-                    float4 mbox = boxes[max_idx];
-                    float x1 = max(ibox.x, mbox.x);
-                    float y1 = max(ibox.y, mbox.y);
-                    float x2 = min(ibox.z, mbox.z);
-                    float y2 = min(ibox.w, mbox.w);
-                    float w = max(0.0f, x2 - x1);
-                    float h = max(0.0f, y2 - y1);
-                    float iarea = (ibox.z - ibox.x) * (ibox.w - ibox.y);
-                    float marea = (mbox.z - mbox.x) * (mbox.w - mbox.y);
-                    float inter = w * h;
-                    float overlap = inter / (iarea + marea - inter);
-                    if (overlap > threshold) {
-                        scores[i] = -FLT_MAX;
-                    }
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            if (i < num_detections && m < i && scores[m] > -FLT_MAX) {
+                int idx = indices[i];
+                int max_idx = indices[m];
+
+                float4 ibox = boxes[idx];
+                float4 mbox = boxes[max_idx];
+                float x1 = max(ibox.x, mbox.x);
+                float y1 = max(ibox.y, mbox.y);
+                float x2 = min(ibox.z, mbox.z);
+                float y2 = min(ibox.w, mbox.w);
+                float w = max(0.0f, x2 - x1);
+                float h = max(0.0f, y2 - y1);
+                float iarea = (ibox.z - ibox.x) * (ibox.w - ibox.y);
+                float marea = (mbox.z - mbox.x) * (mbox.w - mbox.y);
+                float inter = w * h;
+                float overlap = inter / (iarea + marea - inter);
+                if (overlap > threshold) {
+                    scores[i] = -FLT_MAX;
                 }
             }
 
@@ -98,7 +96,7 @@ namespace nvinfer1 {
             // TODO: different device has differnet max threads
             const int max_threads = 1024;
             int num_per_thread = ceil(static_cast<float>(num_detections) / max_threads);
-            rpn_nms_kernel << <1, max_threads, 0, stream >> > (num_per_thread, nms_thresh, num_detections,
+            rpn_nms_kernel << <num_per_thread, max_threads, 0, stream >> > (nms_thresh, num_detections,
                 indices_sorted, scores_sorted, in_boxes);
 
             // Re-sort with updated scores