v2.3.3: fix some problems in int8
FindDefinition committed Feb 2, 2023
1 parent b52636d commit 2309ebe
Showing 8 changed files with 73 additions and 36 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,10 @@
# Changelog

## [2.3.3] - 2023-02-02
### Fixed
- Fix int8 NVRTC error when using prebuilt wheels
- Fix int8 kernels when running on Turing GPUs
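
A minimal sketch, assuming PyTorch with CUDA support is installed, for checking whether the local GPU is a Turing device (compute capability 7.5), the architecture targeted by the second fix:

```python
import torch

# Assumption: PyTorch with CUDA support and at least one visible GPU.
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    # Turing GPUs (e.g. RTX 20xx, T4) report compute capability 7.5.
    print(f"sm_{major}{minor}, Turing: {(major, minor) == (7, 5)}")
else:
    print("No CUDA device found")
```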

## [2.3.2] - 2023-01-20
### Changed
- change version
6 changes: 2 additions & 4 deletions README.md
@@ -57,11 +57,9 @@
| CUDA 11.4 | [![PyPI Version][pypi-ver-114]][pypi-url-114] | ```pip install spconv-cu114```| [![pypi monthly download][pypi-download-114]][pypi-url-114]|
| CUDA 11.6 | [![PyPI Version][pypi-ver-116]][pypi-url-116] | ```pip install spconv-cu116```| [![pypi monthly download][pypi-download-116]][pypi-url-116]|
| CUDA 11.7 | [![PyPI Version][pypi-ver-117]][pypi-url-117] | ```pip install spconv-cu117```| [![pypi monthly download][pypi-download-117]][pypi-url-117]|
| CUDA 11.8* | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]|
| CUDA 11.8 | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]|
| CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]|
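
The table above maps each CUDA release to its wheel name. A small illustrative helper, assuming PyTorch is installed, that derives the matching package name from the CUDA version PyTorch was built against (this mapping only covers the versions listed above and is not an official spconv utility):

```python
import torch

# Illustrative only: derive the spconv wheel name from torch's CUDA version.
cuda = torch.version.cuda            # e.g. "11.7"; None for CPU-only builds
if cuda is not None:
    tag = cuda.replace(".", "")      # "11.7" -> "117"
    print(f"pip install spconv-cu{tag}")
else:
    print("pip install spconv")      # CPU-only package
```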

*: sm_89 and sm_90 are added in CUDA 11.8. If you use an RTX 4090 or H100, you should use this version.

<!-- | CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]| -->

```spconv``` is a project that provides a heavily optimized sparse convolution implementation with tensor core support. Check the [benchmark](docs/BENCHMARK.md) to see how fast spconv 2.x runs.
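
A minimal usage sketch with the spconv 2.x PyTorch API, assuming a CUDA GPU is available (shapes and values below are made up for illustration):

```python
import torch
import spconv.pytorch as spconv

# A tiny submanifold sparse 3D convolution layer.
net = spconv.SubMConv3d(in_channels=16, out_channels=32, kernel_size=3,
                        indice_key="subm0").cuda()

# 100 unique active voxels on a 16x16x16 grid, coordinates as [batch, z, y, x].
zyx = torch.cartesian_prod(torch.arange(16), torch.arange(16), torch.arange(16))
zyx = zyx[torch.randperm(len(zyx))[:100]]
indices = torch.cat([torch.zeros(100, 1, dtype=torch.long), zyx], dim=1)
indices = indices.int().cuda()
features = torch.randn(100, 16, device="cuda")

x = spconv.SparseConvTensor(features, indices, spatial_shape=[16, 16, 16],
                            batch_size=1)
out = net(x)          # another SparseConvTensor
dense = out.dense()   # densify for inspection, shape [1, 32, 16, 16, 16]
```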

1 change: 1 addition & 0 deletions pyproject.toml
@@ -1,4 +1,5 @@
[build-system]
requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.4.5"]
# requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu120-0.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
# requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu117-0.4.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
build-backend = "setuptools.build_meta"
4 changes: 2 additions & 2 deletions setup.py
@@ -167,8 +167,8 @@ def run(self):
all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_AMPERE_PARAMS
all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
IMPLGEMM_TURING_PARAMS + IMPLGEMM_AMPERE_PARAMS)
all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
# all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
# all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))

cu = GemmMainUnitTest(all_shuffle)
convcu = ConvMainUnitTest(all_imp)
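
The two commented-out lines above previously removed NVRTC-flagged kernel parameters from the ahead-of-time build, which is why prebuilt wheels lacked the new int8 kernels (the int8 entries added in spconv/core.py below all set is_nvrtc=True). A self-contained sketch of what that filter did, using a stand-in dataclass rather than spconv's real parameter types:

```python
from dataclasses import dataclass

@dataclass
class KernelParam:      # stand-in for spconv's real GEMM/conv parameter objects
    name: str
    is_nvrtc: bool

all_imp = [KernelParam("imp_f16", is_nvrtc=False),
           KernelParam("imp_int8", is_nvrtc=True)]

# Old behaviour (the now-commented filter): NVRTC kernels were excluded from
# the ahead-of-time build, so prebuilt wheels did not contain them.
aot_only = list(filter(lambda p: not p.is_nvrtc, all_imp))  # -> [imp_f16]

# New behaviour: every parameter, including the NVRTC int8 ones, is passed to
# GemmMainUnitTest / ConvMainUnitTest and compiled into the wheel.
aot_all = all_imp                                           # -> both entries
print([p.name for p in aot_only], [p.name for p in aot_all])
```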
82 changes: 56 additions & 26 deletions spconv/core.py
@@ -840,7 +840,7 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
Expand All @@ -855,7 +855,7 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=0,
@@ -1127,7 +1127,7 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
Expand All @@ -1142,7 +1142,7 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
@@ -1157,13 +1157,13 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (64, 64, 64), (32, 32, 64),
*gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (16, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
@@ -1172,14 +1172,13 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),

*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 64),
*gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (16, 16, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
@@ -1188,14 +1187,13 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),

*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
*gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (32, 16, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
@@ -1204,14 +1202,14 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),

*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 64), (64, 32, 64),
*gen_conv_params(ConvFwdAndBwdInput, (64, 64, 64), (32, 32, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
@@ -1220,14 +1218,14 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
# TODO 16,8,32 produce wrong result.
*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 32), (64, 32, 32),

*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
@@ -1236,14 +1234,30 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),

*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
["s8,s8,s8,s32,f32", "s8,s8,s8,s32,f16"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),

*gen_conv_params(ConvFwdAndBwdInput, (128, 256, 64), (64, 128, 64),
*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 64), (64, 32, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
@@ -1252,14 +1266,14 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),

*gen_conv_params(ConvFwdAndBwdInput, (256, 128, 64), (128, 64, 64),
# TODO 16,8,32 produce wrong result.
*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 32), (64, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
@@ -1268,14 +1282,14 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),

*gen_conv_params(ConvFwdAndBwdInput, (128, 128, 128), (64, 64, 128),
*gen_conv_params(ConvFwdAndBwdInput, (128, 256, 64), (64, 128, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
@@ -1284,13 +1298,29 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),

*gen_conv_params(ConvFwdAndBwdInput, (256, 128, 64), (128, 64, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
["s8,s8,s8,s32,f32", "s8,s8,s8,s32,f16"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),

*gen_conv_params(ConvFwdAndBwdInput, (128, 128, 64), (64, 64, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
@@ -1300,7 +1330,7 @@ class AlgoHint(Enum):
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
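
Throughout this file the int8 Turing configurations move from TensorOp((16, 8, 16)) and TensorOp((16, 8, 32)) to TensorOp((8, 8, 16)), with correspondingly smaller tile sizes. A plausible reading, inferred from the changelog entry about Turing rather than stated in the diff, is that the larger m16n8kXX int8 MMA shapes need sm_80 or newer, while m8n8k16 is the int8 tensor-op shape available on Turing (sm_75). A hypothetical helper illustrating that kind of capability-based choice (not spconv's actual dispatch logic):

```python
import torch

def pick_int8_mma_shape(device: int = 0) -> tuple:
    """Hypothetical: choose an int8 MMA (m, n, k) shape by compute capability."""
    major, minor = torch.cuda.get_device_capability(device)
    if (major, minor) >= (8, 0):
        return (16, 8, 32)   # larger int8 MMA shapes available on sm_80+
    return (8, 8, 16)        # int8 tensor-op shape usable on Turing (sm_75)

print(pick_int8_mma_shape())
```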
7 changes: 5 additions & 2 deletions test/test_all_algo.py
@@ -330,17 +330,20 @@ def _test_impgemm_conv_cuda(subm: bool):
device = torch.device("cuda:0")
shapes = [[19, 18, 17]]
batchsizes = [1]
dtypes = [(np.float32, np.float32), (np.float16, np.float16)]
# dtypes = [(np.float32, np.float32), (np.float16, np.float16)]
# dtypes = [np.float16]
# dtypes = [(np.int8, np.int8), (np.int8, np.float32), (np.int8, np.float16)]
# dtypes = [(np.int8, np.int8)]
dtypes = [(np.int8, np.int8)]
# dtypes = [(np.float16, np.float16)]

test_case = TestCase()
# in_channels = [32]
# out_channels = [32, 48, 64]
in_channels = [32, 47]
out_channels = [32, 48, 62]
in_channels = [16]
out_channels = [16]

# in_channels = [16]
# out_channels = [16]

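
With this change the test exercises the int8 path (dtypes = [(np.int8, np.int8)]) at a channel count of 16. A sketch of how int8 test features might be generated with NumPy (illustrative only, not the test's actual generator):

```python
import numpy as np

rng = np.random.default_rng(0)
num_voxels, in_channels = 100, 16
# Small int8 values keep the s32 accumulator comfortably away from overflow.
features = rng.integers(-5, 5, size=(num_voxels, in_channels), dtype=np.int8)
print(features.dtype, features.shape)
```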
2 changes: 1 addition & 1 deletion tools/build-wheels-dev.sh
@@ -26,7 +26,7 @@ function repair_wheel {
}
gcc -v
export SPCONV_DISABLE_JIT="1"
export CUMM_CUDA_ARCH_LIST="7.5"
export CUMM_CUDA_ARCH_LIST="8.6"
# export SPCONV_PYTHON_LIST="3.7;3.8;3.9;3.10"
# Compile wheels, we only support 3.6-3.10.
# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
2 changes: 1 addition & 1 deletion version.txt
@@ -1 +1 @@
2.3.2
2.3.3
