add cuda 11.7, remove cuda 11.1
FindDefinition committed Sep 24, 2022
1 parent 77f1cf0 commit dac35ad
Showing 17 changed files with 199 additions and 163 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build.yaml
@@ -15,8 +15,8 @@ jobs:
runs-on: windows-2019
strategy:
matrix:
-python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
-cuda-version: ['10.2', '11.1', '11.4']
+python-version: ['3.7', '3.8', '3.9', '3.10', '3.11.0-rc.2']
+cuda-version: ['10.2', '11.3', '11.4', '11.7']
steps:
- uses: actions/checkout@master
- uses: dorny/paths-filter@v2
@@ -115,8 +115,8 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
-python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] # this version is only used for upload.
-cuda-version: ['102', '111', '113', '114', '']
+python-version: ['3.7', '3.8', '3.9', '3.10', '3.11.0-rc.2'] # this version is only used for upload.
+cuda-version: ['102', '113', '114', '117', '']

steps:
- uses: actions/checkout@master
4 changes: 3 additions & 1 deletion .gitignore
@@ -114,4 +114,6 @@ wheelhouse_tmp

example/libspconv/cumm
example/libspconv/spconv/include
-example/libspconv/spconv/src
+example/libspconv/spconv/src
+
+third_party/boost
28 changes: 20 additions & 8 deletions README.md
@@ -16,6 +16,8 @@
[pypi-ver-cpu]: https://img.shields.io/pypi/v/spconv
[pypi-ver-114]: https://img.shields.io/pypi/v/spconv-cu114
-[pypi-ver-111]: https://img.shields.io/pypi/v/spconv-cu111
+[pypi-ver-117]: https://img.shields.io/pypi/v/spconv-cu117

[pypi-ver-113]: https://img.shields.io/pypi/v/spconv-cu113
[pypi-ver-120]: https://img.shields.io/pypi/v/spconv-cu120
[pypi-ver-102]: https://img.shields.io/pypi/v/spconv-cu102
@@ -28,6 +30,8 @@
[pypi-download-113]: https://img.shields.io/pypi/dm/spconv-cu113
[pypi-url-114]: https://pypi.org/project/spconv-cu114/
[pypi-download-114]: https://img.shields.io/pypi/dm/spconv-cu114
+[pypi-url-117]: https://pypi.org/project/spconv-cu117/
+[pypi-download-117]: https://img.shields.io/pypi/dm/spconv-cu117
[pypi-url-120]: https://pypi.org/project/spconv-cu120/
[pypi-download-120]: https://img.shields.io/pypi/dm/spconv-cu120
[pypi-url-cpu]: https://pypi.org/project/spconv/
@@ -41,9 +45,9 @@
| -------------- |:---------------------:| ---------------------:| ---------------------:|
| CPU (Linux Only) | [![PyPI Version][pypi-ver-cpu]][pypi-url-cpu] | ```pip install spconv``` | [![pypi monthly download][pypi-download-cpu]][pypi-url-cpu] |
| CUDA 10.2 | [![PyPI Version][pypi-ver-102]][pypi-url-102] | ```pip install spconv-cu102```| [![pypi monthly download][pypi-download-102]][pypi-url-102]|
-| CUDA 11.1 | [![PyPI Version][pypi-ver-111]][pypi-url-111] | ```pip install spconv-cu111```| [![pypi monthly download][pypi-download-111]][pypi-url-111]|
| CUDA 11.3 (Linux Only) | [![PyPI Version][pypi-ver-113]][pypi-url-113] | ```pip install spconv-cu113```| [![pypi monthly download][pypi-download-113]][pypi-url-113]|
| CUDA 11.4 | [![PyPI Version][pypi-ver-114]][pypi-url-114] | ```pip install spconv-cu114```| [![pypi monthly download][pypi-download-114]][pypi-url-114]|
+| CUDA 11.7 | [![PyPI Version][pypi-ver-117]][pypi-url-117] | ```pip install spconv-cu117```| [![pypi monthly download][pypi-download-117]][pypi-url-117]|
<!-- | CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]| -->

```spconv``` is a project that provides a heavily-optimized sparse convolution implementation with tensor core support. Check [benchmark](docs/BENCHMARK.md) to see how fast spconv 2.x runs.
@@ -52,15 +56,19 @@

Check [spconv 2.x algorithm introduction](docs/spconv2_algo.pdf) to understand the sparse convolution algorithm in spconv 2.x!

+## WARNING
+
+Use spconv >= cu114 if possible. CUDA 11.4 can compile much faster kernels in some situations.

## NEWS

* spconv 2.2: ampere feature support (by [EvernightAurora](https://github.com/EvernightAurora)), pure c++ code generation, nvrtc, drop python 3.6

## Spconv 2.2 vs Spconv 2.1

-* faster fp16 kernels (~5-30%) in ampere GPUs (tested in RTX 3090)
-* greatly faster int8 kernels (~1.2x~2.7x) in ampere GPUs (tested in RTX 3090)
-* no python 3.6 support
+* faster fp16 conv kernels (~5-30%) on Ampere GPUs (tested on RTX 3090)
+* much faster int8 conv kernels (~1.2x-2.7x) on Ampere GPUs (tested on RTX 3090)
+* drop python 3.6 support
* nvrtc support: kernels for old GPUs will be compiled at runtime.
* [libspconv](docs/PURE_CPP_BUILD.md): pure c++ build of all spconv ops. see [example](example/libspconv/run_build.sh)
* tf32 kernels, faster fp32 training, disabled by default. set ```import spconv as spconv_core; spconv_core.constants.SPCONV_ALLOW_TF32 = True``` to enable them.
@@ -84,6 +92,10 @@ Then see [this](docs/USAGE.md).

Don't forget to check [performance guide](docs/PERFORMANCE_GUIDE.md).

+### Common Solutions for Some Bugs
+
+See [common problems](docs/COMMON_PROBLEMS.md).

## Install

You need to install python >= 3.7 first to use spconv 2.x.
@@ -94,22 +106,22 @@ You need at least CUDA 11.0 to build and run spconv 2.x. We won't offer any supp

### Prebuilt

-We offer python 3.7-3.11 and cuda 10.2/11.1/11.3/11.4/12.0 prebuilt binaries for linux (manylinux).
+We offer python 3.7-3.11 and cuda 10.2/11.3/11.4/11.7/12.0 prebuilt binaries for linux (manylinux).

-We offer python 3.7-3.11 and cuda 10.2/11.1/11.4/12.0 prebuilt binaries for windows 10/11.
+We offer python 3.7-3.11 and cuda 10.2/11.4/11.7/12.0 prebuilt binaries for windows 10/11.

For Linux users, you need pip >= 20.3 to install the prebuilt wheels.

```pip install spconv``` for CPU only (**Linux Only**). You should only use this for debugging; performance isn't optimized due to manylinux limits (no OpenMP support).

```pip install spconv-cu102``` for CUDA 10.2

-```pip install spconv-cu111``` for CUDA 11.1

```pip install spconv-cu113``` for CUDA 11.3 (**Linux Only**)

```pip install spconv-cu114``` for CUDA 11.4

+```pip install spconv-cu117``` for CUDA 11.7

```pip install spconv-cu120``` for CUDA 12.0

**NOTE** It's safe to have different **minor** CUDA versions between the system and conda (pytorch) for **CUDA >= 11.0** because of [CUDA Minor Version Compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/#minor-version-compatibility). For example, you can use spconv-cu114 with the anaconda version of pytorch built with cuda 11.1 on an OS with CUDA 11.2 installed.
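That compatibility rule can be sketched as a small check. This is a hypothetical helper for illustration only (`cuda_compatible` is not a spconv API), assuming the documented rule: for CUDA >= 11.0 only the major version must match, while older versions must match exactly.

```python
def cuda_compatible(system_cuda: str, build_cuda: str) -> bool:
    """Return True if a spconv build for `build_cuda` should run on a system
    with `system_cuda`, per CUDA minor-version compatibility: from CUDA 11.0
    onward only the major version has to match; before 11.0 an exact match
    is required."""
    sys_major = int(system_cuda.split(".")[0])
    build_major = int(build_cuda.split(".")[0])
    if sys_major < 11 or build_major < 11:
        return system_cuda == build_cuda
    return sys_major == build_major

print(cuda_compatible("11.2", "11.4"))  # True: spconv-cu114 on a CUDA 11.2 system
print(cuda_compatible("10.2", "11.4"))  # False: major versions differ
```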
37 changes: 37 additions & 0 deletions docs/COMMON_PROBLEMS.md
@@ -0,0 +1,37 @@
<!--
Copyright 2022 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Common Problems

## the provided PTX was compiled with an unsupported toolchain

Update your GPU driver or downgrade your spconv/cumm CUDA version.

## CUDA kernel launch blocks must be positive, but got N= 0

Your coordinates generate no output points under the given conv params. Modify the conv params so that every input point produces at least one output point.

Example:

Conv Params:
```spatial shape=[8, 200, 200], ksize=[3, 3, 3], stride=[2, 2, 2], padding=[0, 1, 1], dilation=[1, 1, 1]```
Coordinates:
```
[[0, 7, 153, 142]]
```

The convolution along the z axis drops ALL points with z == 7. Change the z padding to solve this problem.
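To check such cases programmatically, here is a minimal sketch of the standard strided-convolution index arithmetic. The helper `conv_out_positions` is hypothetical (not part of spconv) and only handles one axis, but it reproduces the situation above: with z padding 0 the input coordinate z == 7 is covered by no output position, while z padding 1 fixes it.

```python
def conv_out_positions(coord, size, k, stride, pad, dilation=1):
    """Return the output positions along one axis whose receptive field
    covers input coordinate `coord`."""
    # standard conv output size along this axis
    out_size = (size + 2 * pad - dilation * (k - 1) - 1) // stride + 1
    covered = []
    for j in range(out_size):
        start = j * stride - pad  # first input index seen by output j
        if any(start + i * dilation == coord for i in range(k)):
            covered.append(j)
    return covered

# z axis from the example: size=8, ksize=3, stride=2
print(conv_out_positions(7, size=8, k=3, stride=2, pad=0))  # [] -> point dropped
print(conv_out_positions(7, size=8, k=3, stride=2, pad=1))  # [3] -> point kept
```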

1 change: 1 addition & 0 deletions docs/PERFORMANCE_GUIDE.md
@@ -26,3 +26,4 @@
* spconv 2.x in Windows 10 is 1.5x~2x slower than Linux. use Linux if possible.
* If you train with float32 and ampere or later GPUs, you can set ```spconv.constants.SPCONV_ALLOW_TF32``` to enable faster fp32 training.
See [benchmark](BENCHMARK.md) for more performance details of different algorithms.
+* Different CUDA versions of spconv may have different performance. Use the newest CUDA version if possible. For example, spconv-cu117 is faster than spconv-cu114, and spconv-cu114 is faster than spconv-cu111.
83 changes: 0 additions & 83 deletions docs/SPCONV_DEVELOP_PLAN.md

This file was deleted.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,3 +1,3 @@
[build-system]
-requires = ["setuptools>=41.0", "wheel", "pccm>=0.2.21", "cumm>=0.2.3"]
+requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.3.0"]
build-backend = "setuptools.build_meta"
11 changes: 8 additions & 3 deletions setup.py
@@ -163,9 +163,14 @@ def run(self):
from spconv.csrc.sparse.convops import GemmTunerSimple, ExternalSpconvMatmul
from spconv.csrc.sparse.convops import ConvTunerSimple, ConvGemmOps
from spconv.csrc.sparse.inference import InferenceOps

-cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_AMPERE_PARAMS)
-convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS + IMPLGEMM_AMPERE_PARAMS)
+all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_AMPERE_PARAMS
+all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
+           IMPLGEMM_TURING_PARAMS + IMPLGEMM_AMPERE_PARAMS)
+all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
+all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
+
+cu = GemmMainUnitTest(all_shuffle)
+convcu = ConvMainUnitTest(all_imp)
convcu.namespace = "cumm.conv.main"

cu.namespace = "cumm.gemm.main"
11 changes: 7 additions & 4 deletions spconv/algo.py
@@ -40,7 +40,7 @@
from spconv.core import ALL_IMPGEMM_PARAMS, AlgoHint, ConvAlgo, ALL_NATIVE_PARAMS
from spconv.core_cc.cumm.conv.main import ConvMainUnitTest
from spconv.core_cc.cumm.gemm.main import GemmMainUnitTest
-from spconv.cppconstants import COMPILED_CUDA_ARCHS
+from spconv.cppconstants import COMPILED_CUDA_GEMM_ARCHS
from cumm.tensorview.gemm import NVRTCParams
from spconv.tools import CUDAKernelTimer
from cumm.gemm.constants import NVRTCConstants, NVRTCMode
@@ -337,7 +337,7 @@ def get_all_available(
ldb = b.stride[0]
ldc = c.stride[0]
if desp.supported_ldx(lda, ldb, ldc):
-if arch not in COMPILED_CUDA_ARCHS:
+if arch not in COMPILED_CUDA_GEMM_ARCHS:
desp = desp.copy()
desp.is_nvrtc = True
if SPCONV_DEBUG_NVRTC_KERNELS:
@@ -720,7 +720,7 @@ def get_all_available(self,
assert mask_width > 0
mask_width_valid = mask_width % desp.tile_shape[2] == 0
if desp.supported_ldx_conv(ldi, ldw, ldo) and mask_width_valid:
-if arch not in COMPILED_CUDA_ARCHS:
+if arch not in COMPILED_CUDA_GEMM_ARCHS:
desp = desp.copy()
desp.is_nvrtc = True
if SPCONV_DEBUG_NVRTC_KERNELS:
@@ -822,6 +822,7 @@ def tune_and_cache(self,

times: List[float] = []
all_profile_res: List[BestConvAlgoByProfile] = []
+group_by_algo = {}
for desp in avail:
# for sparse conv, ndim isn't used, so we just provide a constant value.
params = ConvParams(NDIM_DONT_CARE, ConvOpTypeCpp(op_type.value))
Expand Down Expand Up @@ -865,7 +866,9 @@ def tune_and_cache(self,
this_times.append(measure.duration)
times.append(np.mean(this_times[1:]))
spk_speeds.append(times[-1])

+if desp.algo not in group_by_algo:
+    group_by_algo[desp.algo] = 10000.0
+group_by_algo[desp.algo] = min(times[-1], group_by_algo[desp.algo])
all_profile_res.append(
BestConvAlgoByProfile(desp, arch, splitk=spk))
if not all_profile_res:
14 changes: 14 additions & 0 deletions spconv/benchmark/__init__.py
@@ -0,0 +1,14 @@
# Copyright 2022 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

10 changes: 1 addition & 9 deletions spconv/benchmark/__main__.py
@@ -1,14 +1,6 @@
-from .basic import bench_basic
+from .basic import bench_basic, bench_large

import fire

-def bench_me_basic(dtype_str: str):
-    from spconv.benchmark.me import bench_me_basic
-    return bench_me_basic(dtype_str)
-
-def bench_torchsparse_basic(dtype_str: str):
-    from spconv.benchmark.thsp import bench_torchsparse_basic
-    return bench_torchsparse_basic(dtype_str)

if __name__ == "__main__":
fire.Fire()
