diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1ffd7d9..c50cbff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## [2.3.3] - 2023-02-02
+### Fixed
+- Fix int8 nvrtc error when using prebuilt binaries
+- Fix int8 kernel when running on Turing GPUs
+
 ## [2.3.2] - 2023-01-20
 ### Changed
 - change version
diff --git a/README.md b/README.md
index 3154274..4203237 100644
--- a/README.md
+++ b/README.md
@@ -57,11 +57,9 @@
 | CUDA 11.4 | [![PyPI Version][pypi-ver-114]][pypi-url-114] | ```pip install spconv-cu114```| [![pypi monthly download][pypi-download-114]][pypi-url-114]|
 | CUDA 11.6 | [![PyPI Version][pypi-ver-116]][pypi-url-116] | ```pip install spconv-cu116```| [![pypi monthly download][pypi-download-116]][pypi-url-116]|
 | CUDA 11.7 | [![PyPI Version][pypi-ver-117]][pypi-url-117] | ```pip install spconv-cu117```| [![pypi monthly download][pypi-download-117]][pypi-url-117]|
-| CUDA 11.8* | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]|
+| CUDA 11.8 | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]|
+| CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]|
 
-*: sm_89 and sm_90 is added in CUDA 11.8. If you use RTX 4090 or H100, you should use this version.
-
-
 ```spconv``` is a project that provide heavily-optimized sparse convolution implementation with tensor core support. check [benchmark](docs/BENCHMARK.md) to see how fast spconv 2.x runs.
diff --git a/pyproject.toml b/pyproject.toml
index ed88cde..0ab4d82 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,4 +1,5 @@
 [build-system]
 requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.4.5"]
 # requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu120-0.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
+# requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu117-0.4.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
 build-backend = "setuptools.build_meta"
diff --git a/setup.py b/setup.py
index dffd6d0..2bed731 100644
--- a/setup.py
+++ b/setup.py
@@ -167,8 +167,8 @@ def run(self):
         all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_AMPERE_PARAMS
         all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS + IMPLGEMM_AMPERE_PARAMS)
-        all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
-        all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
+        # all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
+        # all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
         cu = GemmMainUnitTest(all_shuffle)
         convcu = ConvMainUnitTest(all_imp)
diff --git a/spconv/core.py b/spconv/core.py
index 2e0f9d7..827bf04 100644
--- a/spconv/core.py
+++ b/spconv/core.py
@@ -840,7 +840,7 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 16)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
@@ -855,7 +855,7 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 16)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=0,
@@ -1127,7 +1127,7 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 16)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
@@ -1142,7 +1142,7 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 16)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
@@ -1157,13 +1157,13 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 16)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
                      is_nvrtc=True,
                      int8_inference=True),
-    *gen_conv_params(ConvFwdAndBwdInput, (64, 64, 64), (32, 32, 64),
+    *gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (16, 32, 32),
                      NDIM_DONT_CARE,
                      ConvIterAlgo.Optimized,
                      2,
@@ -1172,14 +1172,13 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 32)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
                      is_nvrtc=True,
                      int8_inference=True),
-
-    *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 64),
+    *gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (16, 16, 32),
                      NDIM_DONT_CARE,
                      ConvIterAlgo.Optimized,
                      2,
@@ -1188,14 +1187,13 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 32)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
                      is_nvrtc=True,
                      int8_inference=True),
-
-    *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
+    *gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (32, 16, 32),
                      NDIM_DONT_CARE,
                      ConvIterAlgo.Optimized,
                      2,
@@ -1204,14 +1202,14 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 16)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
                      is_nvrtc=True,
                      int8_inference=True),
-    *gen_conv_params(ConvFwdAndBwdInput, (128, 64, 64), (64, 32, 64),
+    *gen_conv_params(ConvFwdAndBwdInput, (64, 64, 64), (32, 32, 64),
                      NDIM_DONT_CARE,
                      ConvIterAlgo.Optimized,
                      2,
@@ -1220,14 +1218,14 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 32)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
                      is_nvrtc=True,
                      int8_inference=True),
-    # TODO 16,8,32 produce wrong result.
-    *gen_conv_params(ConvFwdAndBwdInput, (128, 64, 32), (64, 32, 32),
+
+    *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 64),
                      NDIM_DONT_CARE,
                      ConvIterAlgo.Optimized,
                      2,
@@ -1236,14 +1234,30 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 16)),
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=True,
+                     int8_inference=True),
+
+    *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2,
+                     ["s8,s8,s8,s32,f32", "s8,s8,s8,s32,f16"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
                      is_nvrtc=True,
                      int8_inference=True),
-    *gen_conv_params(ConvFwdAndBwdInput, (128, 256, 64), (64, 128, 64),
+    *gen_conv_params(ConvFwdAndBwdInput, (128, 64, 64), (64, 32, 64),
                      NDIM_DONT_CARE,
                      ConvIterAlgo.Optimized,
                      2,
@@ -1252,14 +1266,14 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 32)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
                      is_nvrtc=True,
                      int8_inference=True),
-
-    *gen_conv_params(ConvFwdAndBwdInput, (256, 128, 64), (128, 64, 64),
+    # TODO 16,8,32 produce wrong result.
+    *gen_conv_params(ConvFwdAndBwdInput, (128, 64, 32), (64, 32, 32),
                      NDIM_DONT_CARE,
                      ConvIterAlgo.Optimized,
                      2,
@@ -1268,14 +1282,14 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 32)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
                      is_nvrtc=True,
                      int8_inference=True),
-    *gen_conv_params(ConvFwdAndBwdInput, (128, 128, 128), (64, 64, 128),
+    *gen_conv_params(ConvFwdAndBwdInput, (128, 256, 64), (64, 128, 64),
                      NDIM_DONT_CARE,
                      ConvIterAlgo.Optimized,
                      2,
@@ -1284,13 +1298,29 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 32)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
                      is_nvrtc=True,
                      int8_inference=True),
+    *gen_conv_params(ConvFwdAndBwdInput, (256, 128, 64), (128, 64, 64),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2,
+                     ["s8,s8,s8,s32,f32", "s8,s8,s8,s32,f16"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((8, 8, 16)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1,
+                     is_nvrtc=True,
+                     int8_inference=True),
+
     *gen_conv_params(ConvFwdAndBwdInput, (128, 128, 64), (64, 64, 64),
                      NDIM_DONT_CARE,
                      ConvIterAlgo.Optimized,
@@ -1300,7 +1330,7 @@ class AlgoHint(Enum):
                      NHWC,
                      NHWC,
                      GemmAlgo.Turing,
-                     TensorOp((16, 8, 32)),
+                     TensorOp((8, 8, 16)),
                      mask_sparse=True,
                      increment_k_first=True,
                      access_per_vector=1,
diff --git a/test/test_all_algo.py b/test/test_all_algo.py
index 2e440d9..7cc7ed7 100644
--- a/test/test_all_algo.py
+++ b/test/test_all_algo.py
@@ -330,10 +330,10 @@ def _test_impgemm_conv_cuda(subm: bool):
     device = torch.device("cuda:0")
     shapes = [[19, 18, 17]]
     batchsizes = [1]
-    dtypes = [(np.float32, np.float32), (np.float16, np.float16)]
+    # dtypes = [(np.float32, np.float32), (np.float16, np.float16)]
     # dtypes = [np.float16]
     # dtypes = [(np.int8, np.int8), (np.int8, np.float32), (np.int8, np.float16)]
-    # dtypes = [(np.int8, np.int8)]
+    dtypes = [(np.int8, np.int8)]
     # dtypes = [(np.float16, np.float16)]
 
     test_case = TestCase()
@@ -341,6 +341,9 @@ def _test_impgemm_conv_cuda(subm: bool):
     # out_channels = [32, 48, 64]
     in_channels = [32, 47]
     out_channels = [32, 48, 62]
+    in_channels = [16]
+    out_channels = [16]
+
     # in_channels = [16]
     # out_channels = [16]
 
diff --git a/tools/build-wheels-dev.sh b/tools/build-wheels-dev.sh
index a41d6e2..6917095 100755
--- a/tools/build-wheels-dev.sh
+++ b/tools/build-wheels-dev.sh
@@ -26,7 +26,7 @@ function repair_wheel {
 }
 gcc -v
 export SPCONV_DISABLE_JIT="1"
-export CUMM_CUDA_ARCH_LIST="7.5"
+export CUMM_CUDA_ARCH_LIST="8.6"
 # export SPCONV_PYTHON_LIST="3.7;3.8;3.9;3.10"
 # Compile wheels, we only support 3.6-3.10.
 # "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
diff --git a/version.txt b/version.txt
index f90b1af..0bee604 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-2.3.2
+2.3.3
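
A minimal local smoke check for the Turing int8 kernel changes above, as a hedged sketch rather than part of this commit: rebuild the extension for Turing (sm_75) with JIT disabled, then run the test module that this diff switches to the `(np.int8, np.int8)` configuration. The `SPCONV_DISABLE_JIT` and `CUMM_CUDA_ARCH_LIST` variables come from `tools/build-wheels-dev.sh`; the `pip`/`pytest` invocations and the assumption that `test/test_all_algo.py` is directly runnable are about the local environment, not taken from the repo.

```bash
# Hypothetical smoke test for the Turing int8 path; assumes a CUDA toolkit,
# a Turing GPU (sm_75), and the cumm/pccm build dependencies are installed.
export SPCONV_DISABLE_JIT="1"       # same switch used in tools/build-wheels-dev.sh
export CUMM_CUDA_ARCH_LIST="7.5"    # target Turing instead of the 8.6 set in the dev script
pip install -e .                    # rebuild spconv against the updated kernel params
# Run the conv tests; assumes the module is collectable by pytest,
# otherwise execute it directly with `python test/test_all_algo.py`.
python -m pytest test/test_all_algo.py -x
```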