Commit f31eee3

v2.1.3: fix a bug in cpu only

FindDefinition committed Nov 8, 2021
1 parent abd3638
Showing 7 changed files with 41 additions and 10 deletions.
11 changes: 8 additions & 3 deletions README.md
@@ -14,9 +14,13 @@
limitations under the License.
-->

+[pypi-download]: https://img.shields.io/pypi/dm/spconv-cu114
+[pypi-url]: https://pypi.org/project/spconv-cu114/
+[pypi-image]: https://badge.fury.io/py/spconv-cu114.svg

# SpConv: Spatially Sparse Convolution Library
-[![Build Status](https://github.com/traveller59/spconv/workflows/build/badge.svg)](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild)
+[![Build Status](https://github.com/traveller59/spconv/workflows/build/badge.svg)](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild) [![PyPI Version][pypi-image]][pypi-url] [![pypi monthly download][pypi-download]][pypi-url]

```spconv``` is a project that provides a heavily-optimized sparse convolution implementation with Tensor Core support.

@@ -28,7 +32,7 @@ Spconv 1.x users **NEED READ [THIS](docs/SPCONV_2_BREAKING_CHANGEs.md)** before

## Spconv 2.1 vs Spconv 1.x

-* spconv now can be installed by **pip**. see install section in readme for more details.
+* spconv now can be installed by **pip**. see install section in readme for more details. Users don't need to build manually anymore!
* Microsoft Windows support (only windows 10 has been tested).
* fp32 (not tf32) training/inference speed is increased (+50~80%)
* fp16 training/inference speed is greatly increased when your layer support tensor core (channel size must be multiple of 8).
@@ -87,6 +91,7 @@ CUDA 11.1 will be removed in spconv 2.2 because pytorch 1.10 don't provide prebuilt

**NOTE** It's safe to have different **minor** CUDA versions between the system and conda (pytorch) **in Linux**. For example, you can use spconv-cu114 with the anaconda build of pytorch CUDA 11.1 on an OS with CUDA 11.2 installed.

+**NOTE** In Linux, you can install spconv-cuxxx without install CUDA to system! only suitable NVIDIA driver is required. for CUDA 11, we need driver >= 450.82.
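The driver floor stated above (>= 450.82 for the CUDA 11 wheels) is easy to check programmatically. A minimal sketch, assuming a dotted driver-version string such as the one `nvidia-smi` reports; the helper name is illustrative, not part of spconv:

```python
# Hypothetical helper: check whether an NVIDIA driver version string meets
# the minimum required for CUDA 11 wheels (>= 450.82, per the note above).
def driver_supports_cuda11(version: str, minimum=(450, 82)) -> bool:
    # Compare component-wise as integer tuples, e.g. "470.57.02" -> (470, 57, 2).
    parts = tuple(int(p) for p in version.split("."))
    return parts >= minimum
```

Tuple comparison handles version strings with differing numbers of components, which is why no zero-padding is needed.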

### Build from source for development (JIT, recommend)

@@ -147,7 +152,7 @@ You need to rebuild ```cumm``` first if you are build along a CUDA version that

## Note

-The work is done when the author is an employee at Tusimple.
+The work is done when the author is an employee at [Tusimple](https://www.tusimple.com/).

## LICENSE

3 changes: 2 additions & 1 deletion docs/PERFORMANCE_GUIDE.md
@@ -18,11 +18,12 @@

## Short Guide

-* If you train without Tensor Core (i.e. FP32 training), set all ```algo``` in convolution/maxpool to ```ConvAlgo.Native``` manually.
+* If you train without Tensor Core (i.e. FP32 training), set all ```algo``` in convolution/maxpool to ```ConvAlgo.Native``` manually. Default Algorithm is ```ConvAlgo.MaskImplicitGemm```, which is **SLOWER** than ```ConvAlgo.Native``` when use float32. this will be fixed in spconv 2.2.
* If your GPU support Tensor Core, use FP16 (mixed precision training) if possible.
* If you train with mixed precision training (use Tensor Core), you don't need to set algorithm manually.
* Currently fast algorithm only support kernel volume (prod of kernel size) <= 32, so don't use large kernel size.
* make sure your channel size is multiple of 8 when using fp16. multiple of 32 is better.
* spconv 2.x in Windows 10 is 1.5x~2x slower than Linux. use Linux if possible.
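The channel-alignment rule in the guide (multiples of 8 for fp16, ideally 32) can be enforced with a tiny helper when designing a network. A sketch; the function name is illustrative and not part of spconv:

```python
def pad_channels(channels: int, multiple: int = 8) -> int:
    # Round a channel count up to the next multiple: 8 is the fp16 Tensor
    # Core requirement stated above; pass multiple=32 for best utilization.
    return -(-channels // multiple) * multiple
```

For example, a layer planned with 13 channels would be widened to 16 (or to 32 with `multiple=32`) before training in mixed precision.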

Network Benchmark without batchnorm (F32/F16) in RTX 3080 Laptop GPU

4 changes: 2 additions & 2 deletions example/mnist_sparse.py
@@ -65,7 +65,7 @@ def forward(self, x: torch.Tensor):
 def train(args, model, device, train_loader, optimizer, epoch):
     model.train()
     scaler = torch.cuda.amp.grad_scaler.GradScaler()
-    amp_ctx = identity_ctx()
+    amp_ctx = contextlib.nullcontext()
     if args.fp16:
         amp_ctx = torch.cuda.amp.autocast()
     for batch_idx, (data, target) in enumerate(train_loader):
@@ -105,7 +105,7 @@ def test(args, model, device, test_loader):
     model.eval()
     test_loss = 0
     correct = 0
-    amp_ctx = identity_ctx()
+    amp_ctx = contextlib.nullcontext()
     if args.fp16:
         amp_ctx = torch.cuda.amp.autocast()

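The example change swaps a hand-rolled `identity_ctx()` for the standard library's `contextlib.nullcontext()`, a no-op context manager. A minimal sketch of the same control flow, with the `torch.cuda.amp.autocast()` call stubbed out since only the branching pattern matters here:

```python
import contextlib

def run_step(fp16: bool) -> str:
    # Default to a no-op context, as in the diff above; swap in a real
    # autocast context only when fp16 is requested.
    amp_ctx = contextlib.nullcontext()
    if fp16:
        # stands in for torch.cuda.amp.autocast() in this sketch
        amp_ctx = contextlib.nullcontext()
    with amp_ctx:
        return "forward pass ran"
```

Either way the `with` block runs identically, which is why no separate code path is needed for the non-fp16 case.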
20 changes: 20 additions & 0 deletions spconv/cppconstants.py
@@ -0,0 +1,20 @@
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import spconv.core_cc as _ext

if hasattr(_ext, "cumm"):
CPU_ONLY_BUILD = False
else:
CPU_ONLY_BUILD = True
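The new module detects a CPU-only build by probing the compiled extension for a `cumm` attribute with `hasattr`. The same feature-detection pattern can be sketched against a stand-in object (`fake_ext` below is illustrative, not the real `spconv.core_cc` extension):

```python
import types

# Stand-in for spconv.core_cc: a module-like object that, like a CPU-only
# build, does not expose a "cumm" attribute.
fake_ext = types.SimpleNamespace()
cpu_only_build = not hasattr(fake_ext, "cumm")  # True for this stand-in

# A GPU build would expose the attribute, flipping the check.
fake_ext.cumm = object()
gpu_build_detected = hasattr(fake_ext, "cumm")
```

Probing the extension at import time lets the pure-Python layers branch on `CPU_ONLY_BUILD` without ever importing CUDA-specific symbols.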
6 changes: 4 additions & 2 deletions spconv/pytorch/conv.py
@@ -26,6 +26,7 @@
 from spconv.core import ConvAlgo
 import spconv.pytorch.functional as Fsp
 from spconv.pytorch import ops
+from spconv.cppconstants import CPU_ONLY_BUILD
 from spconv.pytorch.core import IndiceData, SparseConvTensor, ImplicitGemmIndiceData
 from spconv.pytorch.modules import SparseModule
 from spconv.constants import FILTER_HWIO
@@ -117,7 +118,7 @@ def __init__(self,
         self.subm = subm
         self.indice_key = indice_key
         if algo is None:
-            if kv <= 32:
+            if kv <= 32 and not CPU_ONLY_BUILD:
                 if kv < 8:
                     algo = ConvAlgo.MaskImplicitGemm
                 else:
@@ -126,7 +127,8 @@
             algo = ConvAlgo.Native
         if kv > 32:
             assert algo == ConvAlgo.Native, "implicit gemm don't support kv >= 32 for now"
-
+        if CPU_ONLY_BUILD:
+            assert algo == ConvAlgo.Native, "cpu only build only support native algorithm"
         self.algo = algo
         # self.algo = ConvAlgo.Native
         if self.algo == ConvAlgo.Native:
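The default-algorithm selection this commit patches can be mirrored as a standalone function. A sketch under two assumptions: the `ConvAlgo` enum below is a stand-in for `spconv.core.ConvAlgo`, and the `else` branch elided by the truncated hunk is assumed to pick a split implicit-GEMM variant:

```python
from enum import Enum

class ConvAlgo(Enum):  # stand-in for spconv.core.ConvAlgo
    Native = 0
    MaskImplicitGemm = 1
    MaskSplitImplicitGemm = 2

def default_algo(kv: int, cpu_only: bool) -> ConvAlgo:
    # Mirrors the fixed logic above: implicit-GEMM variants are chosen only
    # for kernel volume <= 32 on GPU builds; everything else falls back to
    # Native, which is the sole algorithm a CPU-only build supports.
    if kv <= 32 and not cpu_only:
        if kv < 8:
            return ConvAlgo.MaskImplicitGemm
        return ConvAlgo.MaskSplitImplicitGemm  # assumed elided branch
    return ConvAlgo.Native
```

This makes the bug being fixed visible: before the `and not cpu_only` guard, a CPU-only build with small `kv` would select an implicit-GEMM algorithm it cannot run.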
5 changes: 4 additions & 1 deletion spconv/pytorch/pool.py
@@ -28,6 +28,7 @@
 from spconv.pytorch import ops
 from spconv.pytorch.core import IndiceData, ImplicitGemmIndiceData
 from spconv.pytorch.modules import SparseModule
+from spconv.cppconstants import CPU_ONLY_BUILD


class SparseMaxPool(SparseModule):
@@ -63,7 +64,7 @@ def __init__(self,
         if algo is None:
             # keep in mind that this algorithm is set for Inverse Sparse Conv
             # maxpool itself don't need mask.
-            if kv <= 32:
+            if kv <= 32 and not CPU_ONLY_BUILD:
                 if kv < 8:
                     algo = ConvAlgo.MaskImplicitGemm
                 else:
@@ -72,6 +73,8 @@
             algo = ConvAlgo.Native
         if kv > 32:
             assert algo == ConvAlgo.Native, "implicit gemm don't support kv >= 32 for now"
+        if CPU_ONLY_BUILD:
+            assert algo == ConvAlgo.Native, "cpu only build only support native algorithm"

         self.algo = algo

2 changes: 1 addition & 1 deletion version.txt
@@ -1 +1 @@
-2.1.2
+2.1.3
