diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
deleted file mode 100644
index c14d1cb..0000000
--- a/.github/workflows/build.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-name: build
-
-on:
- push:
- branches:
- - master
- schedule:
- # * is a special character in YAML so you have to quote this string
- - cron: '0 0 * * 6' # base builds run every saturday
-
-jobs:
- build:
- runs-on: ubuntu-latest
- env:
- DOCKER_IMAGE_NAME: scrin/dev-spconv
- DOCKER_FILE_PATH: ./Dockerfile
-
- # TODO: create a action to reuse code. the problem is how to reuse docker-login.
- steps:
- - uses: actions/checkout@master
- - name: Build Docker
- run: |
- docker build . --file ${{env.DOCKER_FILE_PATH}} --tag ${{env.DOCKER_IMAGE_NAME}}:latest
- docker tag ${{env.DOCKER_IMAGE_NAME}}:latest ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
-
-
- - name: Login to Registry
- uses: azure/docker-login@v1
- with:
- username: ${{ secrets.DOCKER_USERNAME }}
- password: ${{ secrets.DOCKER_PASSWORD }}
-
- - name: Publish to Registry
- run: |
- docker push ${{env.DOCKER_IMAGE_NAME}}:latest
- docker push ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
diff --git a/.gitignore b/.gitignore
index a68262c..657dd08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -107,3 +107,5 @@ venv.bak/
.mypy_cache/
.vscode
+
+__version__.py
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
index 63f9714..e69de29 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +0,0 @@
-[submodule "third_party/pybind11"]
- path = third_party/pybind11
- url = https://github.com/pybind/pybind11.git
-[submodule "third_party/cutlass"]
- path = third_party/cutlass
- url = https://github.com/NVIDIA/cutlass
-[submodule "third_party/mp11"]
- path = third_party/mp11
- url = https://github.com/boostorg/mp11
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ff4fa57..8a3cfcf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog
+## [2.0.0] - 2021-10-16
+### Changed
+- Changed the build system from CMake to pccm.
+- Moved the pytorch python code to spconv.pytorch.
+- Rewrote all C++ code.
+
## [1.2.1] - 2020-06-04
### Changed
- The subm indice pair generation speed is greatly increased by two tricks: 1. most subm conv use only kernelsize=3, so we can unroll loops to get 100% performance increase. 2. subm indice pairs have a property: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1], so we can get another 100% performance increase.
diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 6be70cf..0000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
-
-option(SPCONV_BuildTests "Build the unit tests when BUILD_TESTING is enabled." ON)
-option(SPCONV_BuildCUDA "Build cuda code when BUILD_TESTING is enabled." ON)
-if (SPCONV_BuildCUDA)
- project(SparseConv LANGUAGES CXX CUDA VERSION 1.1)
-else()
- project(SparseConv LANGUAGES CXX VERSION 1.1)
-endif()
-
-if(WIN32) # true if windows (32 and 64 bit)
- add_compile_definitions(TV_WINDOWS)
-endif()
-add_compile_definitions(PYTORCH_VERSION=${PYTORCH_VERSION})
-
-set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
- add_compile_definitions(TV_DEBUG)
-endif()
-# add_compile_definitions(TV_LOG_KERNEL_INFO)
-
-find_package(Torch REQUIRED)
-# set(CMAKE_VERBOSE_MAKEFILE ON)
-if (SPCONV_BuildCUDA)
- set(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}")
- get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
- get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
- if(WIN32) # true if windows (32 and 64 bit)
- set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
- else()
- set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
- endif()
- find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS})
- find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS})
- torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA)
- string (REPLACE ";" " " NVCC_FLAGS_EXTRA_STR "${NVCC_FLAGS_EXTRA}")
- set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA_STR}")
- add_compile_definitions(TV_CUDA)
-endif()
-# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
-add_subdirectory(third_party/pybind11)
-
-set(ALL_LIBS ${TORCH_LIBRARIES})
-
-set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include)
-set(MP11_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/mp11/include)
-set(CUTLASS_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include)
-
-if (SPCONV_BuildCUDA)
- set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS})
- set(ALL_INCLUDE ${ALL_INCLUDE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
- add_subdirectory(src/cuhash)
- add_subdirectory(src/spgemm)
-endif()
-add_subdirectory(src/spconv)
-add_subdirectory(src/utils)
-
-if (SPCONV_BuildTests)
- include(CTest) #adds option BUILD_TESTING (default ON)
- if(BUILD_TESTING)
- enable_testing()
- add_subdirectory(test)
- endif()
-endif()
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 007ba79..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM scrin/dev:latest
-
-RUN PROBLEM_FILE=/usr/local/lib/python3.8/dist-packages/torch/share/cmake/Caffe2/Caffe2Targets.cmake && \
- sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces;-fopenmp//g' $PROBLEM_FILE && \
- sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces//g' $PROBLEM_FILE && \
- cd /root && \
- git clone --depth 1 --recursive https://www.github.com/traveller59/spconv.git && \
- cd ./spconv && \
- SPCONV_FORCE_BUILD_CUDA=1 python setup.py install
diff --git a/LICENSE b/LICENSE
index b131473..ba46142 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.
- Copyright 2019-2020 Yan Yan
+ Copyright 2019-2021 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/PERFORMANCE_GUIDE.md b/PERFORMANCE_GUIDE.md
deleted file mode 100644
index 54bef8f..0000000
--- a/PERFORMANCE_GUIDE.md
+++ /dev/null
@@ -1,31 +0,0 @@
-## Performance Guide
-
-### 1. Regular sparse conv is very slow
-
-Regular sparse convolution will greatly increase the number of active points. for 3x3x3 3D convolution, we can get at most 27x active points, which means next convolution will perform 27x slower!
-
-This problem can be solved by using submanifold convolution (SubMConv3d). This kind of sparse convolution doesn't generate new active points.
-
-**NEVER** use SparseConv3d except downsample data, **NEVER** use SparseConv3dTranspose, use SparseInverseConv3d instead.
-
-### 2. Large Spatial Shape cost too much GPU memory
-
-Our implementation use dense map to generate indices in GPU for sparse convolution, which means if your spatial shape is ```[batchSize=4, 1600, 1600, 40]```, it will cost ~2GB GPU memory.
-
-To solve this problem, you can use CPU algorithm (hash map) for first layer that has large shape, then convert generated indices to GPU and use GPU algorithm for downsampled data.
-
-Another way is use cuda hash. Unfortunately this library isn't stable enough, it should only be used when the spatial shape is very large.
-
-### 3. Stacked submanifold convolution can share same indice data
-
-When you using stacked subm convolution, there is no need to generate indice data again, but this can't be done automatically. you need to specify a unique key ```indice_key="c0"``` and use it for all stacked subm convolution.
-
-### 4. Different convolution algorithm may lead to different performance
-
-There are three kind of algorithm: ```Native```, ```Batch```, ```BatchGemmGather```.
-
-* ```Native```: should be used for all submanifold convolutions. should be used when there are too much active points.
-
-* ```Batch```: **cost more GPU memory** should be used when number of active points is small.
-
-* ```BatchGemmGather```: **cost more GPU memory** can be used for regular convolution.
\ No newline at end of file
diff --git a/README.md b/README.md
index 1e95261..c4b671c 100644
--- a/README.md
+++ b/README.md
@@ -1,173 +1,95 @@
+
+
# SpConv: PyTorch Spatially Sparse Convolution Library
[](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild)
-This is a spatially sparse convolution library like [SparseConvNet](https://github.com/facebookresearch/SparseConvNet) but faster and easy to read. This library provide sparse convolution/transposed, submanifold convolution, inverse convolution and sparse maxpool.
-
-
-2020-5-2, we add ConcatTable, JoinTable, AddTable, and Identity function to build ResNet and Unet in this version of spconv.
-
-
-## Docker:
-
-```docker pull scrin/dev-spconv```, contains python 3.8, cuda 10.1, fish shell, newest pytorch and tensorflow.
-
-## Install on Ubuntu 16.04/18.04
-
-* if you are using pytorch 1.4+ and encounter "nvcc fatal: unknown -Wall", you need to go to torch package dir and remove flags contains "-Wall" in INTERFACE_COMPILE_OPTIONS in Caffe2Targets.cmake. This problem can't be fixed in this project (to avoid this, I need to remove all torch dependency in cuda sources and drop half support).
-
-0. Use ```git clone xxx.git --recursive``` to clone this repo.
-
-1. Install boost headers to your system include path, you can use either ```sudo apt-get install libboost-all-dev``` or download compressed files from boost official website and copy headers to include path.
-
-2. Download cmake >= 3.13.2, then add cmake executables to PATH.
-
-3. Ensure you have installed pytorch 1.0+ in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```).
-
-4. Run ```cd ./dist```, use pip to install generated whl file.
-
-## Install on Windows 10 with CUDA 10 and python 3.6 (python 3.7 may have problem, see [this](https://github.com/pytorch/pytorch/issues/17233))
-
-Since install newest driver and CUDA is very simple on windows, please use CUDA 10 on windows.
-
-0. Install Visual Studio 2017. Use ```git clone xxx.git --recursive``` to clone this repo.
-
-1. Download compressed files from boost official website and copy headers (i.e. boost_1_69/boost) to spconv/include.
-
-2. Download and install cmake >= 3.13.2, select add cmake to User or System PATH.
-
-3. Ensure you have installed pytorch 1.0 in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```).
-
-4. Run ```cd ./dist```, use pip to install generated whl file.
-
-## Compare with SparseConvNet
-
-### Features
-
-* SparseConvNet's Sparse Convolution don't support padding and dilation, spconv support this.
-
-* spconv only contains sparse convolutions, the batchnorm and activations can directly use layers from torch.nn, SparseConvNet contains lots of their own implementation of layers such as batchnorm and activations.
-
-### Speed
-
-* spconv is faster than SparseConvNet due to gpu indice generation and gather-gemm-scatter algorithm. SparseConvNet use hand-written gemm which is slow.
-
-## Usage
-
-### SparseConvTensor
-
-```Python
-features = # your features with shape [N, numPlanes]
-indices = # your indices/coordinates with shape [N, ndim + 1], batch index must be put in indices[:, 0]
-spatial_shape = # spatial shape of your sparse tensor, spatial_shape[i] is shape of indices[:, 1 + i].
-batch_size = # batch size of your sparse tensor.
-x = spconv.SparseConvTensor(features, indices, spatial_shape, batch_size)
-x_dense_NCHW = x.dense() # convert sparse tensor to dense NCHW tensor.
-print(x.sparity) # helper function to check sparity.
-```
-
-### Sparse Convolution
-
-```Python
-import spconv
-from torch import nn
-class ExampleNet(nn.Module):
- def __init__(self, shape):
- super().__init__()
- self.net = spconv.SparseSequential(
- spconv.SparseConv3d(32, 64, 3), # just like nn.Conv3d but don't support group and all([d > 1, s > 1])
- nn.BatchNorm1d(64), # non-spatial layers can be used directly in SparseSequential.
- nn.ReLU(),
- spconv.SubMConv3d(64, 64, 3, indice_key="subm0"),
- nn.BatchNorm1d(64),
- nn.ReLU(),
- # when use submanifold convolutions, their indices can be shared to save indices generation time.
- spconv.SubMConv3d(64, 64, 3, indice_key="subm0"),
- nn.BatchNorm1d(64),
- nn.ReLU(),
- spconv.SparseConvTranspose3d(64, 64, 3, 2),
- nn.BatchNorm1d(64),
- nn.ReLU(),
- spconv.ToDense(), # convert spconv tensor to dense and convert it to NCHW format.
- nn.Conv3d(64, 64, 3),
- nn.BatchNorm1d(64),
- nn.ReLU(),
- )
- self.shape = shape
-
- def forward(self, features, coors, batch_size):
- coors = coors.int() # unlike torch, this library only accept int coordinates.
- x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
- return self.net(x)# .dense()
-```
+# WORK IN PROGRESS, DON'T USE!!!
-### Inverse Convolution
+## Breaking changes in Spconv 2.x
-Inverse sparse convolution means "inv" of sparse convolution. the output of inverse convolution contains same indices as input of sparse convolution.
+* ```spconv.xxx``` has moved to ```spconv.pytorch.xxx```; change all ```import spconv``` to ```import spconv.pytorch as spconv``` and ```from spconv.xxx import``` to ```from spconv.pytorch.xxx import``` (see the migration sketch below).
+* ```use_hash``` in sparse convolution is removed; 2.x always uses a hash table.
+* The weight layout has been changed to RSKC (native algorithm) or KRSC (implicit GEMM); it is no longer RSCK as in spconv 1.x. RS is the kernel size, C is the input channel count, K is the output channel count.
+* All util ops are removed (pillar scatter/nms/...).
+* ```VoxelGenerator``` has been replaced by ```Point2VoxelGPU[1-4]d```/```Point2VoxelCPU[1-4]d``` (see ```example/voxel_gen.py```).
+* spconv 2.x doesn't support CPU for now.
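+
+A minimal migration sketch of the import change and the new weight layout (the permute below is only an illustration of RSCK vs. RSKC for a hypothetical 3x3x3 kernel, not an official conversion utility):
+
+```Python
+# spconv 1.x style (no longer works):
+#   import spconv
+# spconv 2.x style:
+import spconv.pytorch as spconv
+import torch
+
+# hypothetical spconv 1.x weight in RSCK layout: [kD, kH, kW, C=32, K=64]
+w_rsck = torch.empty(3, 3, 3, 32, 64)
+# native-algorithm RSKC layout in 2.x: [kD, kH, kW, K, C]
+w_rskc = w_rsck.permute(0, 1, 2, 4, 3).contiguous()
+```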
-Inverse convolution usually used in semantic segmentation.
+## News in Spconv 2.0.0
-```Python
-class ExampleNet(nn.Module):
- def __init__(self, shape):
- super().__init__()
- self.net = spconv.SparseSequential(
- spconv.SparseConv3d(32, 64, 3, 2, indice_key="cp0"),
- spconv.SparseInverseConv3d(64, 32, 3, indice_key="cp0"), # need provide kernel size to create weight
- )
- self.shape = shape
+* Training/inference speed is increased.
+* Int8/tensor core support.
+* No longer depends on the pytorch binary.
+* If your GPU has tensor cores, try mixed precision training in spconv 2.x (see the sketch below)!
+* Since spconv 2.x doesn't depend on the pytorch binary (and never will), torch.jit/libtorch inference can't be supported.
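+
+A minimal mixed-precision sketch (assumes you already built a spconv-based module ```net``` and a ```SparseConvTensor``` ```x```; the optimizer step is omitted):
+
+```Python
+import torch
+
+scaler = torch.cuda.amp.GradScaler()
+with torch.cuda.amp.autocast():
+    out = net(x)               # spconv layers can run under autocast
+    loss = out.features.sum()  # SparseConvTensor keeps its data in .features
+scaler.scale(loss).backward()
+# a real loop would follow with scaler.step(optimizer); scaler.update()
+```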
- def forward(self, features, coors, batch_size):
- coors = coors.int()
- x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
- return self.net(x)
-```
+## TODO in Spconv 2.x
+- [ ] Ampere (A100 / RTX 3000 series) feature support (work in progress)
+- [ ] torch QAT support (work in progress)
+- [ ] TensorRT (torch.fx based)
+- [ ] Build C++ only package
+- [ ] JIT compilation for CUDA kernels
+- [ ] Documentation (low priority)
+- [ ] CPU support (low priority)
-### Utility functions
+## Install
-* convert point cloud to voxel
+You need to install python >= 3.6 first to use spconv 2.x.
-```Python
+You need to install the CUDA toolkit before using the prebuilt binaries or building from source.
-voxel_generator = spconv.utils.VoxelGenerator(
- voxel_size=[0.1, 0.1, 0.1],
- point_cloud_range=[-50, -50, -3, 50, 50, 1],
- max_num_points=30,
- max_voxels=40000
-)
+You need at least CUDA 10.2 to build and run spconv 2.x. We won't offer any support for CUDA < 10.2.
-points = # [N, 3+] tensor.
-voxels, coords, num_points_per_voxel = voxel_generator.generate(points)
-```
+### Prebuilt
-## Implementation Details
+We offer Python 3.6-3.10 and CUDA 10.2/11.1/11.4 prebuilt binaries for Linux (manylinux) and Windows 10/11.
-This implementation use gather-gemm-scatter framework to do sparse convolution.
+We will offer prebuilts for the CUDA versions supported by the latest pytorch release. For example, pytorch 1.9 supports CUDA 10.2 and 11.1, so we support them too.
-## Projects using spconv:
+For Linux users, you need to install pip >= 20.3 first to install the prebuilt wheels.
-* [second.pytorch](https://github.com/traveller59/second.pytorch): Point Cloud Object Detection in KITTI Dataset.
+```pip install spconv-cu102``` for CUDA 10.2
-## Authors
+```pip install spconv-cu111``` for CUDA 11.1
-* **Yan Yan** - *Initial work* - [traveller59](https://github.com/traveller59)
+```pip install spconv-cu114``` for CUDA 11.4
-* **Bo Li** - *gpu indice generation idea, owner of patent of the sparse conv gpu indice generation algorithm (don't include subm)* - [prclibo](https://github.com/prclibo)
+### Build from source
-## Third party libraries
+You need to rebuild ```cumm``` first if you are building against a CUDA version that is not covered by the prebuilts.
-* [CUDPP](https://github.com/cudpp/cudpp): A cuda library. contains a cuda hash implementation.
+#### Linux
-* [robin-map](https://github.com/Tessil/robin-map): A fast c++ hash library. almost 2x faster than std::unordered_map in this project.
+1. install build-essential, install CUDA
+2. run ```export SPCONV_DISABLE_JIT="1"```
+3. run ```python setup.py install```/```pip install -e .```/```python setup.py bdist_wheel```+```pip install dist/xxx.whl```
-* [pybind11](https://github.com/pybind/pybind11): A head-only python c++ binding library.
+#### Windows 10/11
-* [prettyprint](https://github.com/louisdx/cxx-prettyprint): A head-only library for container print.
+1. install Visual Studio 2019 or newer. make sure the C++ development workload is installed. install CUDA
+2. set [powershell script execution policy](https://docs.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_execution_policies?view=powershell-7.1)
+3. start a new powershell, run ```tools/msvc_setup.ps1```
+4. run ```$Env:SPCONV_DISABLE_JIT = "1"```
+5. run ```python setup.py install```/```pip install -e .```/```python setup.py bdist_wheel```+```pip install dist/xxx.whl```
-## License
+## Note
-This project is licensed under the Apache license 2.0 License - see the [LICENSE.md](LICENSE.md) file for details
+This work was done while the author was an employee at Tusimple.
-The [CUDPP](https://github.com/cudpp/cudpp) hash code is licensed under BSD License.
+## LICENSE
-The [robin-map](https://github.com/Tessil/robin-map) code is licensed under MIT license.
+Apache 2.0
\ No newline at end of file
diff --git a/codeai-devops.yaml b/codeai-devops.yaml
deleted file mode 100644
index 47afa8c..0000000
--- a/codeai-devops.yaml
+++ /dev/null
@@ -1,116 +0,0 @@
-global:
- console_url: localhost:50091
- envs:
- PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python # c++ libprotobuf and python will conflicit
-
-analyzers: # only one analyzer is allowed for one type for now.
- PythonAnalyzer:
-
- SimpleCPPAnalyzer: # $ devops.devs = ["_ci_dev_xxx"] is allowed in raw sources.
- includes: ["*.cpp", "*.cu", "*.cc", "*.h", "*.hpp", "*.hxx", "*.cxx"]
-
-observers:
- # run test functions when that function change or marked function change.
- test:
- type: TestObserver
-
- # run dev functions when that function change or marked function change.
- dev:
- type: DevObserver
- pattern: _ci_dev_.*
-
- clangdev:
- type: CPPDevObserver
- main_pattern: dev_.*\.(cc|cpp|cxx)
- pattern: .*\.(cc|cpp|cxx|h|hpp|hxx)
- compiler: clang++
- executable: build/codeai_dev
- includes: [
- include,
- /usr/local/cuda/include,
- /home/yy/anaconda3/include,
- /home/yy/anaconda3/include/python3.7m,
- third_party/pybind11/include,
- third_party/include,
- /home/yy/library/boost_1_72_0,
- ]
- libpaths: [
- /home/yy/anaconda3/lib,
- ]
- libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg]
- std: c++14
- options: [-Wall, -Wextra]
-
- cudadev:
- type: CPPDevObserver
- main_pattern: dev_.*\.cu
- pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
- compiler: nvcc
- executable: build/codeai_dev_cuda
- run_cmd: [$(executable)]
- sources: []
- includes: [
- include,
- /usr/local/cuda/include,
- /home/yy/anaconda3/include,
- /home/yy/anaconda3/include/python3.7m,
- third_party/pybind11/include,
- third_party/cutlass/include,
- ]
- libpaths: [
- /usr/local/cuda/lib64,
- /home/yy/anaconda3/lib,
- ]
- libraries: [-lpython3.7m, -lcudart, -lcublas, -ljpeg]
- std: c++14
- options: [
- -Wno-deprecated-declarations,
- # "-gencode=arch=compute_52,code=sm_52",
- "-gencode=arch=compute_61,code=sm_61",
- # "-gencode=arch=compute_60,code=sm_60",
- # "-gencode=arch=compute_70,code=sm_70",
- # "-gencode=arch=compute_75,code=sm_75",
- ]
-
- torchdev:
- type: CPPDevObserver
- main_pattern: torchdev_.*\.(cu|cpp|cc|cxx)
- pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
- compiler: nvcc
- executable: build/codeai_dev_torch
- run_cmd: [$(executable)]
- fail_cmds: # run cmd when pervious run fail with retcode
- -6: [gdb, -ex, run, -ex, bt, -ex, quit, $(executable)] # segfault in unix
- includes: [
- include,
- /home/yy/anaconda3/lib/python3.7/site-packages/torch/include,
- /home/yy/anaconda3/lib/python3.7/site-packages/torch/include/torch/csrc/api/include,
- /usr/local/cuda/include,
- /home/yy/anaconda3/include,
- /home/yy/anaconda3/include/python3.7m,
- third_party/pybind11/include,
- third_party/cutlass/include,
- ]
- libpaths: [
- /home/yy/anaconda3/lib/python3.7/site-packages/torch/lib,
- /usr/local/cuda/lib64,
- /home/yy/anaconda3/lib,
- ]
- libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg, -lpthread,
- "-Xcompiler=\"-Wl,--no-as-needed,-lc10\"",
- "-Xcompiler=\"-Wl,--no-as-needed,-ltorch\"",
- "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cpu\"",
- "-Xcompiler=\"-Wl,--no-as-needed,-lc10_cuda\"",
- "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cuda\""]
- std: c++14
- # options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
-
- options: [
- -Wno-deprecated-declarations,
- --expt-relaxed-constexpr,
- "-gencode=arch=compute_61,code=sm_61",
- -D_GLIBCXX_USE_CXX11_ABI=0,
- ]
-
-
-
diff --git a/docs/API.md b/docs/API.md
new file mode 100644
index 0000000..820fea0
--- /dev/null
+++ b/docs/API.md
@@ -0,0 +1,16 @@
+
+
diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md
new file mode 100644
index 0000000..820fea0
--- /dev/null
+++ b/docs/DEVELOPMENT.md
@@ -0,0 +1,16 @@
+
+
diff --git a/docs/PERFORMANCE_GUIDE.md b/docs/PERFORMANCE_GUIDE.md
new file mode 100644
index 0000000..820fea0
--- /dev/null
+++ b/docs/PERFORMANCE_GUIDE.md
@@ -0,0 +1,16 @@
+
+
diff --git a/example/mnist_sparse.py b/example/mnist_sparse.py
index 5830c8b..537d26c 100644
--- a/example/mnist_sparse.py
+++ b/example/mnist_sparse.py
@@ -1,7 +1,21 @@
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import print_function
import argparse
import torch
-import spconv
+import spconv.pytorch as spconv
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
diff --git a/example/voxel_gen.py b/example/voxel_gen.py
new file mode 100644
index 0000000..2c910f8
--- /dev/null
+++ b/example/voxel_gen.py
@@ -0,0 +1,38 @@
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from cumm import tensorview as tv
+from spconv.utils import Point2VoxelCPU3d
+
+
+def main():
+ gen = Point2VoxelCPU3d(
+ vsize_xyz=[0.1, 0.1, 0.1],
+ coors_range_xyz=[-80, -80, -2, 80, 80, 6],
+ num_point_features=3,
+ max_num_voxels=5000,
+ max_num_points_per_voxel=5)
+
+ pc = np.random.uniform(-10, 10, size=[1000, 3])
+ pc_tv = tv.from_numpy(pc)
+ # generate voxels. note that voxels_tv references a persistent buffer owned
+ # by the generator, so copy the result out before the next call and don't
+ # run generation from multiple threads.
+ voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel(pc_tv)
+ # run voxel gen again, this time filling the empty remainder of each voxel
+ # with the mean of the points inside it.
+ voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel_empty_mean(pc_tv)
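+ # to inspect the results from python, a tv.Tensor can be viewed as a numpy
+ # array (this assumes cumm's zero-copy numpy_view() accessor)
+ voxels_np = voxels_tv.numpy_view()
+ indices_np = indices_tv.numpy_view()
+ print(voxels_np.shape, indices_np.shape)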
+
+if __name__ == "__main__":
+ main()
diff --git a/include/cuhash/cuda_util.h b/include/cuhash/cuda_util.h
deleted file mode 100644
index c3ee1ca..0000000
--- a/include/cuhash/cuda_util.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef _CUDA_UTIL_H_
-#define _CUDA_UTIL_H_
-
-#if CUDART_VERSION >= 4000
-#define CUDA_DEVICE_SYNCHRONIZE() cudaDeviceSynchronize();
-#else
-#define CUDA_DEVICE_SYNCHRONIZE() cudaThreadSynchronize();
-#endif
-
-#define CUDA_SAFE_CALL_NO_SYNC(call) \
- { \
- cudaError err = call; \
- if (cudaSuccess != err) { \
- fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \
- __LINE__, cudaGetErrorString(err)); \
- exit(EXIT_FAILURE); \
- } \
- }
-
-#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call);
-
-//! Check for CUDA error
-#ifdef _DEBUG
-#define CUDA_CHECK_ERROR(errorMessage) \
- { \
- cudaError_t err = cudaGetLastError(); \
- if (cudaSuccess != err) { \
- fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
- errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
- exit(EXIT_FAILURE); \
- } \
- err = CUDA_DEVICE_SYNCHRONIZE(); \
- if (cudaSuccess != err) { \
- fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
- errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
- exit(EXIT_FAILURE); \
- } \
- }
-#else
-#define CUDA_CHECK_ERROR(errorMessage) \
- { \
- cudaError_t err = cudaGetLastError(); \
- if (cudaSuccess != err) { \
- fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
- errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
- exit(EXIT_FAILURE); \
- } \
- }
-#endif
-
-#endif
\ No newline at end of file
diff --git a/include/cuhash/debugging.h b/include/cuhash/debugging.h
deleted file mode 100644
index 22d4c1c..0000000
--- a/include/cuhash/debugging.h
+++ /dev/null
@@ -1,77 +0,0 @@
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file
- * debugging.h
- *
- * @brief Debugging/statistics/performance utilities header for hash tables.
- */
-
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H
-#define CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H
-
-#include "definitions.h"
-#include
-#include
-
-#include
-
-namespace cuhash {
-
-//! @name Debugging functions
-/// @{
-
-void TakeHashFunctionStatistics(const unsigned num_keys, const unsigned *d_keys,
- const unsigned table_size,
- const uint2 *constants,
- const unsigned kNumHashFunctions);
-
-//! Output how many probes were required by each thread to perform the
-//! retrieval.
-/*! @param[in] n_queries Number of queries being performed.
- * @param[in] d_retrieval_probes Device array: the number of probes taken for
- * each thread's retrieval.
- * @param[in] n_functions Number of hash functions used.
- */
-void OutputRetrievalStatistics(const unsigned n_queries,
- const unsigned *d_retrieval_probes,
- const unsigned n_functions);
-
-//! Outputs information about how many iterations threads required to
-//! successfully cuckoo hash.
-/*! @param[in] n Number of keys in the input.
- * @param[in] d_iterations_taken Device mem: Number of iterations each
- * thread took.
- * @param[in] d_max_iterations_taken Device mem: Largest number of iterations
- * taken by any thread.
- */
-void OutputBuildStatistics(const unsigned n,
- const unsigned *d_iterations_taken);
-
-//! Prints out the contents of the stash.
-void PrintStashContents(const Entry *d_stash);
-
-//! Checks if a key is assigned the same slot by different hash functions.
-bool CheckAssignedSameSlot(const unsigned N, const unsigned num_keys,
- const unsigned *d_keys, const unsigned table_size,
- uint2 *constants);
-
-/// @}
-
-}; // namespace cuhash
-
-#endif
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
diff --git a/include/cuhash/definitions.h b/include/cuhash/definitions.h
deleted file mode 100644
index 658fb87..0000000
--- a/include/cuhash/definitions.h
+++ /dev/null
@@ -1,116 +0,0 @@
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file definitions.h
- *
- * @brief Stores configuration flags and definitions for hard-coded values in
- * hash table implementations.
- */
-
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H
-#define CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H
-
-#include <cstdio>
-#include <cstdlib>
-#include <limits>
-
-/* --------------------------------------------------------------------------
- Debugging.
- -------------------------------------------------------------------------- */
-#ifdef _DEBUG
-//! Forces the hash functions to generate a full set of slots for each key when
-//! not using subtables.
-// #define FORCEFULLY_GENERATE_NO_CYCLES
-
-//! Count how many iterations are taken to insert/find items.
-#define TRACK_ITERATIONS
-
-//! Count how many items fail to be inserted when the hash table fails to build.
-#define COUNT_UNINSERTED
-
-//! Take some statistics on the hash functions.
-#define TAKE_HASH_FUNCTION_STATISTICS
-
-#ifdef TAKE_HASH_FUNCTION_STATISTICS
-//! Determine how many keys hash into each table slot.
-#define COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
-
-//! Determine how many unique slots a key is assigned.
-#define COUNT_HOW_MANY_HAVE_CYCLES
-#endif
-#endif
-
-#ifdef USE_DAN_OUTPUT
-#include
-//! Logs any error messages.
-inline void PrintMessage(const char *message, const bool error = false) {
- PrintIndentedMessage(message, error);
-}
-#else
-//! Prints a message out to the console.
-inline void PrintMessage(const char *message, const bool error = false) {
- if (error) {
- printf("cudahash: %s\n", message);
- } else {
- printf("%s\n", message);
- }
-}
-#endif
-
-/* -------------------------------------------------------------------------
- Hash table constants and definitions.
- ------------------------------------------------------------------------- */
-namespace cuhash {
-
-/**
- * \addtogroup cudpp_hash_data_structures
- *
- * @{
- */
-
-typedef unsigned long long
- Entry; //!< A key and its value are stored in a 64-bit number. The key is
- //!< stored in the upper 32 bits.
-
-const unsigned kMaxRestartAttempts = 10; //!< Number of build attempts.
-const unsigned kKeyEmpty = 0xffffffffu; //!< Signifies empty slots in the table.
-const unsigned kNotFound =
- 0xffffffffu; //!< Signifies that a query key was not found.
-const unsigned kMaxHashFunctions =
- 5; //!< Maximum number of hash functions allowed.
-const unsigned kStashSize =
- 101; //!< How many slots the stash hash table contains.
-
-//! Value indicating that a hash table slot has no valid item within it.
-const Entry kEntryEmpty = Entry(kKeyEmpty) << 32;
-
-//! Value returned when a query fails.
-const Entry kEntryNotFound = (Entry(kKeyEmpty) << 32) + kNotFound;
-
-//! Number of threads to put in a thread block.
-const unsigned kBlockSize = 64;
-
-//! Number of blocks to put along each axis of the grid.
-const unsigned kGridSize = 16384;
-
-//! Minimum table sizes for 2 through 5 functions.
-const float kMinimumSpaceUsages[] = {std::numeric_limits<float>::max(),
- std::numeric_limits<float>::max(),
- 2.01f,
- 1.1f,
- 1.03f,
- 1.02f};
-
-/** @} */ // end cudpp_hash_data_structures
-
-}; // namespace cuhash
-
-#endif
diff --git a/include/cuhash/hash_functions.h b/include/cuhash/hash_functions.h
deleted file mode 100644
index 31ce6f7..0000000
--- a/include/cuhash/hash_functions.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*! @file hash_functions.h
- * @brief Hash function code.
- */
-
-#ifndef HASH_FUNCTIONS__H
-#define HASH_FUNCTIONS__H
-
-#include "definitions.h"
-#include
-#include
-
-namespace cuhash {
-
-//! Prime number larger than the largest practical hash table size.
-const unsigned kPrimeDivisor = 4294967291u;
-// https://www.alpertron.com.ar/ECM.HTM
-// const unsigned long kPrimeDivisor = 18446744073709551557lu
-// const long kPrimeDivisor = 9223372036854775783l
-// const Entry kPrimeDivisor = 4300000013lu;
-// const unsigned kPrimeDivisor = 334214459;
-
-//! Generates a set of linear hash function constants.
-/*! @param[in] N Number of hash functions.
- @param[out] constants CPU pointer to the constants.
- @param[in] num_keys Debug only: How many keys are in the input.
- @param[in] d_keys Debug only: Device memory array containing the input
- keys.
- @param[in] table_size Debug only: Size of the hash table.
- */
-void GenerateFunctions(const unsigned N, const unsigned num_keys,
- const unsigned *d_keys, const unsigned table_size,
- uint2 *constants);
-
-//! Container for all of the hash functions.
-template <unsigned N> struct Functions {
- //! The constants required for all of the hash functions, including the stash.
- //! Each function requires 2.
- uint2 constants[N];
-
- //! Generate new hash function constants.
- /*! The parameters are only used for debugging and examining the key
- distribution. \param[in] num_keys Debug: Number of keys in the input.
- \param[in] d_keys Debug: Device array of the input keys.
- \param[in] table_size Debug: Size of the hash table.
- */
- void Generate(const unsigned num_keys, const unsigned *d_keys,
- const unsigned table_size) {
- GenerateFunctions(N, num_keys, d_keys, table_size, constants);
- }
-};
-
-//! Computes the value of a hash function for a given key.
-/*! \param[in] constants Constants used by the hash function.
- ! \param[in] key Key being hashed.
- ! \returns The value of the hash function for the key.
- */
-inline __device__ __host__ unsigned hash_function_inner(const uint2 constants,
- const unsigned key) {
-#if 1
- // Fast version.
- return ((constants.x ^ key) + constants.y) % kPrimeDivisor;
-#else
- // Slow version.
- return ((unsigned long long)constants.x * key + constants.y) % kPrimeDivisor;
-#endif
-}
-
-//! Computes the value of a hash function for a given key.
-/*! \param[in] functions All of the constants used by the hash functions.
- ! \param[in] which_function Which hash function is being used.
- ! \param[in] key Key being hashed.
- ! \returns The value of a hash function with a given key.
- */
-template <unsigned kNumHashFunctions>
-TV_HOST_DEVICE_INLINE unsigned
-hash_function(const Functions<kNumHashFunctions> functions,
- const unsigned which_function, const unsigned key) {
- return hash_function_inner(functions.constants[which_function], key);
-}
-
-//! Simple hash function used by the stash.
-TV_HOST_DEVICE_INLINE
-unsigned stash_hash_function(const uint2 stash_constants, const unsigned key) {
- return (stash_constants.x ^ key + stash_constants.y) % kStashSize;
-}
-
-unsigned generate_random_uint32();
-
-}; // namespace cuhash
-
-#endif
diff --git a/include/cuhash/hash_table.cuh b/include/cuhash/hash_table.cuh
deleted file mode 100644
index 29f9b70..0000000
--- a/include/cuhash/hash_table.cuh
+++ /dev/null
@@ -1,275 +0,0 @@
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file hash_table.cuh
- *
- * @brief Implements kernel and __device__ functions for a basic hash table.
- */
-
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH
-#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH
-
-#include "definitions.h"
-#include "hash_table.h"
-#include
-#include
-
-namespace cuhash {
-
-//! Makes an 64-bit Entry out of a key-value pair for the hash table.
-TV_HOST_DEVICE_INLINE Entry make_entry(unsigned key, unsigned value) {
- return (Entry(key) << 32) + value;
-}
-
-//! Returns the key of an Entry.
-TV_HOST_DEVICE_INLINE unsigned get_key(Entry entry) {
- return (unsigned)(entry >> 32);
-}
-
-//! Returns the value of an Entry.
-TV_HOST_DEVICE_INLINE unsigned get_value(Entry entry) {
- return (unsigned)(entry & 0xffffffff);
-}
-
-//! @name Internal
-//! @brief Functions used for building the hash table.
-//! @{
-
-//! Fills the entire array with a specific value.
-template <class T>
-__global__ void clear_table(const unsigned table_size, const T value,
- T *table) {
- unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
- blockIdx.y * blockDim.x * gridDim.x;
- if (thread_index < table_size) {
- table[thread_index] = value;
- }
-}
-
-//! Determine where in the hash table the key could be located.
-template <unsigned kNumHashFunctions>
-__device__ void KeyLocations(const Functions<kNumHashFunctions> constants,
- const unsigned table_size, const unsigned key,
- unsigned locations[kNumHashFunctions]) {
-// Compute all possible locations for the key in the big table.
-#pragma unroll
- for (int i = 0; i < kNumHashFunctions; ++i) {
- locations[i] = hash_function(constants, i, key) % table_size;
- }
-}
-//! @}
-
-/* --------------------------------------------------------------------------
- Retrieval functions.
- -------------------------------------------------------------------------- */
-//! Answers a single query.
-/*! @ingroup PublicInterface
- * @param[in] key Query key
- * @param[in] table_size Size of the hash table
- * @param[in] table The contents of the hash table
- * @param[in] constants The hash functions used to build the table
- * @param[in] stash_constants The hash function used to build the stash
- * @param[in] stash_count The number of items in the stash
- * @param[out] num_probes_required Debug only: The number of probes required
- * to resolve the query.
- * @returns The value of the query key, if the key exists in the table.
- * Otherwise, \ref kNotFound will be returned.
- */
-template <unsigned kNumHashFunctions>
-__device__ unsigned
-retrieve(const unsigned query_key, const unsigned table_size,
- const Entry *table, const Functions<kNumHashFunctions> constants,
- const uint2 stash_constants, const unsigned stash_count,
- unsigned *num_probes_required = NULL) {
- // Identify all of the locations that the key can be located in.
- unsigned locations[kNumHashFunctions];
- KeyLocations(constants, table_size, query_key, locations);
-
- // Check each location until the key is found.
- unsigned num_probes = 1;
- Entry entry = table[locations[0]];
- unsigned key = get_key(entry);
-
-#pragma unroll
- for (unsigned i = 1; i < kNumHashFunctions; ++i) {
- if (key != query_key && key != kNotFound) {
- num_probes++;
- entry = table[locations[i]];
- key = get_key(entry);
- }
- }
-
- // Check the stash.
- if (stash_count && get_key(entry) != query_key) {
- num_probes++;
- const Entry *stash = table + table_size;
- unsigned slot = stash_hash_function(stash_constants, query_key);
- entry = stash[slot];
- }
-
-#ifdef TRACK_ITERATIONS
- if (num_probes_required) {
- *num_probes_required = num_probes;
- }
-#endif
-
- if (get_key(entry) == query_key) {
- return get_value(entry);
- } else {
- return kNotFound;
- }
-}
-
-//! Perform a retrieval from a basic hash table. Each thread manages a single
-//! query.
-template <unsigned kNumHashFunctions>
-__global__ void hash_retrieve(const unsigned n_queries, const unsigned *keys_in,
- const unsigned table_size, const Entry *table,
- const Functions<kNumHashFunctions> constants,
- const uint2 stash_constants,
- const unsigned stash_count, unsigned *values_out,
- unsigned *num_probes_required = NULL) {
- // Get the key.
- unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
- blockIdx.y * blockDim.x * gridDim.x;
- if (thread_index >= n_queries)
- return;
- unsigned key = keys_in[thread_index];
-
- values_out[thread_index] = retrieve(
- key, table_size, table, constants, stash_constants, stash_count,
- (num_probes_required ? num_probes_required + thread_index : NULL));
-}
-
-/* --------------------------------------------------------------------------
- Build a cuckoo hash table.
- -------------------------------------------------------------------------- */
-//! @name Internal
-//! @{
-
-//! Determine where to insert the key next. The hash functions are used in
-//! round-robin order.
-template <unsigned kNumHashFunctions>
-__device__ unsigned
-determine_next_location(const Functions<kNumHashFunctions> constants,
- const unsigned table_size, const unsigned key,
- const unsigned previous_location) {
- // Identify all possible locations for the entry.
- unsigned locations[kNumHashFunctions];
-#pragma unroll
- for (unsigned i = 0; i < kNumHashFunctions; ++i) {
- locations[i] = hash_function(constants, i, key) % table_size;
- }
-
- // Figure out where the item should be inserted next.
- unsigned next_location = locations[0];
-#pragma unroll
- for (int i = kNumHashFunctions - 2; i >= 0; --i) {
- next_location =
- (previous_location == locations[i] ? locations[i + 1] : next_location);
- }
- return next_location;
-}
-
-//! Attempts to insert a single entry into the hash table.
-/*! This process stops after a certain number of iterations. If the thread is
- still holding onto an item because of an eviction, it tries the stash.
- If it fails to enter the stash, it returns false.
- Otherwise, it succeeds and returns true.
- */
-template <unsigned kNumHashFunctions>
-__device__ bool
-insert(const unsigned table_size, const Functions<kNumHashFunctions> constants,
- const uint2 stash_constants, const unsigned max_iteration_attempts,
- Entry *table, unsigned *stash_count, Entry entry,
- unsigned *iterations_used) {
- unsigned key = get_key(entry);
-
- // The key is always inserted into its first slot at the start.
- unsigned location = hash_function(constants, 0, key) % table_size;
-
- // Keep inserting until an empty slot is found or the eviction chain grows too
- // large.
- for (unsigned its = 1; its <= max_iteration_attempts; its++) {
- // Insert the new entry.
- entry = atomicExch(&table[location], entry);
- key = get_key(entry);
-
- // If no key was evicted, we're done.
- if (key == kKeyEmpty) {
- *iterations_used = its;
- break;
- }
-
- // Otherwise, determine where the evicted key will go.
- location = determine_next_location(constants, table_size, key, location);
- }
-
- if (key != kKeyEmpty) {
- // Shove it into the stash.
- unsigned slot = stash_hash_function(stash_constants, key);
- Entry *stash = table + table_size;
- Entry replaced_entry = atomicCAS(stash + slot, kEntryEmpty, entry);
- if (replaced_entry != kEntryEmpty) {
- return false;
- } else {
- atomicAdd(stash_count, 1);
- }
- }
-
- return true;
-}
-
-// Build a basic hash table, using one big table.
-template <unsigned kNumHashFunctions>
-__global__ void CuckooHash(const unsigned n_entries, const unsigned *keys,
- const unsigned *values, const unsigned table_size,
- const Functions<kNumHashFunctions> constants,
- const unsigned max_iteration_attempts, Entry *table,
- uint2 stash_constants, unsigned *stash_count,
- unsigned *failures,
- unsigned *iterations_taken = nullptr) {
- // Check if this thread has an item and if any previous threads failed.
- unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
- blockIdx.y * blockDim.x * gridDim.x;
- if (thread_index >= n_entries || *failures)
- return;
- Entry entry = make_entry(keys[thread_index], values[thread_index]);
-
- unsigned iterations = 0;
- bool success = insert(
- table_size, constants, stash_constants, max_iteration_attempts, table,
- stash_count, entry, &iterations);
-
- if (success == false) {
- // The eviction chain grew too large. Report failure.
-#ifdef COUNT_UNINSERTED
- atomicAdd(failures, 1);
-#else
- *failures = 1;
-#endif
- }
-
-#ifdef TRACK_ITERATIONS
- iterations_taken[thread_index] = iterations;
-#endif
-}
-//! @}
-
-}; // namespace cuhash
-
-#endif
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
diff --git a/include/cuhash/hash_table.h b/include/cuhash/hash_table.h
deleted file mode 100644
index 055e08c..0000000
--- a/include/cuhash/hash_table.h
+++ /dev/null
@@ -1,228 +0,0 @@
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file hash_table.h
- *
- * @brief Header for a basic hash table that stores one value per key.
- */
-
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
-#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
-
-#include "definitions.h"
-#include "hash_functions.h"
-
-#include
-
-/** \addtogroup cudpp_app
- * @{
- */
-
-/** \addtogroup cudpp_hash_data_structures
- * @{
- */
-
-/* --------------------------------------------------------------------------
- Doxygen definitions.
- -------------------------------------------------------------------------- */
-/*! @namespace CudaHT
- * @brief Encapsulates the hash table library.
- */
-
-/*! @namespace CuckooHashing
- * @brief Encapsulates the cuckoo hash table that uses stashes.
- */
-
-/* -------------------------------------------------------------------------
- Hash table code.
- ------------------------------------------------------------------------- */
-namespace cuhash {
-
-//! Compute how many thread blocks are required for the given number of threads.
-dim3 ComputeGridDim(unsigned threads);
-
-//! Compute how long an eviction chain is allowed to become for a given input
-//! size.
-/*! \param[in] num_keys Number of keys in the input.
- * \param[in] table_size Number of slots in the hash table.
- * \param[in] num_functions Number of hash functions being used.
- * \returns The number of iterations that should be allowed.
- *
- * The latter two parameters are only needed when using an empirical
- * formula for computing the chain length.
- */
-unsigned ComputeMaxIterations(const unsigned num_keys,
- const unsigned table_size,
- const unsigned num_functions);
-
-//! Basic hash table that stores one value for each key.
-/*! The input consists of two unsigned arrays of keys and values.
- * None of the keys are expected to be repeated.
- *
- * @todo Templatize the interface without forcing the header file to
- * have CUDA calls.
- * @ingroup cudpp_app
- */
-class HashTable {
-public:
- HashTable();
-
- virtual ~HashTable() { Release(); }
-
- //! Initialize the hash table's memory. Must be called before \ref
- //! Build() and after the random number generator has been seeded.
- /*! @param[in] max_input_size Largest expected number of items in the input.
- * @param[in] space_usage Size of the hash table relative to the
- * input. Bigger tables are faster to build
- * and retrieve from.
- * @param[in] num_functions Number of hash functions to use. May be
- * 2-5. More hash functions make it easier
- * to build the table, but increase
- * retrieval times.
- * @returns Whether the hash table was initialized successfully (true)
- * or not (false).
- *
- * The minimum space usage is dependent on the number of functions
- * being used; for two through five functions, the minimum space
- * usage is 2.1, 1.1, 1.03, and 1.02 respectively.
- */
- virtual bool Initialize(const unsigned max_input_size,
- const float space_usage = 1.25,
- const unsigned num_functions = 4);
-
- //! Free all memory.
- virtual void Release();
-
- //! Build the hash table.
- /*! @param[in] input_size Number of key-value pairs being inserted.
- * @param[in] d_keys Device memory array containing all of the input
- * keys.
- * @param[in] d_vals Device memory array containing the keys' values.
- * @returns Whether the hash table was built successfully (true) or
- * not (false).
- *
- * Several attempts are allowed to build the hash table in case of failure.
- * The input keys are expected to be completely unique.
- * To reduce the chance of a failure, increase the space usage or number of
- * functions.
- * Keys are not allowed to be equal to cuhash::kKeyEmpty.
- */
- virtual bool Build(const unsigned input_size, const unsigned *d_keys,
- const unsigned *d_vals);
-
- //! Query the hash table.
- /*! @param[in] n_queries Number of keys in the query set.
- * @param[in] d_query_keys Device memory array containing all of
- * the query keys.
- * @param[in] d_query_results Values for the query keys.
- *
- * kNotFound is returned for any query key that failed to be found
- * in the table.
- */
- virtual void Retrieve(const unsigned n_queries, const unsigned *d_query_keys,
- unsigned *d_query_results);
-
- //! @name Accessors
- /// @brief Mainly needed to use the __device__ CudaHT::retrieve()
- /// function directly.
- /// @{
-
- //! Returns how many slots the hash table has.
- inline unsigned get_table_size() const { return table_size_; }
-
- //! Returns how many items are stored in the stash.
- inline unsigned get_stash_count() const { return stash_count_; }
-
- //! Returns the constants used by the stash.
- inline uint2 get_stash_constants() const { return stash_constants_; }
-
- //! Returns the hash table contents.
- inline const Entry *get_contents() const { return d_contents_; }
-
- //! Returns the number of hash functions being used.
- inline unsigned get_num_hash_functions() const { return num_hash_functions_; }
-
- //! When using two hash functions, returns the constants.
- inline Functions<2> get_constants_2() const { return constants_2_; }
-
- //! When using three hash functions, returns the constants.
- inline Functions<3> get_constants_3() const { return constants_3_; }
-
- //! When using four hash functions, returns the constants.
- inline Functions<4> get_constants_4() const { return constants_4_; }
-
- //! When using five hash functions, returns the constants.
- inline Functions<5> get_constants_5() const { return constants_5_; }
-
- /// @}
- inline Entry *data() { return d_contents_; }
- inline const Entry *data() const { return d_contents_; }
-
-protected:
- unsigned table_size_; //!< Size of the hash table.
- unsigned num_hash_functions_; //!< Number of hash functions being used.
- Entry *d_contents_; //!< Device memory: The hash table contents. The stash is
- //!< stored at the end.
- unsigned stash_count_; //!< Number of key-value pairs currently stored.
- uint2 stash_constants_; //!< Hash function constants for the stash.
-
- Functions<2> constants_2_; //!< Constants for a set of two hash functions.
- Functions<3> constants_3_; //!< Constants for a set of three hash functions.
- Functions<4> constants_4_; //!< Constants for a set of four hash functions.
- Functions<5> constants_5_; //!< Constants for a set of five hash functions.
-
- unsigned *d_failures_; //!< Device memory: General use error flag.
-};
-
-/*! @name Internal
- * @{
- */
-namespace CUDAWrapper {
-//! Fills a 64-bit array with a particular value.
-void ClearTable(const unsigned slots_in_table, const Entry fill_value,
- Entry *d_array);
-
-//! Calls the Cuckoo Hash construction kernel.
-void CallCuckooHash(const unsigned n_entries, const unsigned num_hash_functions,
- const unsigned *d_keys, const unsigned *d_values,
- const unsigned table_size, const Functions<2> constants_2,
- const Functions<3> constants_3,
- const Functions<4> constants_4,
- const Functions<5> constants_5,
- const unsigned max_iteration_attempts, Entry *d_contents,
- uint2 stash_constants, unsigned *d_stash_count,
- unsigned *d_failures, unsigned *d_iterations_taken);
-
-//! Calls the kernel that performs retrievals.
-void CallHashRetrieve(const unsigned n_queries,
- const unsigned num_hash_functions,
- const unsigned *keys_in, const unsigned table_size,
- const Entry *table, const Functions<2> constants_2,
- const Functions<3> constants_3,
- const Functions<4> constants_4,
- const Functions<5> constants_5,
- const uint2 stash_constants, const unsigned stash_count,
- unsigned *values_out);
-}; // namespace CUDAWrapper
-/// @}
-
-}; // namespace cuhash
-
-/** @} */ // end hash table data structures
-/** @} */ // end cudpp_app
-
-#endif
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
diff --git a/include/paramsgrid.h b/include/paramsgrid.h
deleted file mode 100644
index c978dfe..0000000
--- a/include/paramsgrid.h
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file is used for c++ unit test, but pytorch jit ops don't support c++
-// debug build.
-
-#ifndef PARAMS_GRID_H_
-#define PARAMS_GRID_H_
-#include <tuple>
-#include <vector>
-
-namespace detail {
-template <class T> int getTotalSize(std::vector<T> arg) { return arg.size(); }
-
-template <class T, class... TArgs>
-int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
- return arg.size() * getTotalSize(args...);
-}
-template <class T> int getSize(std::vector<T> arg) { return arg.size(); }
-
-template <int Idx, class TT, class T>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
- std::get<Idx>(src) = arg[counter[Idx]];
-}
-
-template <int Idx, class TT, class T, class... TArgs>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
- std::vector<TArgs> &... args) {
- std::get<Idx>(src) = arg[counter[Idx]];
- assigner<Idx + 1>(src, counter, args...);
-}
-} // namespace detail
-template <class... TArgs>
-std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
- int length = detail::getTotalSize(args...);
- std::vector<int> sizes = {detail::getSize(args)...};
- int size = sizes.size();
-
- std::vector<std::tuple<TArgs...>> params(length);
- std::vector<int> counter(size);
- for (int i = 0; i < length; ++i) {
- detail::assigner<0>(params[i], counter, args...);
- counter[size - 1] += 1;
- for (int c = size - 1; c >= 0; --c) {
- if (counter[c] == sizes[c] && c > 0) {
- counter[c - 1] += 1;
- counter[c] = 0;
- }
- }
- }
- return params;
-}
-
-#endif
\ No newline at end of file
diff --git a/include/spconv/box_iou.h b/include/spconv/box_iou.h
deleted file mode 100644
index 15ceee8..0000000
--- a/include/spconv/box_iou.h
+++ /dev/null
@@ -1,156 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef BOX_IOU_H
-#define BOX_IOU_H
-
-#include <boost/geometry.hpp>
-// must include pybind11/eigen.h if using eigen matrix as arguments.
-#include <pybind11/eigen.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-
-namespace spconv {
-// #include "voxelnet/core/cc/pybind11_helper.h"
-namespace py = pybind11;
-using namespace pybind11::literals;
-template
-inline py::array_t constant(ShapeContainer shape, DType value) {
- // create ROWMAJOR array.
- py::array_t array(shape);
- std::fill(array.mutable_data(), array.mutable_data() + array.size(), value);
- return array;
-}
-
-template
-inline py::array_t zeros(std::vector shape) {
- return constant>(shape, 0);
-}
-
-template <typename DType>
-py::array_t<DType>
-rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
-          py::array_t<DType> standup_iou, DType standup_thresh) {
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
- DType inter_area, union_area;
- auto box_corners_r = box_corners.template unchecked<3>();
- auto qbox_corners_r = qbox_corners.template unchecked<3>();
- auto standup_iou_r = standup_iou.template unchecked<2>();
- auto N = box_corners_r.shape(0);
- auto K = qbox_corners_r.shape(0);
-  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
- auto overlaps_rw = overlaps.template mutable_unchecked<2>();
- if (N == 0 || K == 0) {
- return overlaps;
- }
- for (int k = 0; k < K; ++k) {
- for (int n = 0; n < N; ++n) {
- if (standup_iou_r(n, k) <= standup_thresh)
- continue;
- bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
- bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
- bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
- bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
- bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-
- bg::intersection(poly, qpoly, poly_inter);
-
- if (!poly_inter.empty()) {
- inter_area = bg::area(poly_inter.front());
- bg::union_(poly, qpoly, poly_union);
- if (!poly_union.empty()) {
- union_area = bg::area(poly_union.front());
- overlaps_rw(n, k) = inter_area / union_area;
- }
- poly_union.clear();
- }
- poly.clear();
- qpoly.clear();
- poly_inter.clear();
- }
- }
- return overlaps;
-}
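-
-// Note: standup_iou is the IoU of the axis-aligned ("standup") bounding
-// boxes; pairs at or below standup_thresh are skipped, so the expensive
-// boost::geometry polygon clipping above only runs for box pairs that can
-// plausibly overlap.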
-
-template <typename DType>
-py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
-                                      py::array_t<DType> qbox_corners,
-                                      py::array_t<DType> standup_iou,
-                                      DType standup_thresh) {
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
- DType inter_area, union_area;
- auto box_corners_r = box_corners.template unchecked<3>();
- auto qbox_corners_r = qbox_corners.template unchecked<3>();
- auto standup_iou_r = standup_iou.template unchecked<2>();
- auto N = box_corners_r.shape(0);
- auto K = qbox_corners_r.shape(0);
-  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
- auto overlaps_rw = overlaps.template mutable_unchecked<2>();
- if (N == 0 || K == 0) {
- return overlaps;
- }
- for (int k = 0; k < K; ++k) {
- for (int n = 0; n < N; ++n) {
- if (standup_iou_r(n, k) <= standup_thresh)
- continue;
- bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
- bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
- bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
- bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
- bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-
- bg::intersection(poly, qpoly, poly_inter);
-
- if (!poly_inter.empty()) {
- inter_area = bg::area(poly_inter.front());
- overlaps_rw(n, k) = inter_area;
- }
- poly.clear();
- qpoly.clear();
- poly_inter.clear();
- }
- }
- return overlaps;
-}
-
-} // namespace spconv
-#endif
\ No newline at end of file
diff --git a/include/spconv/cublas_gemm.h b/include/spconv/cublas_gemm.h
deleted file mode 100644
index 117a127..0000000
--- a/include/spconv/cublas_gemm.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#pragma once
-#include <cublas_v2.h>
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-
-template <typename T>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
- cublasOperation_t transb, int m, int n, int k,
- const T *alpha, const T *A, int lda, const T *B,
- int ldb, const T *beta, T *C, int ldc);
-
-template <typename T>
-cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
- cublasOperation_t transb, int m, int n, int k,
- const T *alpha, const T *A, int lda, const T *B,
- int ldb, const T *beta, T *C, int ldc) {
- return cublasTgemm(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda,
- beta, C, ldc);
-}
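-
-// Why this works: cuBLAS is column-major, and a row-major (m, n) matrix is
-// bit-identical to a column-major (n, m) matrix. Computing op(B) * op(A)
-// with m and n swapped therefore produces C^T in column-major layout, which
-// is exactly C in row-major layout.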
-
-template <typename T> inline T constant_scalar(float data) { return T(data); }
-
-template <typename T>
-cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
-                    const tv::TensorView<const T> A,
-                    const tv::TensorView<const T> B, tv::TensorView<T> C) {
- TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
- TV_ASSERT_RT_ERR(B.ndim() == 2, "error");
- auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
- auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
- int m = transa ? A.dim(1) : A.dim(0);
- int n = transb ? B.dim(0) : B.dim(1);
- int ka = transa ? A.dim(0) : A.dim(1);
- int kb = transb ? B.dim(1) : B.dim(0);
- int lda = transa ? m : ka;
- int ldb = transb ? ka : n;
- int ldc = n;
- TV_ASSERT_RT_ERR(ka == kb, "error");
-  T alpha = constant_scalar<T>(1);
-  T beta = constant_scalar<T>(0);
- return cublasTgemmRow(handle, transa_cublas, transb_cublas, m, n, ka,
- &alpha, A.data(), lda, B.data(), ldb, &beta,
- C.data(), ldc);
-}
-
-} // namespace spconv
diff --git a/include/spconv/fused_conv.cu.h b/include/spconv/fused_conv.cu.h
deleted file mode 100644
index 15533d1..0000000
--- a/include/spconv/fused_conv.cu.h
+++ /dev/null
@@ -1,629 +0,0 @@
-
-/*
-BSD License
-
-For SparseConvNet software
-
-Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name Facebook nor the names of its contributors may be used to
- endorse or promote products derived from this software without specific
- prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define TACC double
-
-template <typename T, int32_t K, int32_t V>
-__global__ void
-dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride) {
- // nHot must be a multiple of K!!
-
- // Input x Weight -> Output
- // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
- // K is a multiple of V,
-
- // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
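-  //
-  // Example (hypothetical sizes): K=64, V=16 gives blockDim=(64, 4); each
-  // thread accumulates V=16 partial outputs in registers while K x K tiles
-  // of the input and weights are staged through shared memory
-  // (Volkov-style register blocking).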
-
- int32_t M = input_nPlanes / K;
- // N = gridDim.y == output_nPlanes/K
- int32_t n = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += g * input_nPlanes;
- outFeatures += n * K + g * output_nPlanes;
- w += n * K + g * input_nPlanes * output_nPlanes;
-
- TACC O[V];
- __shared__ T W[K][K];
- __shared__ T I[K][K];
- int32_t R0[V];
- int32_t R1[V];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
-
- for (int32_t m = 0; m < M; m++) {
-// Read w
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- R0[v] = rulesIn[s + ty[v]];
- R1[v] = rulesOut[s + ty[v]];
- }
- __syncthreads();
-
-// Read input, reset O[]
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
- O[v] = 0;
- }
- __syncthreads();
-
-#pragma unroll
- for (int32_t k = 0; k < K; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- O[v] += I[ty[v]][k] * W[k][tx];
-
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- O[v] += outFeatures[R1[v] * output_stride + tx];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- outFeatures[R1[v] * output_stride + tx] = O[v];
- __syncthreads();
- }
- w += K * output_nPlanes;
- inFeatures += K;
- }
-}
-template <typename T, int32_t K, int32_t V>
-__global__ void
-dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride) {
- // Input x Weight -> Output
- // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
- // K is a multiple of V,
-
- // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-
- int32_t M = input_nPlanes / K;
- // N = gridDim.y == output_nPlanes/K
- int32_t n = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += g * input_nPlanes;
- outFeatures += n * K + g * output_nPlanes;
- w += n * K + g * input_nPlanes * output_nPlanes;
-
- TACC O[V];
- __shared__ T W[K][K];
- __shared__ T I[K][K];
- int32_t R0[V];
- int32_t R1[V];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
-
- for (int32_t m = 0; m < M; m++) {
-// Read w
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (s + ty[v] < nHot) {
- R0[v] = rulesIn[s + ty[v]];
- R1[v] = rulesOut[s + ty[v]];
- }
- }
- __syncthreads();
-
-// Read input, reset O[]
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (s + ty[v] < nHot)
- I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
- O[v] = 0;
- }
- __syncthreads();
-
-#pragma unroll
- for (int32_t k = 0; k < K; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- O[v] += I[ty[v]][k] * W[k][tx];
-
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (s + ty[v] < nHot)
- O[v] += outFeatures[R1[v] * output_stride + tx];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (s + ty[v] < nHot)
- outFeatures[R1[v] * output_stride + tx] = O[v];
- __syncthreads();
- }
- w += K * output_nPlanes;
- inFeatures += K;
- }
-}
-
-#define FOO(T, K, V) \
- { \
- if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
- int32_t o = (nHot / K) * K; \
- if (o >= K) \
-        dConvolution_KMxKN_forwardA<T, K, V>                                  \
-            <<<dim3(std::min(o / K, (int32_t)512), output_nPlanes / K,        \
-                    nGroups),                                                 \
-               dim3(K, K / V), 0, s>>>(                                       \
- inFeatures, outFeatures, w, rulesIn, rulesOut, o, \
- input_nPlanes, input_stride, output_nPlanes, output_stride); \
- if (nHot > o) \
-        dConvolution_KMxKN_forwardB<T, K, V>                                  \
-            <<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \
- inFeatures, outFeatures, w, rulesIn + o, rulesOut + o, \
- nHot - o, input_nPlanes, input_stride, output_nPlanes, \
- output_stride); \
- return; \
- } \
- }
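-// Callers invoke FOO with progressively smaller tile sizes until one divides
-// the plane counts; kernel A covers the prefix of nHot that is a multiple of
-// K with no bounds checks, and kernel B handles the remaining tail rows with
-// per-row bounds checks.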
-template <typename T>
-void dConvolution_forward(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride,
- int32_t nGroups) {
- FOO(T, 64, 16)
- FOO(T, 32, 8)
- FOO(T, 16, 4)
- FOO(T, 8, 2)
- assert(false);
-}
-template <>
-void dConvolution_forward<double>(cudaStream_t s, double *inFeatures,
- double *outFeatures, double *w,
- int32_t *rulesIn, int32_t *rulesOut,
- int32_t nHot, int32_t input_nPlanes,
- int32_t input_stride, int32_t output_nPlanes,
- int32_t output_stride, int32_t nGroups) {
- FOO(double, 32, 8)
- FOO(double, 16, 4)
- FOO(double, 8, 2)
- assert(false);
-}
-#undef FOO
-// dOutput x W^T -> dInput and
-// Input^T x dOutput -> dW
-// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
-template <typename T, int32_t K, int32_t V>
-__global__ void dConvolution_KMxKN_backward_dW_A(
- T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
- int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
- // M = gridDim.y == input_nPlanes / K
- int32_t N = output_nPlanes / K;
- int32_t m = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += m * K + g * input_nPlanes;
- dInFeatures += m * K + g * input_nPlanes;
- dOutFeatures += g * output_nPlanes;
- w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- TACC dI[V];
- TACC dW[V];
- __shared__ T I[K][K];
- __shared__ T dO[K][K];
- __shared__ T W[K][K];
- int32_t R0[V];
- int32_t R1[V];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
- for (int32_t n = 0; n < N; n++) {
-// Read w, reset dW
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
- dW[v] = 0;
- }
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- R0[v] = rulesIn[s + ty[v]];
- R1[v] = rulesOut[s + ty[v]];
- dI[v] = 0;
- }
- __syncthreads();
-// Read input and dOutput
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
- dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
- }
- __syncthreads();
-#pragma unroll
- for (int32_t k = 0; k < K; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- dI[v] += dO[ty[v]][k] * W[tx][k];
- dW[v] += I[k][ty[v]] * dO[k][tx];
- }
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- dI[v] += dInFeatures[R0[v] * input_stride + tx];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- dInFeatures[R0[v] * input_stride + tx] = dI[v];
- __syncthreads();
- }
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
- w += K;
- dw += K;
- dOutFeatures += K;
- }
-}
-// dOutput x W^T -> dInput and
-// Input^T x dOutput -> dW
-// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
-template <typename T, int32_t K, int32_t V>
-__global__ void dConvolution_KMxKN_backward_dW_B(
- T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
- int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
- // M = gridDim.y == input_nPlanes / K
- int32_t N = output_nPlanes / K;
- int32_t m = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += m * K + g * input_nPlanes;
- dInFeatures += m * K + g * input_nPlanes;
- dOutFeatures += g * output_nPlanes;
- w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- TACC dI[V];
- TACC dW[V];
- __shared__ T I[K][K];
- __shared__ T dO[K][K];
- __shared__ T W[K][K];
- int32_t R0[V];
- int32_t R1[V];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
- for (int32_t n = 0; n < N; n++) {
-// Read w, reset dW
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
- dW[v] = 0;
- }
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (s + ty[v] < nHot) {
- R0[v] = rulesIn[s + ty[v]];
- R1[v] = rulesOut[s + ty[v]];
- }
- dI[v] = 0;
- }
- __syncthreads();
-// Read input and dOutput
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (s + ty[v] < nHot) {
- I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
- dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
- } else {
- I[ty[v]][tx] = 0;
- dO[ty[v]][tx] = 0;
- }
- __syncthreads();
-#pragma unroll
- for (int32_t k = 0; k < K; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- dI[v] += dO[ty[v]][k] * W[tx][k];
- dW[v] += I[k][ty[v]] * dO[k][tx];
- }
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (s + ty[v] < nHot)
- dI[v] += dInFeatures[R0[v] * input_stride + tx];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (s + ty[v] < nHot)
- dInFeatures[R0[v] * input_stride + tx] = dI[v];
- __syncthreads();
- }
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
- w += K;
- dw += K;
- dOutFeatures += K;
- }
-}
-#define FOO(T, K, V) \
- { \
- if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
- int32_t o = (nHot / K) * K; \
- if (o >= K) \
-        dConvolution_KMxKN_backward_dW_A<T, K, V>                             \
-            <<<dim3(std::min(o / K, (int32_t)512), input_nPlanes / K,         \
-                    nGroups),                                                 \
-               dim3(K, K / V), 0, s>>>(inFeatures, dInFeatures, dOutFeatures, \
- w, dw, rulesIn, rulesOut, o, \
- input_nPlanes, input_stride, \
- output_nPlanes, output_stride); \
- if (nHot > o) \
-        dConvolution_KMxKN_backward_dW_B<T, K, V>                             \
-            <<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>(  \
- inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn + o, \
- rulesOut + o, nHot - o, input_nPlanes, input_stride, \
- output_nPlanes, output_stride); \
- return; \
- } \
- }
-template <typename T>
-void dConvolution_backward_dW(cudaStream_t s, T *inFeatures, T *dInFeatures,
- T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
- int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride,
- int32_t nGroups) {
- FOO(T, 32, 8)
- FOO(T, 16, 4)
- FOO(T, 8, 2)
- assert(false);
-}
-#undef FOO
-template <typename T, int32_t K, int32_t V>
-__global__ void
-dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride) {
- // Input x Weight -> Output
- // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
- // K is a multiple of V,
- // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
- // - parallel over N,nHot - loop over M
- int32_t M = (input_nPlanes + K - 1) / K;
- // N = gridDim.y ~ output_nPlanes/K
- int32_t n = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += g * input_nPlanes;
- outFeatures += n * K + g * output_nPlanes;
- w += n * K + g * input_nPlanes * output_nPlanes;
- int32_t KO = min(K, output_nPlanes - K * n);
- TACC O[V];
- __shared__ T W[K][K];
- __shared__ T I[K][K];
- __shared__ int32_t R[K * 2];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
- for (int32_t m = 0; m < M; m++) {
- int32_t KI = min(K, input_nPlanes - K * m);
-// Read w
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (ty[v] < KI and tx < KO)
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-// Read rules for K input/output pairs
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (ty[v] < 1) {
- if (s + tx < nHot) {
- R[2 * tx] = rulesIn[s + tx];
- R[2 * tx + 1] = rulesOut[s + tx];
- }
- // R[q] = rules[2 * s + q];
- }
- }
- __syncthreads();
-// Read input, reset O[]
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (tx < KI and s + ty[v] < nHot)
- I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
- O[v] = 0;
- }
- __syncthreads();
-#pragma unroll
- for (int32_t k = 0; k < KI; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- O[v] += I[ty[v]][k] * W[k][tx];
- __syncthreads();
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (tx < KO and s + ty[v] < nHot)
- outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
- __syncthreads();
- }
- w += K * output_nPlanes;
- inFeatures += K;
- }
-}
-// dOutput x W^T -> dInput and
-// Input^T x dOutput -> dW
-// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
-template <typename T, int32_t K, int32_t V>
-__global__ void dConvolution_KMxKN_backward_dW2(
- T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
- int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
- // M = gridDim.y == input_nPlanes / K
- int32_t N = (output_nPlanes + K - 1) / K;
- int32_t m = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += m * K + g * input_nPlanes;
- dInFeatures += m * K + g * input_nPlanes;
- dOutFeatures += g * output_nPlanes;
- w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- int32_t KI = min(K, input_nPlanes - K * m);
- TACC dI[V];
- TACC dW[V];
- __shared__ T I[K][K];
- __shared__ T dO[K][K];
- __shared__ T W[K][K];
- __shared__ int32_t R[K * 2];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
- for (int32_t n = 0; n < N; n++) {
- int32_t KO = min(K, output_nPlanes - K * n);
-// Read w, reset dW
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (ty[v] < KI and tx < KO)
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
- dW[v] = 0;
- }
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-// Read rules for K input/output pairs, reset dI[]
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (ty[v] < 1) {
- if (s + tx < nHot) {
- R[2 * tx] = rulesIn[s + tx];
- R[2 * tx + 1] = rulesOut[s + tx];
- }
- // R[q] = rules[2 * s + q];
- }
- dI[v] = 0;
- }
- __syncthreads();
-// Read input and dOutput
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (tx < KI and s + ty[v] < nHot)
- I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
- else
- I[ty[v]][tx] = 0;
- if (tx < KO and s + ty[v] < nHot)
- dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
- else
- dO[ty[v]][tx] = 0;
- }
- __syncthreads();
-#pragma unroll
- for (int32_t k = 0; k < KO; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- dI[v] += dO[ty[v]][k] * W[tx][k];
-#pragma unroll
- for (int32_t k = 0; k < K; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- dW[v] += I[k][ty[v]] * dO[k][tx];
- __syncthreads();
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (tx < KI and s + ty[v] < nHot)
- dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
- __syncthreads();
- }
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (ty[v] < KI and tx < KO)
- atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
- w += K;
- dw += K;
- dOutFeatures += K;
- }
-}
-template <typename T>
-void dConvolution_forward2(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride,
- int32_t nGroups) {
- int32_t c = input_nPlanes * output_nPlanes * nGroups;
- if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
- const int32_t K = 16;
- const int32_t V = 4;
-    dConvolution_KMxKN_forward2<T, K, V>
-        <<<dim3(std::min(nHot / K + 1, (int32_t)512),
-                (output_nPlanes + K - 1) / K, nGroups),
-           dim3(K, K / V), 0, s>>>(inFeatures, outFeatures, w, rulesIn,
-                                   rulesOut, nHot,
- input_nPlanes, input_stride, output_nPlanes, output_stride);
-
- } else {
- dConvolution_forward(s, inFeatures, outFeatures, w, rulesIn, rulesOut, nHot,
- input_nPlanes, input_stride, output_nPlanes,
- output_stride, nGroups);
- }
-}
-template <typename T>
-void dConvolution_backward_dW2(cudaStream_t s, T *inFeatures, T *dInFeatures,
- T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
- int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride,
- int32_t nGroups) {
- int32_t c = input_nPlanes * output_nPlanes * nGroups;
- if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
- const int32_t K = 16;
- const int32_t V = 4;
-    dConvolution_KMxKN_backward_dW2<T, K, V>
-        <<<dim3(std::min(nHot / K + 1, (int32_t)512),
-                (input_nPlanes + K - 1) / K, nGroups),
-           dim3(K, K / V), 0,
-           s>>>(inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn, rulesOut,
- nHot, input_nPlanes, input_stride, output_nPlanes,
- output_stride);
- } else {
- dConvolution_backward_dW(s, inFeatures, dInFeatures, dOutFeatures, w, dw,
- rulesIn, rulesOut, nHot, input_nPlanes,
- input_stride, output_nPlanes, output_stride,
- nGroups);
- }
-}
-#undef TACC
\ No newline at end of file
diff --git a/include/spconv/fused_conv.h b/include/spconv/fused_conv.h
deleted file mode 100644
index a02d569..0000000
--- a/include/spconv/fused_conv.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-namespace spconv {
-
-enum FusedConvAlgo { kFSparseConvNet, kFMinkowskiEngine };
-using all_fused_conv_algos_t =
-    tv::mp_list_c<FusedConvAlgo, kFSparseConvNet, kFMinkowskiEngine>;
-
-void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
- torch::Tensor filters, torch::Tensor indicesIn,
- torch::Tensor indicesOut, int nHot);
-
-void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
- torch::Tensor dout, torch::Tensor filters,
- torch::Tensor dfilters, torch::Tensor indicesIn,
- torch::Tensor indicesOut, int nHot);
-
-void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
- torch::Tensor filters, torch::Tensor indicesIn,
- torch::Tensor indicesOut, int nHot);
-void fused_conv_backward_cuda_minkowski(torch::Tensor features,
- torch::Tensor din, torch::Tensor dout,
- torch::Tensor filters,
- torch::Tensor dfilters,
- torch::Tensor indicesIn,
- torch::Tensor indicesOut, int nHot);
-
-template <FusedConvAlgo A> struct FusedConvDispatch;
-
-template <> struct FusedConvDispatch<kFSparseConvNet> {
- constexpr static auto *fwd = fused_conv_cuda;
- constexpr static auto *bwd = fused_conv_backward_cuda;
-};
-
-template <> struct FusedConvDispatch<kFMinkowskiEngine> {
- constexpr static auto *fwd = fused_conv_cuda_minkowski;
- constexpr static auto *bwd = fused_conv_backward_cuda_minkowski;
-};
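-
-// Usage sketch: the algorithm enum selects a kernel pair at compile time,
-// e.g. FusedConvDispatch<kFSparseConvNet>::fwd(output, features, filters,
-// indicesIn, indicesOut, nHot).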
-
-} // namespace spconv
diff --git a/include/spconv/fused_spconv_ops.h b/include/spconv/fused_spconv_ops.h
deleted file mode 100644
index cb87e67..0000000
--- a/include/spconv/fused_spconv_ops.h
+++ /dev/null
@@ -1,126 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSED_SPARSE_CONV_OP_H_
-#define FUSED_SPARSE_CONV_OP_H_
-
-#include <cuda_runtime_api.h>
-#include <spconv/reordering.h>
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's doc says it only supports int64, so we need to convert to int32.
-
-torch::Tensor
-fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
- torch::Tensor bias, torch::Tensor indicePairs,
- torch::Tensor indiceNum, int64_t numActOut,
- int64_t _inverse, int64_t _subM) {
- bool subM = _subM != 0;
- bool inverse = _inverse != 0;
- auto device = features.device().type();
- auto ndim = filters.dim() - 2;
- auto kernelVolume = indicePairs.size(0);
- auto numInPlanes = features.size(1);
- auto numOutPlanes = filters.size(ndim + 1);
- auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
- int indicePairMaxSize = *indicePairMaxSizeIter;
-
-  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
-                                      indicePairNumCpu.data_ptr<int>() + kernelVolume);
-    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-
-    auto indicePairVecMaxSizeIter = std::max_element(
-        indicePairNumVec.begin(), indicePairNumVec.end());
-    indicePairMaxSize = *indicePairVecMaxSizeIter;
-  }*/
-
- auto options =
- torch::TensorOptions().dtype(features.dtype()).device(features.device());
- // auto indicePairOptions =
- // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-
- torch::Tensor output =
- torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
- torch::Tensor inputBuffer =
- torch::zeros({indicePairMaxSize, numInPlanes}, options);
- torch::Tensor outputBuffer =
- torch::zeros({indicePairMaxSize, numOutPlanes}, options);
- filters = filters.view({-1, numInPlanes, numOutPlanes});
-  if (subM) { // the center index of subm conv doesn't need gather and
-              // scatter-add.
- torch::mm_out(output, features, filters[indicePairMaxOffset]);
- }
- double totalGatherTime = 0;
- double totalGEMMTime = 0;
- double totalSAddTime = 0;
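-  // One gather -> GEMM -> scatter-add pass per kernel offset: gather the
-  // nHot active input rows into inputBuffer, multiply them by that offset's
-  // filter slice, then scatter-add the resulting rows into output.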
- for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
- if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
- continue;
- }
- // auto timer = spconv::CudaContextTimer<>();
- auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
- {nHot, numOutPlanes}, options);
- auto inputBufferBlob =
- torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
-
- if (device == torch::kCPU) {
- sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
- }
-#ifdef TV_CUDA
- else if (device == torch::kCUDA) {
- sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
- }
-#endif
- else {
- TV_ASSERT_INVALID_ARG(false, "unknown device type");
- }
-
- // totalGatherTime += timer.report() / 1000.0;
- torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
- // totalGEMMTime += timer.report() / 1000.0;
-
- if (device == torch::kCPU) {
- sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse],
- nHot);
- }
-#ifdef TV_CUDA
- else if (device == torch::kCUDA) {
- sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse],
- nHot);
- }
-#endif
- else {
- TV_ASSERT_INVALID_ARG(false, "unknown device type");
- }
-
- // totalSAddTime += timer.report() / 1000.0;
- }
- // std::cout << "gather time " << totalGatherTime << std::endl;
- // std::cout << "gemm time " << totalGEMMTime << std::endl;
- // std::cout << "scatteradd time " << totalSAddTime << std::endl;
- return output;
-}
-} // namespace spconv
-
-#endif
\ No newline at end of file
diff --git a/include/spconv/geometry.h b/include/spconv/geometry.h
deleted file mode 100644
index d6bf3de..0000000
--- a/include/spconv/geometry.h
+++ /dev/null
@@ -1,183 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPCONV_GEOMETRY_H_
-#define SPCONV_GEOMETRY_H_
-
-#include <tensorview/tensorview.h>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <type_traits>
-namespace spconv {
-
-namespace detail {
-
-template <typename T> struct ToUnsigned;
-
-template <> struct ToUnsigned<int32_t> { using type = uint32_t; };
-
-template <> struct ToUnsigned<int64_t> { using type = uint64_t; };
-
-template <typename T> struct FNVInternal;
-template <> struct FNVInternal<int32_t> {
-  constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
-  constexpr static uint32_t prime = 0x01000193;
-};
-
-template <> struct FNVInternal<int64_t> {
-  constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
-  constexpr static uint64_t prime = 0x100000001b3;
-};
-
-} // namespace detail
-template <typename T>
-using to_unsigned_t = typename detail::ToUnsigned<std::decay_t<T>>::type;
-
-template <typename T> struct FNV1a : detail::FNVInternal<T> {
-  std::size_t operator()(const T *data, std::size_t size) {
-    to_unsigned_t<T> hash = detail::FNVInternal<T>::defaultOffsetBasis;
-    for (std::size_t i = 0; i < size; ++i) {
-      hash *= detail::FNVInternal<T>::prime;
-      hash ^= static_cast<to_unsigned_t<T>>(data[i]);
-    }
-    return hash;
-  }
-};
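-
-// Note: despite the name, this hashes in FNV-1 order (multiply, then xor);
-// FNV-1a xors before multiplying. Either order yields a usable hash for
-// integer coordinate keys.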
-
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
- const Index *kernelSize,
- const Index *stride, const Index *padding,
- const Index *dilation,
- const Index *outSpatialShape, Index *out) {
- Index lowers[NDim];
- Index uppers[NDim];
- Index counter[NDim];
- Index counterSize[NDim];
- Index pointCounter = 0;
- Index val;
- Index numPoints = 1;
- Index m, offset;
- bool valid = false;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
- stride[i] + padding[i]) /
- stride[i];
- uppers[i] = (input_pos[i] + padding[i]) / stride[i];
- }
-
-#pragma unroll
- for (unsigned i = 0; i < NDim; ++i) {
- counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
- numPoints *= counterSize[i];
- }
-
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- counter[i] = 0;
- }
- for (int i = 0; i < numPoints; ++i) {
- valid = true;
- m = 1;
- offset = 0;
-#pragma unroll
- for (int j = NDim - 1; j >= 0; --j) {
- val = uppers[j] - counter[j] * dilation[j];
- out[pointCounter * (NDim + 1) + j] = val;
- if (val < 0 || (val > outSpatialShape[j] - 1)) {
- valid = false;
- // break;
- }
- offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
- m *= kernelSize[j];
- }
-
- out[pointCounter * (NDim + 1) + NDim] = offset;
- if (valid)
- ++pointCounter;
- counter[NDim - 1] += 1;
-#pragma unroll
- for (int c = NDim - 1; c >= 0; --c) {
- if (counter[c] == counterSize[c] && c > 0) {
- counter[c - 1] += 1;
- counter[c] = 0;
- }
- }
- }
- return pointCounter;
-}
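-
-// Worked example (1-D, hypothetical numbers): input_pos=5, kernelSize=3,
-// stride=1, padding=1, dilation=1 gives lowers=4 and uppers=6, so output
-// positions 4, 5, 6 receive this input with kernel offsets 2, 1, 0.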
-
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPosTranspose(
- const Index *input_pos, const Index *kernelSize, const Index *stride,
- const Index *padding, const Index *dilation, const Index *outSpatialShape,
- Index *out) {
- Index lowers[NDim];
- Index uppers[NDim];
- Index counter[NDim];
- Index counterSize[NDim];
- Index pointCounter = 0;
- Index val;
- Index numPoints = 1;
- Index m, offset;
- bool valid = false;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- lowers[i] = input_pos[i] * stride[i] - padding[i];
- uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
- }
-#pragma unroll
- for (unsigned i = 0; i < NDim; ++i) {
- counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
- numPoints *= counterSize[i];
- }
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- counter[i] = 0;
- }
- for (int i = 0; i < numPoints; ++i) {
- valid = true;
- m = 1;
- offset = 0;
-#pragma unroll
- for (int j = NDim - 1; j >= 0; --j) {
- val = uppers[j] - counter[j] * dilation[j];
- out[pointCounter * (NDim + 1) + j] = val;
- if (val < 0 || (val > outSpatialShape[j] - 1)) {
- valid = false;
- // break;
- }
- offset += m * (val - lowers[j]) / dilation[j];
- m *= kernelSize[j];
- }
- out[pointCounter * (NDim + 1) + NDim] = offset;
- if (valid)
- ++pointCounter;
- counter[NDim - 1] += 1;
-#pragma unroll
- for (int c = NDim - 1; c >= 0; --c) {
- if (counter[c] == counterSize[c] && c > 0) {
- counter[c - 1] += 1;
- counter[c] = 0;
- }
- }
- }
- return pointCounter;
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
diff --git a/include/spconv/indice.cu.h b/include/spconv/indice.cu.h
deleted file mode 100644
index f21cc47..0000000
--- a/include/spconv/indice.cu.h
+++ /dev/null
@@ -1,571 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef INDICE_CU_H_
-#define INDICE_CU_H_
-#include <cuhash/hash_table.cuh>
-#include <spconv/geometry.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-
-template <typename Index, unsigned NDim, bool Transpose>
-struct ConvIndiceDispatch;
-
-template <typename Index, unsigned NDim>
-struct ConvIndiceDispatch<Index, NDim, true> {
-  constexpr static auto *func = getValidOutPosTranspose<Index, NDim>;
-};
-template <typename Index, unsigned NDim>
-struct ConvIndiceDispatch<Index, NDim, false> {
-  constexpr static auto *func = getValidOutPos<Index, NDim>;
-};
-
-template <typename Index, unsigned NDim, int KernelMaxVolume, bool Transpose>
-__global__ void prepareIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
- auto numActIn = indicesIn.dim(0);
- Index spatialVolume = 1;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- spatialVolume *= outSpatialShape[i];
- }
- Index kernelVolume = 1;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- kernelVolume *= kernelSize[i];
- }
- Index numValidPoints = 0;
- Index validPoints[KernelMaxVolume * (NDim + 1)];
- Index *pointPtr = nullptr;
- auto indicePairsDim2 = indicePairs.dim(2);
- Index index;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = ConvIndiceDispatch<Index, NDim, Transpose>::func(
- indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
- stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
- validPoints);
- for (Index i = 0; i < numValidPoints; ++i) {
- pointPtr = validPoints + i * (NDim + 1);
- auto offset = pointPtr[NDim];
- Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
- indicePairs(0, offset, oldNum) = ix;
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
- pointPtr, outSpatialShape.data(), 0) +
- spatialVolume * indicesIn(ix, 0);
- indicePairs(1, offset, oldNum) = index;
- indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
- }
- }
-}
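-
-// After this kernel, indicePairs(0, offset, n) holds an input row index and
-// indicePairs(1, offset, n) a flattened output spatial index; a later pass
-// (grid- or hash-based) rewrites the latter into compacted output rows.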
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void assignGridAndIndiceOutKernel(
-    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
-    int numAct, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
-
- Index index;
- auto indicesOutPtr = indicesOut.data();
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
- index = indicePairUnique[ix];
- gridsOut[index] = ix;
-    index = tv::rowArrayIdxInv<Index, NDim>(
- index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
- indicesOut[ix * (NDim + 1)] = index % batchSize;
- }
-}
-
-template <typename Index, unsigned NDim>
-__global__ void
-assignIndiceOutKernel(tv::TensorView<Index> indicesOut, int numAct,
-                      tv::TensorView<Index> indicePairUnique,
-                      const tv::SimpleVector<Index, NDim> outSpatialShape,
-                      int batchSize) {
-
- Index index;
- auto indicesOutPtr = indicesOut.data();
-  for (unsigned ix : tv::KernelLoopX<unsigned>(numAct)) {
- index = indicePairUnique[ix];
-    index = tv::rowArrayIdxInv<Index, NDim>(
- index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
- indicesOut[ix * (NDim + 1)] = index % batchSize;
- }
-}
-
-template <typename Index>
-__global__ void
-assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
-                            tv::TensorView<Index> indicePairs,
-                            tv::TensorView<Index> indicePairUnique,
-                            unsigned table_size, const cuhash::Entry *table,
-                            cuhash::Functions<4> constants,
-                            uint2 stash_constants, unsigned stash_count) {
-
- Index index;
- int kernelVolume = indicePairs.dim(1);
- auto indicePairsOut = indicePairs.subview(1);
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
- for (int i = 0; i < kernelVolume; ++i) {
- index = indicePairsOut(i, ix);
- if (index > -1) {
- auto val = cuhash::retrieve((unsigned)(index), table_size, table,
- constants, stash_constants, stash_count);
- assert(val != cuhash::kNotFound);
- indicePairsOut(i, ix) = (unsigned)val;
- }
- }
- }
-}
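-
-// The hash variant resolves flattened spatial indices through a cuckoo hash
-// table (cuhash, with a small stash) instead of a dense grid, trading lookup
-// cost for memory when the spatial volume is large.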
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
-assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
-                        tv::TensorView<IndexGrid> gridsOut, int numActIn,
-                        tv::TensorView<Index> indicePairs,
-                        tv::TensorView<Index> indicePairUnique,
-                        const tv::SimpleVector<Index, NDim> outSpatialShape) {
-
- Index index;
- int kernelVolume = indicePairs.dim(1);
- auto indicePairsOut = indicePairs.subview(1);
-
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
- for (int i = 0; i < kernelVolume; ++i) {
- index = indicePairsOut(i, ix);
- if (index > -1) {
- indicePairsOut(i, ix) = gridsOut[index];
- }
- }
- }
-}
-
-template <typename Index, typename IndexGrid>
-__global__ void
-assignIndicePairsLimitedKernel(tv::TensorView<IndexGrid> gridsOut,
-                               int numActIn, tv::TensorView<Index> indicePairs,
-                               tv::TensorView<Index> indiceNum) {
-
- Index index, val;
- int kernelVolume = indicePairs.dim(0);
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
- for (int i = 0; i < kernelVolume; ++i) {
- index = indicePairs(i, 1, ix);
- if (index != -1) {
- val = gridsOut[index];
- if (val != -1) {
- auto oldNum = atomicAdd(indiceNum.data() + i, Index(1));
- indicePairs(i, 0, oldNum) = indicePairs(i, 0, ix);
- indicePairs(i, 1, oldNum) = val;
- }
- }
- }
- }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void prepareSubMGridKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, Index spatialVolume) {
- auto numActIn = indicesIn.dim(0);
- Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
- index =
-        tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
- indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) +
- spatialVolume * indicesIn(ix, 0);
- gridsOut[index] = ix;
- }
-}
-
-template <typename Index, unsigned NDim>
-__global__ void
-prepareSubMHashKernel(tv::TensorView<const Index> indicesIn, unsigned *keys,
-                      unsigned *values,
-                      const tv::SimpleVector<Index, NDim> outSpatialShape) {
- auto numActIn = indicesIn.dim(0);
- Index spatialVolume = 1;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- spatialVolume *= outSpatialShape[i];
- }
- Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
- outSpatialShape.data()) +
- spatialVolume * indicesIn(ix, 0);
- keys[ix] = index;
- values[ix] = ix;
- }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim,
-          int KernelMaxVolume>
-__global__ void getSubMIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
- auto numActIn = indicesIn.dim(0);
- Index spatialVolume = 1;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- spatialVolume *= outSpatialShape[i];
- }
- Index numValidPoints = 0;
- Index validPoints[KernelMaxVolume * (NDim + 1)];
- Index *pointPtr = nullptr;
- Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPos<Index, NDim>(
- indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
- stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
- validPoints);
- for (int i = 0; i < numValidPoints; ++i) {
- pointPtr = validPoints + i * (NDim + 1);
- auto offset = pointPtr[NDim];
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
- pointPtr, outSpatialShape.data(), 0) +
- spatialVolume * indicesIn(ix, 0);
- if (gridsOut[index] > -1) {
- Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
- indicePairs(1, offset, oldNum) = gridsOut[index];
- indicePairs(0, offset, oldNum) = ix;
- }
- }
- }
-}
-
-template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
-          unsigned K2>
-__global__ void getSubMIndicePairsUnrollKernel3(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
- auto numActIn = indicesIn.dim(0);
-
- Index point[3];
- Index index = 0;
- Index offset;
- constexpr unsigned KV = K0 * K1 * K2;
- constexpr unsigned center = KV / 2;
- *(indiceNum.data() + center) = numActIn;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
- const Index *indice_data = indicesIn.data() + ix * (3 + 1);
-#pragma unroll
- for (int i = 0; i < K0; ++i) {
-#pragma unroll
- for (int j = 0; j < K1; ++j) {
-#pragma unroll
- for (int k = 0; k < K2; ++k) {
- offset = i * K1 * K2 + j * K2 + k;
- if (offset > center) {
- continue;
- }
- if (center == offset) {
-          // center of subm indice pairs doesn't need atomicAdd
- indicePairs(1, offset, ix) = ix;
- indicePairs(0, offset, ix) = ix;
- } else {
- point[2] = indice_data[3] - k + K2 / 2;
- point[1] = indice_data[2] - j + K1 / 2;
- point[0] = indice_data[1] - i + K0 / 2;
- if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
- point[2] >= 0 && point[2] < outSpatialShape[2] &&
- point[0] >= 0 && point[0] < outSpatialShape[0]) {
- index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
- point, outSpatialShape.data(), 0) +
- spatialVolume * indice_data[0];
- if (gridsOut[index] != -1) {
- // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
- // - 1]
- Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
- atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
- indicePairs(1, offset, oldNum) = gridsOut[index];
- indicePairs(0, offset, oldNum) = ix;
- indicePairs(1, KV - offset - 1, oldNum) = ix;
- indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
- }
- }
- }
- }
- }
- }
- }
-}
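-
-// This unrolled subm kernel only visits kernel offsets up to the center and
-// mirrors each hit to offset KV - offset - 1, exploiting the symmetry
-// indicePairs[0, i] == indicePairs[1, KV - i - 1] to halve the work.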
-
-template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
-__global__ void getSubMIndicePairsUnrollKernel2(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
- tv::TensorView